Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
This commit is contained in:
binary-husky
2025-08-23 15:59:22 +08:00
committed by GitHub
parent 65a4cf59c2
commit 8042750d41
79 changed files with 20850 additions and 57 deletions

View File

@@ -0,0 +1,68 @@
from typing import List
from crazy_functions.review_fns.data_sources.base_source import PaperMetadata
class EndNoteFormatter:
    """Generate reference text in the EndNote tagged ("%0", "%T", ...) format."""

    # EndNote reference type written for each recognised (lower-cased)
    # ``venue_type``; every other value uses ``_DEFAULT_REFERENCE_TYPE``.
    _REFERENCE_TYPES = {
        'conference': 'Conference Paper',
        'preprint': 'Electronic Article',
    }
    _DEFAULT_REFERENCE_TYPE = 'Journal Article'

    def __init__(self):
        pass

    @staticmethod
    def _doi_url(doi: str) -> str:
        """Return a resolvable URL for ``doi``.

        Some records carry a full URL (e.g. an arXiv link) in the DOI field;
        those are used as-is instead of being prefixed with ``doi.org``.
        """
        if doi.startswith(('http://', 'https://')):
            return doi
        if 'arxiv.org' in doi:
            return f"http://{doi}"
        return f"https://doi.org/{doi}"

    def create_document(self, papers: List[PaperMetadata]) -> str:
        """Return EndNote-formatted reference text for ``papers``.

        Args:
            papers: Paper records. ``title``, ``authors``, ``year``, ``doi``,
                ``url`` and ``abstract`` are read directly; ``venue_type``,
                ``venue_name``, ``venue`` and ``institutions`` are optional.

        Returns:
            One block of tagged lines per paper, each block followed by a
            blank line. An empty input yields an empty string.
        """
        entries = []
        for paper in papers:
            # The reference type is decided per entry. Patching the text
            # accumulated so far with str.replace would also rewrite earlier
            # entries and any title containing "Journal Article".
            reference_type = self._DEFAULT_REFERENCE_TYPE
            venue_type = getattr(paper, 'venue_type', None)
            if venue_type:
                reference_type = self._REFERENCE_TYPES.get(
                    venue_type.lower(), self._DEFAULT_REFERENCE_TYPE
                )

            lines = [f"%0 {reference_type}", f"%T {paper.title}"]
            lines.extend(f"%A {author}" for author in (paper.authors or []))

            if paper.year:
                lines.append(f"%D {paper.year}")

            # Prefer the explicit venue name, fall back to the generic venue.
            venue = getattr(paper, 'venue_name', None) or getattr(paper, 'venue', None)
            if venue:
                lines.append(f"%J {venue}")

            if paper.doi:
                lines.append(f"%R {paper.doi}")
                lines.append(f"%U {self._doi_url(str(paper.doi))}")
            elif paper.url:
                lines.append(f"%U {paper.url}")

            if paper.abstract:
                lines.append(f"%X {paper.abstract}")

            # ``institutions`` may be missing or None; treat both as empty.
            for institution in getattr(paper, 'institutions', None) or []:
                lines.append(f"%I {institution}")

            # A blank line separates consecutive entries.
            entries.append("\n".join(lines) + "\n\n")
        return "".join(entries)

View File

@@ -0,0 +1,211 @@
import re
import os
import pandas as pd
from datetime import datetime
class ExcelTableFormatter:
    """Convert Markdown tables found in a chat history into an Excel workbook."""

    def __init__(self):
        """Create the workbook that extracted tables are written to."""
        # Imported here rather than at module level, so openpyxl is only
        # needed once a formatter is actually instantiated.
        from openpyxl import Workbook
        self.workbook = Workbook()
        self._table_count = 0
        self._current_sheet = None

    def _normalize_table_row(self, row):
        """Split one Markdown table line into a list of stripped cell strings.

        A single leading and a single trailing ``|`` are optional and ignored.
        """
        row = row.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def _is_separator_row(self, row):
        """Return True if ``row`` consists only of ``-``/``:`` (plus ``|`` and spaces)."""
        clean_row = re.sub(r'[\s|]', '', row)
        return bool(re.match(r'^[-:]+$', clean_row))

    def _extract_tables_from_text(self, text):
        """Return every table in ``text`` as a list of its stripped lines.

        A table is a run of at least two consecutive lines that each contain
        ``|``. A blank line or a line without ``|`` ends the current run.
        Non-string input yields an empty list.
        """
        if not isinstance(text, str):
            return []

        tables = []
        current_table = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if line and '|' in line:
                current_table.append(line)
                continue
            # Blank or pipe-free line: close the run collected so far.
            if len(current_table) >= 2:
                tables.append(current_table)
            current_table = []

        if len(current_table) >= 2:
            tables.append(current_table)
        return tables

    def _parse_table(self, table_lines):
        """Parse the lines of one table into headers and data rows.

        Args:
            table_lines: Lines of a single table, header line first.

        Returns:
            ``{'headers': [...], 'data': [[...], ...]}`` with every data row
            padded or truncated to the header width, or None when the table
            has no header or no data rows, or when parsing fails.
        """
        try:
            if not table_lines:
                return None
            headers = self._normalize_table_row(table_lines[0])
            # Data starts after the separator row. Without one, it starts
            # right after the header (index 0), so no data row is skipped.
            separator_index = next(
                (i for i, line in enumerate(table_lines) if self._is_separator_row(line)),
                0
            )
            data_rows = []
            for line in table_lines[separator_index + 1:]:
                cells = self._normalize_table_row(line)
                # Keep the cell count equal to the header count.
                cells.extend([''] * (len(headers) - len(cells)))
                data_rows.append(cells[:len(headers)])
            if headers and data_rows:
                return {
                    'headers': headers,
                    'data': data_rows
                }
        except Exception as e:
            # Best effort: a malformed table is reported and skipped.
            print(f"解析表格时发生错误: {str(e)}")
        return None

    def _create_sheet(self, question_num, table_num):
        """Create and return a worksheet named ``Q<question>_T<table>``.

        Names longer than 31 characters fall back to ``Table<count>``; a name
        already present in the workbook gets a time-of-day suffix.
        """
        sheet_name = f'Q{question_num}_T{table_num}'
        if len(sheet_name) > 31:
            sheet_name = f'Table{self._table_count}'
        if sheet_name in self.workbook.sheetnames:
            sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}'
        return self.workbook.create_sheet(title=sheet_name)

    def create_document(self, history):
        """Write every table found in the answers of ``history`` to the workbook.

        Args:
            history: Chat history list; answers are read from the odd indices
                (1, 3, 5, ...).

        Returns:
            The workbook when at least one table was written, otherwise None.
        """
        has_tables = False
        if not history:
            return None

        # Drop the default sheet. It is only present on the first call, so
        # calling this method again on the same instance must not fail.
        if 'Sheet' in self.workbook.sheetnames:
            self.workbook.remove(self.workbook['Sheet'])

        for i in range(1, len(history), 2):
            for table_lines in self._extract_tables_from_text(history[i]):
                parsed_table = self._parse_table(table_lines)
                if not parsed_table:
                    continue
                self._table_count += 1
                sheet = self._create_sheet(i // 2 + 1, self._table_count)
                # Header goes in row 1, data rows follow from row 2.
                for col, header in enumerate(parsed_table['headers'], 1):
                    sheet.cell(row=1, column=col, value=header)
                for row_idx, row_data in enumerate(parsed_table['data'], 2):
                    for col_idx, value in enumerate(row_data, 1):
                        sheet.cell(row=row_idx, column=col_idx, value=value)
                has_tables = True
        return self.workbook if has_tables else None
def save_chat_tables(history, save_dir, base_name):
    """Save the tables of a chat history to ``<save_dir>/<base_name>.xlsx``.

    Args:
        history: Chat history list.
        save_dir: Target directory; created when it does not exist.
        base_name: File name without extension.

    Returns:
        list: Paths of the files written. Empty when the history contains no
        table or when saving failed (the failure is printed, not raised).
    """
    saved_paths = []
    try:
        workbook = ExcelTableFormatter().create_document(history)
        if workbook is None:
            # Nothing to write: no table was found.
            return saved_paths

        os.makedirs(save_dir, exist_ok=True)
        target = os.path.join(save_dir, base_name + '.xlsx')
        workbook.save(target)
        saved_paths.append(target)
        print(f"已保存表格到Excel文件: {target}")
    except Exception as e:
        print(f"保存Excel格式失败: {str(e)}")
    return saved_paths
# Usage example
if __name__ == "__main__":
    # Sample history: questions at even indices, answers at odd indices.
    # The first answer holds one table, the second none, the third two.
    history = [
        "问题1",
        """这是第一个表格:
| A | B | C |
|---|---|---|
| 1 | 2 | 3 |""",
        "问题2",
        "这是没有表格的回答",
        "问题3",
        """回答包含多个表格:
| Name | Age |
|------|-----|
| Tom | 20 |
第二个表格:
| X | Y |
|---|---|
| 1 | 2 |"""
    ]
    # Write the tables to output/chat_tables.xlsx.
    saved_files = save_chat_tables(history, save_dir="output", base_name="chat_tables")

View File

@@ -0,0 +1,472 @@
class HtmlFormatter:
    """Render a question/answer exchange and its cited papers as one HTML page."""

    # Styles for the reference list; combined with ``css_styles`` per page.
    _REFERENCE_CSS = """
.paper-title {
font-size: 1.1em;
margin-bottom: 0.5em;
}
.paper-authors {
color: var(--secondary-color);
margin: 0.3em 0;
}
.paper-year, .paper-citations {
color: var(--text-color);
margin: 0.3em 0;
}
.paper-source {
color: var(--text-color);
font-style: italic;
margin: 0.3em 0;
}
.paper-abstract {
margin: 0.8em 0;
padding: 0.8em;
background: var(--primary-light);
border-radius: 4px;
}
.paper-links {
margin-top: 0.5em;
}
.paper-links a {
color: var(--primary-color);
text-decoration: none;
margin-right: 1em;
}
.paper-links a:hover {
text-decoration: underline;
}
.standard-citation {
margin-top: 1em;
padding: 1em;
background: #f8fafc;
border-radius: 4px;
border: 1px solid var(--border-color);
}
.citation-title {
font-weight: bold;
margin-bottom: 0.5em;
color: var(--secondary-color);
}
.citation-text {
font-family: 'Times New Roman', Times, serif;
line-height: 1.6;
margin-bottom: 0.5em;
padding: 0.5em;
background: white;
border-radius: 4px;
border: 1px solid var(--border-color);
}
.copy-btn {
background: var(--primary-color);
color: white;
border: none;
padding: 0.5em 1em;
border-radius: 4px;
cursor: pointer;
font-size: 0.9em;
transition: background-color 0.2s;
}
.copy-btn:hover {
background: #1e40af;
}
@media (prefers-color-scheme: dark) {
.standard-citation {
background: #1e293b;
}
.citation-text {
background: #0f172a;
}
}
/* 添加期刊指标样式 */
.journal-metric {
display: inline-block;
padding: 0.2em 0.6em;
margin: 0 0.3em;
background: var(--primary-light);
border-radius: 4px;
font-weight: 500;
color: var(--primary-color);
}
@media (prefers-color-scheme: dark) {
.journal-metric {
background: #1e293b;
color: #60a5fa;
}
}
"""

    # Page script: the copy-citation button and client-side Markdown rendering
    # of the answer through marked.js.
    _PAGE_SCRIPT = """
<script>
// 复制功能
function copyToClipboard(element) {
const text = element.innerText;
navigator.clipboard.writeText(text).then(function() {
const btn = element.nextElementSibling;
const originalText = btn.innerText;
btn.innerText = '已复制!';
setTimeout(() => {
btn.innerText = originalText;
}, 2000);
}).catch(function(err) {
console.error('复制失败:', err);
});
}
// Markdown解析
document.addEventListener('DOMContentLoaded', function() {
const answerContent = document.getElementById('answer-content');
if (answerContent) {
const markdown = answerContent.textContent;
answerContent.innerHTML = marked.parse(markdown);
}
});
</script>
"""

    def __init__(self):
        # Base stylesheet embedded in every generated page.
        self.css_styles = """
:root {
--primary-color: #2563eb;
--primary-light: #eff6ff;
--secondary-color: #1e293b;
--background-color: #f8fafc;
--text-color: #334155;
--border-color: #e2e8f0;
--card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
}
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.8;
margin: 0;
padding: 2rem;
color: var(--text-color);
background-color: var(--background-color);
}
.container {
max-width: 1200px;
margin: 0 auto;
background: white;
padding: 2rem;
border-radius: 16px;
box-shadow: var(--card-shadow);
}
::selection {
background: var(--primary-light);
color: var(--primary-color);
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(20px); }
to { opacity: 1; transform: translateY(0); }
}
@keyframes slideIn {
from { transform: translateX(-20px); opacity: 0; }
to { transform: translateX(0); opacity: 1; }
}
.container {
animation: fadeIn 0.6s ease-out;
}
.QaBox {
animation: slideIn 0.5s ease-out;
transition: all 0.3s ease;
}
.QaBox:hover {
transform: translateX(5px);
}
.Question, .Answer, .historyBox {
transition: all 0.3s ease;
}
.chat-title {
color: var(--primary-color);
font-size: 2em;
text-align: center;
margin: 1rem 0 2rem;
padding-bottom: 1rem;
border-bottom: 2px solid var(--primary-color);
}
.chat-body {
display: flex;
flex-direction: column;
gap: 1.5rem;
margin: 2rem 0;
}
.QaBox {
background: white;
padding: 1.5rem;
border-radius: 8px;
border-left: 4px solid var(--primary-color);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
margin-bottom: 1.5rem;
}
.Question {
color: var(--secondary-color);
font-weight: 500;
margin-bottom: 1rem;
}
.Answer {
color: var(--text-color);
background: var(--primary-light);
padding: 1rem;
border-radius: 6px;
}
.history-section {
margin-top: 3rem;
padding-top: 2rem;
border-top: 2px solid var(--border-color);
}
.history-title {
color: var(--secondary-color);
font-size: 1.5em;
margin-bottom: 1.5rem;
text-align: center;
}
.historyBox {
background: white;
padding: 1rem;
margin: 0.5rem 0;
border-radius: 6px;
border: 1px solid var(--border-color);
}
@media (prefers-color-scheme: dark) {
:root {
--background-color: #0f172a;
--text-color: #e2e8f0;
--border-color: #1e293b;
}
.container, .QaBox {
background: #1e293b;
}
}
"""

    @staticmethod
    def _arxiv_url(doi):
        """Return ``doi`` as a URL, adding a scheme when it has none."""
        return doi if doi.startswith('http') else f'http://{doi}'

    @staticmethod
    def _arxiv_id(arxiv_url):
        """Extract the version-less arXiv identifier from an abs/pdf URL.

        Returns None when the URL has no ``abs/`` or ``pdf/`` path, in which
        case no PDF link can be derived.
        """
        import re
        match = re.search(r'arxiv\.org/(?:abs|pdf)/([^?#]+)', arxiv_url)
        if not match:
            return None
        identifier = re.sub(r'\.pdf$', '', match.group(1).strip('/'))
        # Only a trailing "v<digits>" is a version marker. Old-style ids
        # such as "solv-int/9901001" contain a "v" that must be kept.
        identifier = re.sub(r'v\d+$', '', identifier)
        return identifier or None

    def _download_links(self, paper):
        """Return the HTML anchors (arXiv/PDF, DOI, original URL) for ``paper``."""
        from html import escape
        links = []
        doi = getattr(paper, 'doi', None)
        if doi:
            doi = str(doi)
            if 'arxiv.org' in doi:
                # The DOI field holds an arXiv URL: link to it directly.
                arxiv_url = self._arxiv_url(doi)
                links.append(f'<a href="{escape(arxiv_url)}">arXiv链接</a>')
                arxiv_id = self._arxiv_id(arxiv_url)
                if arxiv_id:
                    links.append(f'<a href="https://arxiv.org/pdf/{escape(arxiv_id)}.pdf">PDF下载</a>')
            else:
                links.append(f'<a href="https://doi.org/{escape(doi)}">DOI: {escape(doi)}</a>')
        url = getattr(paper, 'url', None)
        # arXiv URLs are skipped here; they are covered by the DOI branch.
        if url and 'arxiv.org' not in str(url):
            links.append(f'<a href="{escape(str(url))}">原文链接</a>')
        return links

    def _source_info(self, paper):
        """Return the venue and journal-metric fragments shown for ``paper``."""
        from html import escape
        info = []
        venue_type = getattr(paper, 'venue_type', None)
        venue_name = getattr(paper, 'venue_name', None)
        if venue_type:
            info.append(f"类型:{escape(str(venue_type))}")
        if venue_name:
            info.append(f"来源:{escape(str(venue_name))}")
        for attr, label in (('if_factor', 'IF'), ('jcr_division', 'JCR分区'), ('cas_division', '中科院分区')):
            value = getattr(paper, attr, None)
            if value:
                info.append(f"<span class='journal-metric'>{label}: {escape(str(value))}</span>")
        venue_info = getattr(paper, 'venue_info', None)
        if venue_info:
            if venue_info.get('journal_ref'):
                info.append(f"期刊参考:{escape(str(venue_info['journal_ref']))}")
            if venue_info.get('publisher'):
                info.append(f"出版商:{escape(str(venue_info['publisher']))}")
        return info

    def _standard_citation(self, idx, paper):
        """Return the one-line citation (HTML) shown in the copy box."""
        from html import escape
        citation = f"[{idx}] "
        authors = [str(author) for author in (getattr(paper, 'authors', None) or [])]
        # At most three authors are listed; more are abbreviated.
        shown = authors[:3]
        if len(authors) > 3:
            shown.append("et al.")
        if shown:
            names = escape(", ".join(shown))
            # "et al." already ends with a period; do not add a second one.
            citation += names + (" " if names.endswith(".") else ". ")
        citation += f"<i>{escape(str(paper.title))}</i>"
        venue_name = getattr(paper, 'venue_name', None)
        if venue_name:
            citation += f". {escape(str(venue_name))}"
        year = getattr(paper, 'year', None)
        if year:
            citation += f", {escape(str(year))}"
        doi = getattr(paper, 'doi', None)
        if doi:
            doi = str(doi)
            if 'arxiv.org' in doi:
                citation += f". {escape(self._arxiv_url(doi))}"
            else:
                citation += f". DOI: {escape(doi)}"
        return citation + "."

    def _render_paper(self, idx, paper):
        """Return the HTML card for one reference."""
        from html import escape
        title = escape(str(paper.title))
        authors = escape(', '.join(str(a) for a in (getattr(paper, 'authors', None) or [])))
        year = getattr(paper, 'year', None)
        year_text = escape(str(year)) if year else "未知"
        citations = getattr(paper, 'citations', None)
        citations_info = f"被引用次数:{escape(str(citations))}" if citations is not None else "引用信息未知"
        abstract = getattr(paper, 'abstract', None)
        abstract_text = escape(str(abstract)) if abstract else "无摘要"
        download_links = self._download_links(paper)
        download_section = ' | '.join(download_links) if download_links else "无直接下载链接"
        source_section = ' | '.join(self._source_info(paper))
        source_html = f'<p class="paper-source">{source_section}</p>' if source_section else ""
        standard_citation = self._standard_citation(idx, paper)
        return f'''
<div class="historyBox">
<div class="entry">
<p class="paper-title"><b>[{idx}]</b> <i>{title}</i></p>
<p class="paper-authors">作者:{authors}</p>
<p class="paper-year">发表年份:{year_text}</p>
<p class="paper-citations">{citations_info}</p>
{source_html}
<p class="paper-abstract">摘要:{abstract_text}</p>
<p class="paper-links">链接:{download_section}</p>
<div class="standard-citation">
<p class="citation-title">标准引用格式:</p>
<p class="citation-text">{standard_citation}</p>
<button class="copy-btn" onclick="copyToClipboard(this.previousElementSibling)">复制引用格式</button>
</div>
</div>
</div>
'''

    def create_document(self, question: str, answer: str, ranked_papers: list = None) -> str:
        """Build the complete HTML document.

        Args:
            question: The user's question.
            answer: The answer as Markdown text; it is rendered in the browser.
            ranked_papers: Optional list of paper records to list as references.

        Returns:
            str: The full HTML page.
        """
        from html import escape

        # Everything interpolated into markup is escaped. The page script
        # reads the answer back through ``textContent``, which undoes the
        # escaping, so marked.js still receives the original Markdown.
        chat_content = f'''
<div class="QaBox">
<div class="Question">{escape(str(question))}</div>
<div class="Answer markdown-body" id="answer-content">{escape(str(answer))}</div>
</div>
'''
        references_content = ""
        if ranked_papers:
            references_content = '<div class="history-section"><h2 class="history-title">参考文献</h2>'
            references_content += "".join(
                self._render_paper(idx, paper) for idx, paper in enumerate(ranked_papers, 1)
            )
            references_content += '</div>'

        # Combined per call: appending to ``self.css_styles`` would make the
        # stylesheet grow each time the formatter is reused.
        css_styles = self.css_styles + self._REFERENCE_CSS
        js_code = self._PAGE_SCRIPT
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>学术对话存档</title>
<!-- 添加 marked.js -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<!-- 添加 GitHub Markdown CSS -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/sindresorhus/github-markdown-css@4.0.0/github-markdown.min.css">
<style>
{css_styles}
/* 添加 Markdown 相关样式 */
.markdown-body {{
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
padding: 1rem;
background: var(--primary-light);
border-radius: 6px;
}}
.markdown-body pre {{
background-color: #f6f8fa;
border-radius: 6px;
padding: 16px;
overflow: auto;
}}
.markdown-body code {{
background-color: rgba(175,184,193,0.2);
border-radius: 6px;
padding: 0.2em 0.4em;
font-size: 85%;
}}
.markdown-body pre code {{
background-color: transparent;
padding: 0;
}}
.markdown-body blockquote {{
border-left: 0.25em solid #d0d7de;
padding: 0 1em;
color: #656d76;
}}
.markdown-body table {{
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}}
.markdown-body table th,
.markdown-body table td {{
border: 1px solid #d0d7de;
padding: 6px 13px;
}}
.markdown-body table tr:nth-child(2n) {{
background-color: #f6f8fa;
}}
@media (prefers-color-scheme: dark) {{
.markdown-body {{
background: #1e293b;
color: #e2e8f0;
}}
.markdown-body pre {{
background-color: #0f172a;
}}
.markdown-body code {{
background-color: rgba(99,110,123,0.4);
}}
.markdown-body blockquote {{
border-left-color: #30363d;
color: #8b949e;
}}
.markdown-body table th,
.markdown-body table td {{
border-color: #30363d;
}}
.markdown-body table tr:nth-child(2n) {{
background-color: #0f172a;
}}
}}
</style>
</head>
<body>
<div class="container">
<h1 class="chat-title">学术对话存档</h1>
<div class="chat-body">
{chat_content}
{references_content}
</div>
</div>
{js_code}
</body>
</html>
"""

View File

@@ -0,0 +1,47 @@
class MarkdownFormatter:
    """Generate a Markdown record of one question/answer exchange."""

    def __init__(self):
        self.content = []

    def _add_content(self, text: str):
        """Append a body paragraph to ``self.content``; empty text is ignored."""
        if text:
            self.content.append(f"\n{text}\n")

    @staticmethod
    def _doi_link(doi: str) -> str:
        """Return the Markdown link for a paper's DOI field.

        A DOI field that already holds a URL (e.g. an arXiv link) is linked
        as-is; prefixing it with ``https://doi.org/`` would break the link.
        """
        if doi.startswith(('http://', 'https://')):
            return f"[{doi}]({doi})"
        if 'arxiv.org' in doi:
            return f"[{doi}](http://{doi})"
        return f"[DOI: {doi}](https://doi.org/{doi})"

    def create_document(self, question: str, answer: str, ranked_papers: list = None) -> str:
        """Build the complete Markdown document.

        Args:
            question: The user's question.
            answer: The answer text.
            ranked_papers: Optional list of paper records to cite.

        Returns:
            str: The Markdown text; sections are separated by blank lines.
        """
        content = ["## 问题", question, "\n## 回答", answer]

        if ranked_papers:
            content.append("\n## 参考文献")
            for idx, paper in enumerate(ranked_papers, 1):
                all_authors = paper.authors or []
                # At most three authors are listed; more are abbreviated.
                authors = ', '.join(all_authors[:3])
                if len(all_authors) > 3:
                    authors += ' et al.'
                ref = f"[{idx}] "
                if authors:
                    # "et al." already ends with a period.
                    ref += authors + (" " if authors.endswith('.') else ". ")
                ref += f"*{paper.title}*"
                if paper.venue_name:
                    ref += f". {paper.venue_name}"
                if paper.year:
                    ref += f", {paper.year}"
                if paper.doi:
                    ref += f". {self._doi_link(str(paper.doi))}"
                content.append(ref)
        return "\n\n".join(content)

View File

@@ -0,0 +1,174 @@
from typing import List
from crazy_functions.review_fns.data_sources.base_source import PaperMetadata
import re
class ReferenceFormatter:
    """Generate BibTeX reference text from paper records."""

    # Single-character escapes applied in one pass by ``str.translate``.
    # One pass matters: chained ``str.replace`` calls would re-escape the
    # backslashes and braces inserted by earlier replacements.
    _BIBTEX_ESCAPES = str.maketrans({
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
        '~': '\\textasciitilde{}',
        '^': '\\textasciicircum{}',
        '\\': '\\textbackslash{}',
        '<': '\\textless{}',
        '>': '\\textgreater{}',
        '"': '``',
        '\u2013': '--',   # en dash
        '\u2014': '---',  # em dash
    })

    # Words skipped when picking the title word of a cite key.
    _STOP_WORDS = frozenset(
        {'a', 'an', 'the', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    )

    # BibTeX entry type for each recognised (lower-cased) venue_type.
    _ENTRY_TYPES = {
        'conference': 'inproceedings',
        'preprint': 'unpublished',
        'journal': 'article',
        'book': 'book',
        'thesis': 'phdthesis',
    }

    def __init__(self):
        pass

    def _sanitize_bibtex(self, text: str) -> str:
        """Escape characters that are special in BibTeX/LaTeX.

        Returns an empty string for empty or None input. Plain ASCII hyphens
        are left untouched so hyphenated words and names stay intact.
        """
        if not text:
            return ""
        return text.translate(self._BIBTEX_ESCAPES)

    def _generate_cite_key(self, paper: PaperMetadata) -> str:
        """Build a cite key: first author's last name + year + first title word.

        The year defaults to "0000". The title word is the first word longer
        than two characters that is not a stop word. The result contains only
        ``a-z`` and ``0-9``.
        """
        first_author = ""
        if paper.authors:
            name_parts = str(paper.authors[0]).split()
            if name_parts:
                # The last word of the name is taken as the family name.
                first_author = name_parts[-1].lower()

        year = str(paper.year) if paper.year else "0000"

        title_word = ""
        if paper.title:
            for word in re.findall(r'\w+', paper.title.lower()):
                if word not in self._STOP_WORDS and len(word) > 2:
                    title_word = word
                    break

        cite_key = f"{first_author}{year}{title_word}"
        return re.sub(r'[^a-z0-9]', '', cite_key.lower())

    def _get_entry_type(self, paper: PaperMetadata) -> str:
        """Return the BibTeX entry type for ``paper`` ("article" by default)."""
        venue_type = getattr(paper, 'venue_type', None)
        if venue_type:
            return self._ENTRY_TYPES.get(venue_type.lower(), 'article')
        return 'article'

    @staticmethod
    def _format_author(author: str) -> str:
        """Return ``author`` as "Surname, Given names".

        The last word is assumed to be the surname; single-word names are
        returned unchanged.
        """
        names = author.split()
        if len(names) > 1:
            return f"{names[-1]}, {' '.join(names[:-1])}"
        return author

    def _entry_fields(self, paper: PaperMetadata, entry_type: str) -> list:
        """Return the ``name = {value}`` field lines for one entry."""
        fields = []

        def add(name, value):
            fields.append(f" {name} = {{{value}}}")

        if paper.title:
            add("title", self._sanitize_bibtex(paper.title))

        if paper.authors:
            authors = " and ".join(
                self._sanitize_bibtex(self._format_author(author))
                for author in paper.authors
            )
            add("author", authors)

        if paper.year:
            add("year", paper.year)

        venue_name = getattr(paper, 'venue_name', None)
        venue = getattr(paper, 'venue', None)
        if venue_name:
            if entry_type == 'inproceedings':
                add("booktitle", self._sanitize_bibtex(venue_name))
            elif entry_type == 'article':
                add("journal", self._sanitize_bibtex(venue_name))
        elif venue:
            venue_field = "booktitle" if entry_type == "inproceedings" else "journal"
            add(venue_field, self._sanitize_bibtex(venue))

        # ``venue_info`` may be missing or None; only non-empty values are
        # written so no "{None}" placeholders end up in the output.
        venue_info = getattr(paper, 'venue_info', None) or {}
        for key in ('volume', 'number', 'pages'):
            if venue_info.get(key):
                add(key, venue_info[key])

        if paper.doi:
            add("doi", paper.doi)

        if paper.url:
            add("url", paper.url)
        elif paper.doi:
            doi = str(paper.doi)
            # A DOI field that already holds a URL is used as-is.
            if doi.startswith(('http://', 'https://')):
                add("url", doi)
            else:
                add("url", f"https://doi.org/{doi}")

        if paper.abstract:
            add("abstract", self._sanitize_bibtex(paper.abstract))

        institutions = getattr(paper, 'institutions', None)
        if institutions:
            add("institution", " and ".join(self._sanitize_bibtex(inst) for inst in institutions))

        month = getattr(paper, 'month', None)
        if month:
            add("month", month)

        note = getattr(paper, 'note', None)
        if note:
            add("note", self._sanitize_bibtex(note))

        return fields

    def create_document(self, papers: List[PaperMetadata]) -> str:
        """Return BibTeX text with one entry per paper.

        The text starts with two comment lines. Cite keys are made unique
        within the document by appending ``_<n>`` to repeated keys, because
        duplicate keys are rejected by BibTeX tools.
        """
        parts = [
            "% This file was automatically generated by GPT-Academic\n",
            "% Compatible with: EndNote, Zotero, JabRef, and LaTeX\n\n",
        ]
        used_keys = set()
        for paper in papers:
            entry_type = self._get_entry_type(paper)

            base_key = self._generate_cite_key(paper)
            cite_key = base_key
            suffix = 0
            while cite_key in used_keys:
                suffix += 1
                cite_key = f"{base_key}_{suffix}"
            used_keys.add(cite_key)

            lines = [f"@{entry_type}{{{cite_key},"]
            fields = self._entry_fields(paper, entry_type)
            if fields:
                # Fields are comma-separated; the last one has no comma.
                lines.append(",\n".join(fields))
            lines.append("}")
            parts.append("\n".join(lines) + "\n\n")
        return "".join(parts)

View File

@@ -0,0 +1,138 @@
from docx2pdf import convert
import os
import platform
from typing import Union
from pathlib import Path
from datetime import datetime
class WordToPdfConverter:
    """Convert Word documents to PDF (LibreOffice on Linux, docx2pdf elsewhere)."""

    @staticmethod
    def _replace_docx_in_filename(filename: Union[str, Path]) -> Path:
        """Replace every 'docx' in the file stem with 'pdf'; keep the suffix.

        Example: 'docx_test.pdf' -> 'pdf_test.pdf'
        """
        path = Path(filename)
        new_name = path.stem.replace('docx', 'pdf')
        return path.parent / f"{new_name}{path.suffix}"

    @staticmethod
    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
        """Convert one Word document to PDF.

        Args:
            word_path: Path of the Word document.
            pdf_path: Optional output path. Defaults to the Word document's
                location and name with a ``.pdf`` suffix. In both cases
                'docx' inside the file stem is replaced with 'pdf'.

        Returns:
            Path of the generated PDF file.

        Raises:
            Exception: If the conversion fails; the original error is chained.
        """
        try:
            word_path = Path(word_path)
            if pdf_path is None:
                pdf_path = WordToPdfConverter._replace_docx_in_filename(word_path).with_suffix('.pdf')
            else:
                pdf_path = WordToPdfConverter._replace_docx_in_filename(Path(pdf_path))

            if platform.system() == 'Linux':
                import shutil
                import subprocess

                if shutil.which('libreoffice') is None:
                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")
                # Argument list, no shell: paths containing quotes or other
                # shell metacharacters are passed through verbatim. A failed
                # conversion raises instead of being silently ignored.
                subprocess.run(
                    [
                        'libreoffice', '--headless', '--convert-to', 'pdf',
                        str(word_path), '--outdir', str(pdf_path.parent),
                    ],
                    check=True,
                )
                # LibreOffice names the output after the input file and puts
                # it in --outdir, which is not necessarily the input's folder.
                produced_pdf = pdf_path.parent / f"{word_path.stem}.pdf"
                if produced_pdf != pdf_path:
                    os.replace(produced_pdf, pdf_path)
            else:
                # Windows and macOS use docx2pdf.
                convert(word_path, pdf_path)
            return str(pdf_path)
        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}") from e

    @staticmethod
    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
        """Convert every ``*.docx`` file directly inside ``word_dir``.

        Args:
            word_dir: Directory containing the Word documents.
            pdf_dir: Optional output directory (created if missing). Defaults
                to the directory of each Word document.

        Returns:
            Paths of the PDFs that were generated. Files that fail to convert
            are reported on stdout and skipped.
        """
        word_dir = Path(word_dir)
        if pdf_dir:
            pdf_dir = Path(pdf_dir)
            pdf_dir.mkdir(parents=True, exist_ok=True)

        converted_files = []
        for word_file in word_dir.glob("*.docx"):
            try:
                target = WordToPdfConverter._replace_docx_in_filename(
                    word_file.with_suffix('.pdf')
                )
                pdf_path = pdf_dir / target.name if pdf_dir else target
                converted_files.append(WordToPdfConverter.convert_to_pdf(word_file, pdf_path))
            except Exception as e:
                print(f"转换 {word_file} 失败: {str(e)}")
        return converted_files

    @staticmethod
    def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
        """Save a python-docx ``Document`` to a temporary file and convert it.

        Args:
            doc: Object with a ``save(path)`` method (python-docx Document).
            output_dir: Optional output directory; defaults to the current
                working directory.

        Returns:
            Path of the generated PDF file.

        Raises:
            Exception: If saving or converting fails; the original error is
                chained.
        """
        temp_docx = None
        try:
            output_dir = Path(output_dir) if output_dir else Path.cwd()
            output_dir.mkdir(parents=True, exist_ok=True)
            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
            doc.save(temp_docx)
            # Return the path the converter actually wrote to.
            return WordToPdfConverter.convert_to_pdf(temp_docx, temp_docx.with_suffix('.pdf'))
        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}") from e
        finally:
            # Remove the temporary Word file on success and on failure.
            # ``temp_docx`` is still None when the failure happened before
            # the temporary path was built.
            if temp_docx is not None and temp_docx.exists():
                temp_docx.unlink()

View File

@@ -0,0 +1,246 @@
import re
from docx import Document
from docx.shared import Cm, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from datetime import datetime
import docx
from docx.oxml import shared
from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word
class WordFormatter:
    """Build a Word document for a chat record.

    Page layout and fonts follow the Chinese official-document format
    standard GB/T 9704-2012.
    """

    def __init__(self):
        self.doc = Document()
        self._setup_document()
        self._create_styles()

    def _setup_document(self):
        """Apply page size, margins and the page header to every section."""
        for section in self.doc.sections:
            # A4 page.
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)
            # Margins: top 37mm, bottom 35mm, left 28mm, right 26mm.
            section.top_margin = Cm(3.7)
            section.bottom_margin = Cm(3.5)
            section.left_margin = Cm(2.8)
            section.right_margin = Cm(2.6)
            section.header_distance = Cm(2.0)
            section.footer_distance = Cm(2.0)
            # Centered header text.
            header_para = section.header.paragraphs[0]
            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            header_run = header_para.add_run("GPT-Academic学术对话 (体验地址https://auth.gpt-academic.top/)")
            header_run.font.name = '仿宋'
            # The East Asian font must be set separately for CJK text.
            header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
            header_run.font.size = Pt(9)

    def _add_paragraph_style(self, name, font_name, size_pt, line_spacing_rule, *,
                             bold=None, alignment=None, space_before=None,
                             space_after=None, left_indent=None,
                             first_line_indent=None):
        """Add a paragraph style to the document and return it.

        Spacing and indent values are in points; arguments left at None are
        not set on the style.
        """
        style = self.doc.styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
        style.font.name = font_name
        style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        style.font.size = Pt(size_pt)
        if bold is not None:
            style.font.bold = bold
        paragraph_format = style.paragraph_format
        paragraph_format.line_spacing_rule = line_spacing_rule
        if alignment is not None:
            paragraph_format.alignment = alignment
        for attr, points in (
            ('space_before', space_before),
            ('space_after', space_after),
            ('left_indent', left_indent),
            ('first_line_indent', first_line_indent),
        ):
            if points is not None:
                setattr(paragraph_format, attr, Pt(points))
        return style

    def _create_styles(self):
        """Create the paragraph styles used by the document."""
        one_and_half = WD_LINE_SPACING.ONE_POINT_FIVE
        # Body text.
        self._add_paragraph_style('Normal_Custom', '仿宋', 12, one_and_half, space_after=0)
        # Question.
        self._add_paragraph_style(
            'Question_Style', '黑体', 14, one_and_half, bold=True,
            space_before=12, space_after=6, left_indent=0,
        )
        # Answer.
        self._add_paragraph_style(
            'Answer_Style', '仿宋', 12, one_and_half,
            space_before=6, space_after=12, left_indent=0,
        )
        # Document title.
        self._add_paragraph_style(
            'Title_Custom', '黑体', 22, one_and_half, bold=True,
            alignment=WD_PARAGRAPH_ALIGNMENT.CENTER, space_before=0, space_after=24,
        )
        # Reference entry: hanging indent of 21pt.
        self._add_paragraph_style(
            'Reference_Style', '宋体', 10.5, WD_LINE_SPACING.SINGLE,
            space_before=3, space_after=3, left_indent=21, first_line_indent=-21,
        )
        # "References" heading.
        self._add_paragraph_style(
            'Reference_Title_Style', '黑体', 16, one_and_half, bold=True,
            space_before=24, space_after=12,
        )

    @staticmethod
    def _resolve_doi_url(doi):
        """Return the hyperlink target for a paper's DOI field.

        A DOI field that already holds a URL (e.g. an arXiv link) is used
        as-is; a plain DOI is resolved through doi.org.
        """
        doi = str(doi)
        if doi.startswith(('http://', 'https://')):
            return doi
        if 'arxiv.org' in doi:
            return f'http://{doi}'
        return f'https://doi.org/{doi}'

    def _add_reference(self, idx, paper):
        """Append one numbered reference paragraph for ``paper``."""
        ref_para = self.doc.add_paragraph(style='Reference_Style')
        ref_para.add_run(f'[{idx}] ').bold = True

        # At most three authors are listed; more are abbreviated.
        all_authors = paper.authors or []
        authors = ', '.join(all_authors[:3])
        if len(all_authors) > 3:
            authors += ' et al.'
        ref_para.add_run(f'{authors}. ')

        # The title is written exactly once: as a hyperlink when a URL is
        # available, otherwise (or if linking fails) as a plain italic run.
        title = str(paper.title or '')
        url = getattr(paper, 'url', None)
        linked = False
        if url:
            try:
                self._add_hyperlink(ref_para, title, url, italic=True)
                linked = True
            except Exception as e:
                print(f"添加超链接失败: {str(e)}")
        if not linked:
            ref_para.add_run(title).italic = True

        if paper.venue_name:
            ref_para.add_run(f'. {paper.venue_name}')
        if paper.year:
            ref_para.add_run(f', {paper.year}')
        if paper.doi:
            ref_para.add_run('. ')
            self._add_hyperlink(ref_para, f'DOI: {paper.doi}', self._resolve_doi_url(paper.doi))
        ref_para.add_run('.')

    def create_document(self, question: str, answer: str, ranked_papers: list = None):
        """Write the chat record into the document and return it.

        Args:
            question: The user's question.
            answer: The answer (Markdown is converted via
                ``convert_markdown_to_word``).
            ranked_papers: Optional list of paper records to cite.

        Returns:
            The python-docx document.

        Raises:
            Exception: Any error outside the per-reference handling is logged
                with its traceback and re-raised.
        """
        try:
            # Title.
            title_para = self.doc.add_paragraph(style='Title_Custom')
            title_para.add_run('GPT-Academic 对话记录')

            # Date. The CJK unit characters are added outside the format
            # spec, so only ASCII is handed to strftime.
            now = datetime.now()
            date_para = self.doc.add_paragraph()
            date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            date_run = date_para.add_run(f"{now:%Y}年{now:%m}月{now:%d}日")
            date_run.font.name = '仿宋'
            date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
            date_run.font.size = Pt(16)

            self.doc.add_paragraph()  # blank line

            # Question and answer.
            q_para = self.doc.add_paragraph(style='Question_Style')
            q_para.add_run('问题:').bold = True
            q_para.add_run(str(question))
            a_para = self.doc.add_paragraph(style='Answer_Style')
            a_para.add_run('回答:').bold = True
            a_para.add_run(convert_markdown_to_word(str(answer)))

            # References.
            if ranked_papers:
                ref_title = self.doc.add_paragraph(style='Reference_Title_Style')
                ref_title.add_run("参考文献")
                for idx, paper in enumerate(ranked_papers, 1):
                    try:
                        self._add_reference(idx, paper)
                    except Exception as e:
                        # Best effort: one bad record must not abort the
                        # whole document.
                        print(f"添加第 {idx} 篇参考文献失败: {str(e)}")
                        continue
            return self.doc
        except Exception as e:
            print(f"Word文档创建失败: {str(e)}")
            import traceback
            print(f"详细错误信息: {traceback.format_exc()}")
            raise

    def _create_hyperlink_style(self):
        """Return the 'Hyperlink' character style, creating it if needed."""
        from docx.shared import RGBColor

        styles = self.doc.styles
        if 'Hyperlink' not in styles:
            hyperlink_style = styles.add_style('Hyperlink', WD_STYLE_TYPE.CHARACTER)
            # Blue (#0066CC); python-docx expects an RGBColor, not an int.
            hyperlink_style.font.color.rgb = RGBColor(0x00, 0x66, 0xCC)
            hyperlink_style.font.underline = True
        return styles['Hyperlink']

    def _add_hyperlink(self, paragraph, text, url, italic=False):
        """Append an external hyperlink run to ``paragraph``.

        Args:
            paragraph: Target paragraph.
            text: Visible link text.
            url: Link target.
            italic: Render the link text in italics.
        """
        # The run below references the style by name, so it has to exist.
        self._create_hyperlink_style()

        # The hyperlink is added at XML level.
        part = paragraph.part
        r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
        hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
        hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)

        new_run = docx.oxml.shared.OxmlElement('w:r')
        rPr = docx.oxml.shared.OxmlElement('w:rPr')
        rStyle = docx.oxml.shared.OxmlElement('w:rStyle')
        rStyle.set(docx.oxml.shared.qn('w:val'), 'Hyperlink')
        rPr.append(rStyle)
        if italic:
            rPr.append(docx.oxml.shared.OxmlElement('w:i'))

        t = docx.oxml.shared.OxmlElement('w:t')
        t.text = text
        new_run.append(rPr)
        new_run.append(t)
        hyperlink.append(new_run)
        paragraph._p.append(hyperlink)