Master 4.0 (#2210)

* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能，支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能，支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00
parent 65a4cf59c2
commit 8042750d41
79 changed files with 20850 additions and 57 deletions
--- a/crazy_functions/review_fns/conversation_doc/endnote_doc.py
+++ b/crazy_functions/review_fns/conversation_doc/endnote_doc.py
@@ -0,0 +1,68 @@
+from typing import List
+from crazy_functions.review_fns.data_sources.base_source import PaperMetadata
+
+class EndNoteFormatter:
+    """Generate reference lists in EndNote's tagged (``%X value``) text format."""
+
+    # EndNote "%0" reference type for each lower-cased venue type. Any venue
+    # type not listed here falls back to _DEFAULT_REFERENCE_TYPE.
+    _REFERENCE_TYPES = {
+        'conference': 'Conference Paper',
+        'preprint': 'Electronic Article',
+    }
+    _DEFAULT_REFERENCE_TYPE = 'Journal Article'
+
+    def __init__(self):
+        pass
+
+    def create_document(self, papers: List[PaperMetadata]) -> str:
+        """Render ``papers`` as EndNote tagged text.
+
+        Each paper becomes one record made of ``%0`` (type), ``%T`` (title),
+        ``%A`` (one line per author), ``%D`` (year), ``%J`` (venue), ``%R`` /
+        ``%U`` (DOI / URL), ``%X`` (abstract) and ``%I`` (one line per
+        institution) lines, followed by a blank line.
+
+        Args:
+            papers: Papers to export.
+
+        Returns:
+            str: The concatenated records; an empty string for no papers.
+        """
+        records = []
+
+        for paper in papers:
+            # Decide the reference type for THIS record only. The type must
+            # never be patched in afterwards with str.replace on the whole
+            # output: that would retype earlier records and rewrite titles or
+            # abstracts that happen to contain "Journal Article".
+            venue_type = getattr(paper, 'venue_type', None)
+            reference_type = self._DEFAULT_REFERENCE_TYPE
+            if venue_type:
+                reference_type = self._REFERENCE_TYPES.get(
+                    venue_type.lower(), self._DEFAULT_REFERENCE_TYPE
+                )
+
+            lines = [f"%0 {reference_type}", f"%T {paper.title}"]
+
+            # Authors: one %A line each (tolerate a missing/None author list).
+            lines.extend(f"%A {author}" for author in (paper.authors or []))
+
+            if paper.year:
+                lines.append(f"%D {paper.year}")
+
+            # Venue: prefer the explicit venue_name, fall back to venue.
+            venue = getattr(paper, 'venue_name', None) or getattr(paper, 'venue', None)
+            if venue:
+                lines.append(f"%J {venue}")
+
+            # DOI takes precedence over a plain URL.
+            if paper.doi:
+                lines.append(f"%R {paper.doi}")
+                lines.append(f"%U https://doi.org/{paper.doi}")
+            elif paper.url:
+                lines.append(f"%U {paper.url}")
+
+            if paper.abstract:
+                lines.append(f"%X {paper.abstract}")
+
+            # Institutions: one %I line each (tolerate a missing/None list).
+            institutions = getattr(paper, 'institutions', None) or []
+            lines.extend(f"%I {institution}" for institution in institutions)
+
+            # Records are separated by one blank line.
+            records.append("\n".join(lines) + "\n\n")
+
+        return "".join(records)
--- a/crazy_functions/review_fns/conversation_doc/excel_doc.py
+++ b/crazy_functions/review_fns/conversation_doc/excel_doc.py
@@ -0,0 +1,211 @@
+import re
+import os
+import pandas as pd
+from datetime import datetime
+
+
+class ExcelTableFormatter:
+    """Convert Markdown tables found in chat answers into an Excel workbook.
+
+    Every successfully parsed table is written to its own worksheet of
+    ``self.workbook``.
+    """
+
+    def __init__(self):
+        """Create the workbook and reset the running table counter."""
+        # openpyxl is imported here rather than at module level.
+        from openpyxl import Workbook
+        self.workbook = Workbook()
+        self._table_count = 0
+        self._current_sheet = None
+
+    def _normalize_table_row(self, row):
+        """Split one Markdown table line into a list of stripped cell strings.
+
+        At most one leading and one trailing ``|`` are dropped before
+        splitting, so ``"| a | b |"`` yields ``['a', 'b']``.
+        """
+        row = row.strip()
+        if row.startswith('|'):
+            row = row[1:]
+        if row.endswith('|'):
+            row = row[:-1]
+        return [cell.strip() for cell in row.split('|')]
+
+    def _is_separator_row(self, row):
+        """Return True if ``row`` consists only of ``-``, ``:``, ``|`` and whitespace."""
+        clean_row = re.sub(r'[\s|]', '', row)
+        return bool(re.match(r'^[-:]+$', clean_row))
+
+    def _extract_tables_from_text(self, text):
+        """Collect runs of consecutive lines containing ``|`` as candidate tables.
+
+        Args:
+            text: Answer text; anything that is not a ``str`` yields ``[]``.
+
+        Returns:
+            list[list[str]]: One list of stripped lines per candidate table.
+            Runs shorter than two lines are discarded.
+        """
+        if not isinstance(text, str):
+            return []
+
+        tables = []
+        current_table = []
+
+        def flush():
+            # Keep the run only if it has at least two lines, then start over.
+            if len(current_table) >= 2:
+                tables.append(list(current_table))
+            current_table.clear()
+
+        for line in text.split('\n'):
+            line = line.strip()
+            if '|' in line:
+                current_table.append(line)
+            else:
+                # A blank line or a line without '|' ends the current run.
+                flush()
+
+        flush()
+        return tables
+
+    def _parse_table(self, table_lines):
+        """Turn the raw lines of one table into headers and data rows.
+
+        The first line is the header. Data rows are the lines after the first
+        separator row; when the table has no separator row at all, every line
+        after the header is data (previously the first data row was dropped).
+
+        Args:
+            table_lines: Lines of one table as returned by
+                ``_extract_tables_from_text``.
+
+        Returns:
+            dict | None: ``{'headers': [...], 'data': [[...], ...]}``, or
+            ``None`` when there is no header, no data row, or parsing failed.
+        """
+        if not table_lines:
+            return None
+
+        try:
+            headers = self._normalize_table_row(table_lines[0])
+
+            # Default 0 means "no separator found": data then starts right
+            # after the header line instead of skipping a real data row.
+            separator_index = next(
+                (i for i, line in enumerate(table_lines) if self._is_separator_row(line)),
+                0
+            )
+
+            width = len(headers)
+            data_rows = []
+            for line in table_lines[separator_index + 1:]:
+                cells = self._normalize_table_row(line)
+                # Pad short rows and truncate long ones to the header width.
+                cells = (cells + [''] * (width - len(cells)))[:width]
+                data_rows.append(cells)
+
+            if headers and data_rows:
+                return {
+                    'headers': headers,
+                    'data': data_rows
+                }
+        except Exception as e:
+            # Best effort: a malformed table is reported and skipped.
+            print(f"解析表格时发生错误: {str(e)}")
+
+        return None
+
+    def _create_sheet(self, question_num, table_num):
+        """Create a worksheet named ``Q<question>_T<table>``.
+
+        Names longer than 31 characters are replaced by ``Table<count>``, and
+        a name already present in the workbook gets an ``_HHMMSS`` suffix.
+        """
+        sheet_name = f'Q{question_num}_T{table_num}'
+        if len(sheet_name) > 31:
+            sheet_name = f'Table{self._table_count}'
+
+        if sheet_name in self.workbook.sheetnames:
+            sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}'
+
+        return self.workbook.create_sheet(title=sheet_name)
+
+    def create_document(self, history):
+        """Write every table found in the answers of ``history`` to the workbook.
+
+        Args:
+            history: Chat history list; the entries at odd indices are
+                scanned for tables.
+
+        Returns:
+            Workbook: ``self.workbook`` if at least one table was written
+            during this call, otherwise ``None``.
+        """
+        has_tables = False
+
+        # Drop the sheet openpyxl creates by default. Guarded so that calling
+        # this method a second time on the same formatter does not raise.
+        if 'Sheet' in self.workbook.sheetnames:
+            self.workbook.remove(self.workbook['Sheet'])
+
+        # Odd indices of the history hold the answers.
+        for question_num, answer in enumerate(history[1::2], 1):
+            for table_lines in self._extract_tables_from_text(answer):
+                parsed_table = self._parse_table(table_lines)
+                if not parsed_table:
+                    continue
+
+                self._table_count += 1
+                sheet = self._create_sheet(question_num, self._table_count)
+
+                # Header goes to row 1.
+                for col, header in enumerate(parsed_table['headers'], 1):
+                    sheet.cell(row=1, column=col, value=header)
+
+                # Data starts at row 2.
+                for row_idx, row_data in enumerate(parsed_table['data'], 2):
+                    for col_idx, value in enumerate(row_data, 1):
+                        sheet.cell(row=row_idx, column=col_idx, value=value)
+
+                has_tables = True
+
+        return self.workbook if has_tables else None
+
+
+def save_chat_tables(history, save_dir, base_name):
+    """Export the Markdown tables of a chat history to ``<save_dir>/<base_name>.xlsx``.
+
+    Args:
+        history: Chat history list.
+        save_dir: Output directory; created if it does not exist.
+        base_name: File name without extension.
+
+    Returns:
+        list: Paths of the files written. Empty when the history contains no
+        table or when saving failed (the error is printed, not raised).
+    """
+    saved_paths = []
+
+    try:
+        workbook = ExcelTableFormatter().create_document(history)
+
+        # Nothing to write when no table was found.
+        if workbook is None:
+            return saved_paths
+
+        os.makedirs(save_dir, exist_ok=True)
+        target = os.path.join(save_dir, base_name + '.xlsx')
+
+        workbook.save(target)
+        saved_paths.append(target)
+        print(f"已保存表格到Excel文件: {target}")
+    except Exception as e:
+        print(f"保存Excel格式失败: {str(e)}")
+
+    return saved_paths
+
+
+# Usage example
+if __name__ == "__main__":
+    # Sample history: questions and answers alternate, answers at odd indices.
+    history = [
+        "问题1",
+        """这是第一个表格：
+        | A | B | C |
+        |---|---|---|
+        | 1 | 2 | 3 |""",
+
+        "问题2",
+        "这是没有表格的回答",
+
+        "问题3",
+        """回答包含多个表格：
+        | Name | Age |
+        |------|-----|
+        | Tom  | 20  |
+
+        第二个表格：
+        | X | Y |
+        |---|---|
+        | 1 | 2 |"""
+    ]
+
+    # Write the tables to output/chat_tables.xlsx
+    saved_files = save_chat_tables(history, save_dir="output", base_name="chat_tables")
--- a/crazy_functions/review_fns/conversation_doc/html_doc.py
+++ b/crazy_functions/review_fns/conversation_doc/html_doc.py
@@ -0,0 +1,472 @@
+class HtmlFormatter:
+    """Generate a standalone HTML page archiving one chat exchange."""
+
+    def __init__(self):
+        """Store the base stylesheet in ``self.css_styles``."""
+        self.css_styles = """
+        :root {
+            --primary-color: #2563eb;
+            --primary-light: #eff6ff;
+            --secondary-color: #1e293b;
+            --background-color: #f8fafc;
+            --text-color: #334155;
+            --border-color: #e2e8f0;
+            --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
+        }
+
+        body {
+            font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            line-height: 1.8;
+            margin: 0;
+            padding: 2rem;
+            color: var(--text-color);
+            background-color: var(--background-color);
+        }
+
+        .container {
+            max-width: 1200px;
+            margin: 0 auto;
+            background: white;
+            padding: 2rem;
+            border-radius: 16px;
+            box-shadow: var(--card-shadow);
+        }
+        ::selection {
+            background: var(--primary-light);
+            color: var(--primary-color);
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(20px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        
+        @keyframes slideIn {
+            from { transform: translateX(-20px); opacity: 0; }
+            to { transform: translateX(0); opacity: 1; }
+        }
+        
+        .container {
+            animation: fadeIn 0.6s ease-out;
+        }
+        
+        .QaBox {
+            animation: slideIn 0.5s ease-out;
+            transition: all 0.3s ease;
+        }
+        
+        .QaBox:hover {
+            transform: translateX(5px);
+        }
+        .Question, .Answer, .historyBox {
+            transition: all 0.3s ease;
+        }
+        .chat-title {
+            color: var(--primary-color);
+            font-size: 2em;
+            text-align: center;
+            margin: 1rem 0 2rem;
+            padding-bottom: 1rem;
+            border-bottom: 2px solid var(--primary-color);
+        }
+
+        .chat-body {
+            display: flex;
+            flex-direction: column;
+            gap: 1.5rem;
+            margin: 2rem 0;
+        }
+
+        .QaBox {
+            background: white;
+            padding: 1.5rem;
+            border-radius: 8px;
+            border-left: 4px solid var(--primary-color);
+            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+            margin-bottom: 1.5rem;
+        }
+
+        .Question {
+            color: var(--secondary-color);
+            font-weight: 500;
+            margin-bottom: 1rem;
+        }
+
+        .Answer {
+            color: var(--text-color);
+            background: var(--primary-light);
+            padding: 1rem;
+            border-radius: 6px;
+        }
+
+        .history-section {
+            margin-top: 3rem;
+            padding-top: 2rem;
+            border-top: 2px solid var(--border-color);
+        }
+
+        .history-title {
+            color: var(--secondary-color);
+            font-size: 1.5em;
+            margin-bottom: 1.5rem;
+            text-align: center;
+        }
+
+        .historyBox {
+            background: white;
+            padding: 1rem;
+            margin: 0.5rem 0;
+            border-radius: 6px;
+            border: 1px solid var(--border-color);
+        }
+
+        @media (prefers-color-scheme: dark) {
+            :root {
+                --background-color: #0f172a;
+                --text-color: #e2e8f0;
+                --border-color: #1e293b;
+            }
+            
+            .container, .QaBox {
+                background: #1e293b;
+            }
+        }
+        """
+
+    def create_document(self, question: str, answer: str, ranked_papers: list = None) -> str:
+        """Build the complete HTML document for one question/answer pair.
+
+        The answer is embedded as escaped text and rendered as Markdown in the
+        browser by marked.js (loaded from a CDN). All question, answer and
+        paper metadata values are HTML-escaped before being interpolated.
+
+        Args:
+            question: The user's question.
+            answer: The assistant's answer (Markdown source).
+            ranked_papers: Optional list of paper objects; when non-empty a
+                reference section with one card per paper is appended.
+
+        Returns:
+            str: The full HTML document.
+        """
+        # Function-scope imports: this module has no import section.
+        import re
+        from html import escape
+
+        # The browser decodes the escaped answer back to the original Markdown
+        # when the script below reads it through textContent.
+        chat_content = f'''
+        <div class="QaBox">
+            <div class="Question">{escape(str(question))}</div>
+            <div class="Answer markdown-body" id="answer-content">{escape(str(answer))}</div>
+        </div>
+        '''
+
+        references_content = ""
+        if ranked_papers:
+            references_content = '<div class="history-section"><h2 class="history-title">参考文献</h2>'
+            for idx, paper in enumerate(ranked_papers, 1):
+                author_names = [escape(str(name)) for name in (paper.authors or [])]
+                authors = ', '.join(author_names)
+                title = escape(str(paper.title))
+                venue_name = getattr(paper, 'venue_name', None)
+                venue_type = getattr(paper, 'venue_type', None)
+                year_text = escape(str(paper.year)) if paper.year else "未知"
+                abstract_text = escape(str(paper.abstract)) if paper.abstract else "无摘要"
+
+                # Citation count
+                if paper.citations is not None:
+                    citations_info = f"被引用次数：{escape(str(paper.citations))}"
+                else:
+                    citations_info = "引用信息未知"
+
+                # The doi field may hold an arXiv URL instead of a real DOI.
+                doi = str(paper.doi) if paper.doi else ""
+                arxiv_url = ""
+                if 'arxiv.org' in doi:
+                    arxiv_url = doi if doi.startswith('http') else f'http://{doi}'
+
+                # Download links
+                download_links = []
+                if arxiv_url:
+                    download_links.append(f'<a href="{escape(arxiv_url)}">arXiv链接</a>')
+                    # Strip only a trailing version suffix ("v2"); splitting on
+                    # the first "v" would truncate ids or URLs that contain one.
+                    arxiv_id = re.sub(r'v\d+$', '', arxiv_url.split('abs/')[-1])
+                    download_links.append(f'<a href="https://arxiv.org/pdf/{escape(arxiv_id)}.pdf">PDF下载</a>')
+                elif doi:
+                    download_links.append(f'<a href="https://doi.org/{escape(doi)}">DOI: {escape(doi)}</a>')
+
+                # The plain URL is listed only when it is not an arXiv link.
+                url = getattr(paper, 'url', None)
+                if url and 'arxiv.org' not in str(url):
+                    download_links.append(f'<a href="{escape(str(url))}">原文链接</a>')
+                download_section = ' | '.join(download_links) if download_links else "无直接下载链接"
+
+                # Source information
+                source_info = []
+                if venue_type:
+                    source_info.append(f"类型：{escape(str(venue_type))}")
+                if venue_name:
+                    source_info.append(f"来源：{escape(str(venue_name))}")
+
+                # Journal metrics, each shown as a badge when present
+                for attr, label in (('if_factor', 'IF'), ('jcr_division', 'JCR分区'), ('cas_division', '中科院分区')):
+                    metric = getattr(paper, attr, None)
+                    if metric:
+                        source_info.append(f"<span class='journal-metric'>{label}: {escape(str(metric))}</span>")
+
+                venue_info = getattr(paper, 'venue_info', None)
+                if venue_info:
+                    if venue_info.get('journal_ref'):
+                        source_info.append(f"期刊参考：{escape(str(venue_info['journal_ref']))}")
+                    if venue_info.get('publisher'):
+                        source_info.append(f"出版商：{escape(str(venue_info['publisher']))}")
+                source_section = ' | '.join(source_info) if source_info else ""
+                source_html = f'<p class="paper-source">{source_section}</p>' if source_section else ""
+
+                # Standard citation line: at most three authors, then "et al."
+                cited_authors = author_names[:3]
+                if len(author_names) > 3:
+                    cited_authors.append("et al.")
+                author_text = ", ".join(cited_authors)
+                # Avoid a doubled period after "et al." or a trailing initial.
+                if author_text and not author_text.endswith('.'):
+                    author_text += "."
+                standard_citation = f"[{idx}] "
+                if author_text:
+                    standard_citation += author_text + " "
+                standard_citation += f"<i>{title}</i>"
+                if venue_name:
+                    standard_citation += f". {escape(str(venue_name))}"
+                if paper.year:
+                    standard_citation += f", {year_text}"
+                if arxiv_url:
+                    standard_citation += f". {escape(arxiv_url)}"
+                elif doi:
+                    standard_citation += f". DOI: {escape(doi)}"
+                standard_citation += "."
+
+                references_content += f'''
+                <div class="historyBox">
+                    <div class="entry">
+                        <p class="paper-title"><b>[{idx}]</b> <i>{title}</i></p>
+                        <p class="paper-authors">作者：{authors}</p>
+                        <p class="paper-year">发表年份：{year_text}</p>
+                        <p class="paper-citations">{citations_info}</p>
+                        {source_html}
+                        <p class="paper-abstract">摘要：{abstract_text}</p>
+                        <p class="paper-links">链接：{download_section}</p>
+                        <div class="standard-citation">
+                            <p class="citation-title">标准引用格式：</p>
+                            <p class="citation-text">{standard_citation}</p>
+                            <button class="copy-btn" onclick="copyToClipboard(this.previousElementSibling)">复制引用格式</button>
+                        </div>
+                    </div>
+                </div>
+                '''
+            references_content += '</div>'
+
+        # Styles for the reference cards
+        css_additions = """
+            .paper-title {
+                font-size: 1.1em;
+                margin-bottom: 0.5em;
+            }
+            .paper-authors {
+                color: var(--secondary-color);
+                margin: 0.3em 0;
+            }
+            .paper-year, .paper-citations {
+                color: var(--text-color);
+                margin: 0.3em 0;
+            }
+            .paper-source {
+                color: var(--text-color);
+                font-style: italic;
+                margin: 0.3em 0;
+            }
+            .paper-abstract {
+                margin: 0.8em 0;
+                padding: 0.8em;
+                background: var(--primary-light);
+                border-radius: 4px;
+            }
+            .paper-links {
+                margin-top: 0.5em;
+            }
+            .paper-links a {
+                color: var(--primary-color);
+                text-decoration: none;
+                margin-right: 1em;
+            }
+            .paper-links a:hover {
+                text-decoration: underline;
+            }
+            .standard-citation {
+                margin-top: 1em;
+                padding: 1em;
+                background: #f8fafc;
+                border-radius: 4px;
+                border: 1px solid var(--border-color);
+            }
+            
+            .citation-title {
+                font-weight: bold;
+                margin-bottom: 0.5em;
+                color: var(--secondary-color);
+            }
+            
+            .citation-text {
+                font-family: 'Times New Roman', Times, serif;
+                line-height: 1.6;
+                margin-bottom: 0.5em;
+                padding: 0.5em;
+                background: white;
+                border-radius: 4px;
+                border: 1px solid var(--border-color);
+            }
+            
+            .copy-btn {
+                background: var(--primary-color);
+                color: white;
+                border: none;
+                padding: 0.5em 1em;
+                border-radius: 4px;
+                cursor: pointer;
+                font-size: 0.9em;
+                transition: background-color 0.2s;
+            }
+            
+            .copy-btn:hover {
+                background: #1e40af;
+            }
+            
+            @media (prefers-color-scheme: dark) {
+                .standard-citation {
+                    background: #1e293b;
+                }
+                .citation-text {
+                    background: #0f172a;
+                }
+            }
+            
+            /* 添加期刊指标样式 */
+            .journal-metric {
+                display: inline-block;
+                padding: 0.2em 0.6em;
+                margin: 0 0.3em;
+                background: var(--primary-light);
+                border-radius: 4px;
+                font-weight: 500;
+                color: var(--primary-color);
+            }
+            
+            @media (prefers-color-scheme: dark) {
+                .journal-metric {
+                    background: #1e293b;
+                    color: #60a5fa;
+                }
+            }
+        """
+
+        # Page script: copy-to-clipboard for citations and Markdown rendering
+        js_code = """
+        <script>
+        // 复制功能
+        function copyToClipboard(element) {
+            const text = element.innerText;
+            navigator.clipboard.writeText(text).then(function() {
+                const btn = element.nextElementSibling;
+                const originalText = btn.innerText;
+                btn.innerText = '已复制！';
+                setTimeout(() => {
+                    btn.innerText = originalText;
+                }, 2000);
+            }).catch(function(err) {
+                console.error('复制失败:', err);
+            });
+        }
+
+        // Markdown解析
+        document.addEventListener('DOMContentLoaded', function() {
+            const answerContent = document.getElementById('answer-content');
+            if (answerContent) {
+                const markdown = answerContent.textContent;
+                answerContent.innerHTML = marked.parse(markdown);
+            }
+        });
+        </script>
+        """
+
+        # Combine into a local value. Appending to self.css_styles would make
+        # the stylesheet grow by one copy on every call.
+        css = self.css_styles + css_additions
+
+        return f"""
+        <!DOCTYPE html>
+        <html lang="zh-CN">
+        <head>
+            <meta charset="utf-8">
+            <meta name="viewport" content="width=device-width, initial-scale=1">
+            <title>学术对话存档</title>
+            <!-- 添加 marked.js -->
+            <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+            <!-- 添加 GitHub Markdown CSS -->
+            <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/sindresorhus/github-markdown-css@4.0.0/github-markdown.min.css">
+            <style>
+                {css}
+                /* 添加 Markdown 相关样式 */
+                .markdown-body {{
+                    font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+                    padding: 1rem;
+                    background: var(--primary-light);
+                    border-radius: 6px;
+                }}
+                .markdown-body pre {{
+                    background-color: #f6f8fa;
+                    border-radius: 6px;
+                    padding: 16px;
+                    overflow: auto;
+                }}
+                .markdown-body code {{
+                    background-color: rgba(175,184,193,0.2);
+                    border-radius: 6px;
+                    padding: 0.2em 0.4em;
+                    font-size: 85%;
+                }}
+                .markdown-body pre code {{
+                    background-color: transparent;
+                    padding: 0;
+                }}
+                .markdown-body blockquote {{
+                    border-left: 0.25em solid #d0d7de;
+                    padding: 0 1em;
+                    color: #656d76;
+                }}
+                .markdown-body table {{
+                    border-collapse: collapse;
+                    width: 100%;
+                    margin: 1em 0;
+                }}
+                .markdown-body table th,
+                .markdown-body table td {{
+                    border: 1px solid #d0d7de;
+                    padding: 6px 13px;
+                }}
+                .markdown-body table tr:nth-child(2n) {{
+                    background-color: #f6f8fa;
+                }}
+                @media (prefers-color-scheme: dark) {{
+                    .markdown-body {{
+                        background: #1e293b;
+                        color: #e2e8f0;
+                    }}
+                    .markdown-body pre {{
+                        background-color: #0f172a;
+                    }}
+                    .markdown-body code {{
+                        background-color: rgba(99,110,123,0.4);
+                    }}
+                    .markdown-body blockquote {{
+                        border-left-color: #30363d;
+                        color: #8b949e;
+                    }}
+                    .markdown-body table th,
+                    .markdown-body table td {{
+                        border-color: #30363d;
+                    }}
+                    .markdown-body table tr:nth-child(2n) {{
+                        background-color: #0f172a;
+                    }}
+                }}
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <h1 class="chat-title">学术对话存档</h1>
+                <div class="chat-body">
+                    {chat_content}
+                    {references_content}
+                </div>
+            </div>
+            {js_code}
+        </body>
+        </html>
+        """
--- a/crazy_functions/review_fns/conversation_doc/markdown_doc.py
+++ b/crazy_functions/review_fns/conversation_doc/markdown_doc.py
@@ -0,0 +1,47 @@
+class MarkdownFormatter:
+    """Generate a Markdown record of one question/answer exchange."""
+
+    def __init__(self):
+        self.content = []
+
+    def _add_content(self, text: str):
+        """Append ``text`` to ``self.content`` wrapped in newlines; empty text is ignored."""
+        if text:
+            self.content.append(f"\n{text}\n")
+
+    def create_document(self, question: str, answer: str, ranked_papers: list = None) -> str:
+        """Build the Markdown document.
+
+        The document has a question section, an answer section and, when
+        ``ranked_papers`` is non-empty, a numbered reference list in which at
+        most three authors are named per paper.
+
+        Args:
+            question: The user's question.
+            answer: The assistant's answer.
+            ranked_papers: Optional list of paper objects to cite.
+
+        Returns:
+            str: The sections joined by blank lines.
+        """
+        content = []
+
+        # Question and answer sections
+        content.append("## 问题")
+        content.append(question)
+        content.append("\n## 回答")
+        content.append(answer)
+
+        # Reference list
+        if ranked_papers:
+            content.append("\n## 参考文献")
+            for idx, paper in enumerate(ranked_papers, 1):
+                author_list = list(paper.authors or [])
+                authors = ', '.join(author_list[:3])
+                if len(author_list) > 3:
+                    authors += ' et al.'
+                # Avoid a doubled period after "et al." or a trailing initial.
+                if authors and not authors.endswith('.'):
+                    authors += '.'
+
+                ref = f"[{idx}] {authors} *{paper.title}*" if authors else f"[{idx}] *{paper.title}*"
+                if paper.venue_name:
+                    ref += f". {paper.venue_name}"
+                if paper.year:
+                    ref += f", {paper.year}"
+                if paper.doi:
+                    if 'arxiv.org' in paper.doi:
+                        # The doi field holds an arXiv URL, not a DOI: link to
+                        # it directly instead of prefixing https://doi.org/.
+                        arxiv_url = paper.doi if paper.doi.startswith('http') else f'http://{paper.doi}'
+                        ref += f". [arXiv]({arxiv_url})"
+                    else:
+                        ref += f". [DOI: {paper.doi}](https://doi.org/{paper.doi})"
+
+                content.append(ref)
+
+        return "\n\n".join(content)
--- a/crazy_functions/review_fns/conversation_doc/reference_formatter.py
+++ b/crazy_functions/review_fns/conversation_doc/reference_formatter.py
@@ -0,0 +1,174 @@
+from typing import List
+from crazy_functions.review_fns.data_sources.base_source import PaperMetadata
+import re
+
+class ReferenceFormatter:
+    """Generate a BibTeX bibliography from paper metadata."""
+
+    # Single-pass character escapes for BibTeX/LaTeX text. They MUST be
+    # applied in one pass (str.translate): chained str.replace calls would
+    # re-escape the backslashes inserted by earlier replacements.
+    # NOTE(review): every "-" becomes "--" (en dash), which also affects
+    # hyphenated words and names; kept from the original mapping - confirm.
+    _BIBTEX_ESCAPES = str.maketrans({
+        '&': '\\&',
+        '%': '\\%',
+        '$': '\\$',
+        '#': '\\#',
+        '_': '\\_',
+        '{': '\\{',
+        '}': '\\}',
+        '~': '\\textasciitilde{}',
+        '^': '\\textasciicircum{}',
+        '\\': '\\textbackslash{}',
+        '<': '\\textless{}',
+        '>': '\\textgreater{}',
+        '"': '``',
+        '-': '--',
+        '—': '---',
+    })
+
+    # BibTeX entry type per lower-cased venue type.
+    _ENTRY_TYPES = {
+        'conference': 'inproceedings',
+        'preprint': 'unpublished',
+        'journal': 'article',
+        'book': 'book',
+        'thesis': 'phdthesis',
+    }
+
+    # Words skipped when picking the title word of a cite key.
+    _STOP_WORDS = frozenset({'a', 'an', 'the', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'})
+
+    def __init__(self):
+        pass
+
+    def _sanitize_bibtex(self, text: str) -> str:
+        """Escape characters that are special in BibTeX/LaTeX.
+
+        Args:
+            text: Raw field text; ``None`` or empty yields ``""``.
+
+        Returns:
+            str: ``text`` with each special character replaced exactly once.
+        """
+        if not text:
+            return ""
+        return text.translate(self._BIBTEX_ESCAPES)
+
+    def _generate_cite_key(self, paper: PaperMetadata) -> str:
+        """Build a cite key of the form ``<surname><year><title word>``.
+
+        The surname is the last whitespace-separated token of the first
+        author, the year defaults to ``0000``, and the title word is the first
+        title word longer than two characters that is not a stop word. Only
+        ``a-z`` and ``0-9`` survive in the result.
+        """
+        first_author = ""
+        if paper.authors:
+            name_parts = paper.authors[0].split()
+            # An empty or whitespace-only author name has no surname.
+            if name_parts:
+                first_author = name_parts[-1].lower()
+
+        year = str(paper.year) if paper.year else "0000"
+
+        title_word = ""
+        if paper.title:
+            for word in re.findall(r'\w+', paper.title.lower()):
+                if word not in self._STOP_WORDS and len(word) > 2:
+                    title_word = word
+                    break
+
+        cite_key = f"{first_author}{year}{title_word}"
+
+        # Keep only characters that are safe in a cite key.
+        return re.sub(r'[^a-z0-9]', '', cite_key.lower())
+
+    def _get_entry_type(self, paper: PaperMetadata) -> str:
+        """Map ``paper.venue_type`` to a BibTeX entry type, defaulting to ``article``."""
+        venue_type = getattr(paper, 'venue_type', None)
+        if venue_type:
+            return self._ENTRY_TYPES.get(venue_type.lower(), 'article')
+        return 'article'
+
+    def _format_authors(self, authors) -> str:
+        """Join author names as ``Surname, Given`` separated by ``and``."""
+        processed_authors = []
+        for author in authors:
+            names = author.split()
+            if len(names) > 1:
+                # Assume the last token is the surname, the rest given names.
+                processed_authors.append(f"{names[-1]}, {' '.join(names[:-1])}")
+            else:
+                processed_authors.append(author)
+        return " and ".join(self._sanitize_bibtex(author) for author in processed_authors)
+
+    def _format_entry(self, paper: PaperMetadata) -> str:
+        """Render one paper as a BibTeX entry followed by a blank line."""
+        entry_type = self._get_entry_type(paper)
+        cite_key = self._generate_cite_key(paper)
+
+        # (field name, already-formatted value) pairs in output order.
+        fields = []
+
+        if paper.title:
+            fields.append(("title", self._sanitize_bibtex(paper.title)))
+
+        if paper.authors:
+            fields.append(("author", self._format_authors(paper.authors)))
+
+        if paper.year:
+            fields.append(("year", paper.year))
+
+        # Venue: venue_name wins over venue. With venue_name, only
+        # inproceedings and article entries get a venue field.
+        venue_name = getattr(paper, 'venue_name', None)
+        venue = getattr(paper, 'venue', None)
+        if venue_name:
+            if entry_type == 'inproceedings':
+                fields.append(("booktitle", self._sanitize_bibtex(venue_name)))
+            elif entry_type == 'article':
+                fields.append(("journal", self._sanitize_bibtex(venue_name)))
+                # Journal details; venue_info may be missing or None.
+                venue_info = getattr(paper, 'venue_info', None) or {}
+                for key in ('volume', 'number', 'pages'):
+                    if venue_info.get(key):
+                        fields.append((key, venue_info[key]))
+        elif venue:
+            venue_field = "booktitle" if entry_type == "inproceedings" else "journal"
+            fields.append((venue_field, self._sanitize_bibtex(venue)))
+
+        if paper.doi:
+            fields.append(("doi", paper.doi))
+
+        # URL: explicit URL first, otherwise derive one from the DOI.
+        if paper.url:
+            fields.append(("url", paper.url))
+        elif paper.doi:
+            fields.append(("url", f"https://doi.org/{paper.doi}"))
+
+        if paper.abstract:
+            fields.append(("abstract", self._sanitize_bibtex(paper.abstract)))
+
+        institutions = getattr(paper, 'institutions', None)
+        if institutions:
+            joined = " and ".join(self._sanitize_bibtex(inst) for inst in institutions)
+            fields.append(("institution", joined))
+
+        # Optional fields are emitted only when they carry a value, so a
+        # present-but-None attribute no longer produces "month = {None}".
+        month = getattr(paper, 'month', None)
+        if month:
+            fields.append(("month", month))
+
+        note = getattr(paper, 'note', None)
+        if note:
+            fields.append(("note", self._sanitize_bibtex(note)))
+
+        # Fields are comma-separated; the last one has no trailing comma.
+        body = ",\n".join(f"  {name} = {{{value}}}" for name, value in fields)
+        if body:
+            body += "\n"
+        return f"@{entry_type}{{{cite_key},\n{body}}}\n\n"
+
+    def create_document(self, papers: List[PaperMetadata]) -> str:
+        """Generate the BibTeX text for ``papers``.
+
+        Args:
+            papers: Papers to export.
+
+        Returns:
+            str: A two-line comment header followed by one entry per paper.
+        """
+        header = "% This file was automatically generated by GPT-Academic\n"
+        header += "% Compatible with: EndNote, Zotero, JabRef, and LaTeX\n\n"
+        return header + "".join(self._format_entry(paper) for paper in papers)
--- a/crazy_functions/review_fns/conversation_doc/word2pdf.py
+++ b/crazy_functions/review_fns/conversation_doc/word2pdf.py
@@ -0,0 +1,138 @@
+from docx2pdf import convert
+import os
+import platform
+from typing import Union
+from pathlib import Path
+from datetime import datetime
+
+class WordToPdfConverter:
+    """Convert Word (.docx) documents to PDF.
+
+    On Linux the conversion is delegated to a headless LibreOffice process;
+    on every other platform it is delegated to ``docx2pdf.convert``.
+    """
+
+    @staticmethod
+    def _replace_docx_in_filename(filename: Union[str, Path]) -> Path:
+        """Return *filename* with every 'docx' in its stem replaced by 'pdf'.
+
+        Only the stem is rewritten; the parent directory and the suffix are
+        kept as they are.
+        Example: 'docx_test.pdf' -> 'pdf_test.pdf'
+        """
+        path = Path(filename)
+        renamed_stem = path.stem.replace('docx', 'pdf')
+        return path.parent / f"{renamed_stem}{path.suffix}"
+
+    @staticmethod
+    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path, None] = None) -> str:
+        """Convert a single Word document to PDF.
+
+        Args:
+            word_path: Path of the Word document to convert.
+            pdf_path: Optional output path. When omitted, the PDF is written
+                next to the Word document. In both cases any 'docx' inside the
+                output file's stem is replaced by 'pdf', so callers should rely
+                on the returned path rather than on the path they passed in.
+
+        Returns:
+            The path of the generated PDF file, as a string.
+
+        Raises:
+            Exception: If the conversion fails for any reason; the original
+                error is attached as ``__cause__``.
+        """
+        # Imported locally so the module-level import block stays unchanged.
+        import shutil
+        import subprocess
+
+        try:
+            word_path = Path(word_path)
+
+            if pdf_path is None:
+                # Derive the output path from the input and rewrite 'docx' in the stem.
+                pdf_path = WordToPdfConverter._replace_docx_in_filename(word_path).with_suffix('.pdf')
+            else:
+                pdf_path = WordToPdfConverter._replace_docx_in_filename(Path(pdf_path))
+
+            if platform.system() == 'Linux':
+                # Linux needs LibreOffice; resolve the executable without spawning a shell.
+                libreoffice = shutil.which('libreoffice')
+                if libreoffice is None:
+                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")
+
+                out_dir = pdf_path.parent
+                out_dir.mkdir(parents=True, exist_ok=True)
+
+                # Pass an argument list instead of a shell string so that quotes or
+                # shell metacharacters in file names cannot alter the command, and
+                # let a non-zero exit status raise instead of being ignored.
+                subprocess.run(
+                    [libreoffice, '--headless', '--convert-to', 'pdf',
+                     str(word_path), '--outdir', str(out_dir)],
+                    check=True,
+                )
+
+                # LibreOffice names its output after the input file and writes it
+                # into --outdir, i.e. into the *target* directory, not next to the
+                # Word document.
+                default_pdf = out_dir / f"{word_path.stem}.pdf"
+                if not default_pdf.exists():
+                    raise FileNotFoundError(f"LibreOffice未生成PDF文件: {default_pdf}")
+                if default_pdf != pdf_path:
+                    os.replace(default_pdf, pdf_path)
+            else:
+                # Windows and macOS use docx2pdf.
+                convert(word_path, pdf_path)
+
+            return str(pdf_path)
+
+        except Exception as e:
+            raise Exception(f"转换PDF失败: {str(e)}") from e
+
+    @staticmethod
+    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path, None] = None) -> list:
+        """Convert every ``*.docx`` file directly inside a directory.
+
+        Conversion is best-effort: a file that fails is reported with
+        ``print`` and skipped, and the remaining files are still processed.
+
+        Args:
+            word_dir: Directory containing the Word documents.
+            pdf_dir: Optional output directory (created if missing). When
+                omitted, each PDF is written next to its Word document.
+
+        Returns:
+            A list with the paths of the PDF files that were generated.
+        """
+        word_dir = Path(word_dir)
+        if pdf_dir:
+            pdf_dir = Path(pdf_dir)
+            pdf_dir.mkdir(parents=True, exist_ok=True)
+
+        converted_files = []
+
+        for word_file in word_dir.glob("*.docx"):
+            # convert_to_pdf applies the 'docx' -> 'pdf' stem rewrite itself, so
+            # only the directory and the '.pdf' suffix are decided here.
+            target = pdf_dir / f"{word_file.stem}.pdf" if pdf_dir else None
+            try:
+                converted_files.append(WordToPdfConverter.convert_to_pdf(word_file, target))
+            except Exception as e:
+                print(f"转换 {word_file} 失败: {str(e)}")
+
+        return converted_files
+
+    @staticmethod
+    def convert_doc_to_pdf(doc, output_dir: Union[str, Path, None] = None) -> str:
+        """Convert an in-memory docx object straight to a PDF file.
+
+        The object is saved to a temporary ``temp_<timestamp>.docx`` file in
+        the output directory, converted, and the temporary file is removed
+        again whether or not the conversion succeeded.
+
+        Args:
+            doc: Object exposing ``save(path)`` (a python-docx ``Document``).
+            output_dir: Optional output directory (created if missing).
+                Defaults to the current working directory.
+
+        Returns:
+            The path of the generated PDF file, as a string.
+
+        Raises:
+            Exception: If saving or converting fails; the original error is
+                attached as ``__cause__``.
+        """
+        # Bound before the try block so the cleanup below never hits an
+        # unbound name when the failure happens before the temp path exists.
+        temp_docx = None
+        try:
+            output_dir = Path(output_dir) if output_dir else Path.cwd()
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            # NOTE: the timestamp has one-second resolution, so two calls into
+            # the same directory within one second share a temp file name.
+            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
+            doc.save(temp_docx)
+
+            # Return the path the converter actually wrote to.
+            return WordToPdfConverter.convert_to_pdf(temp_docx, temp_docx.with_suffix('.pdf'))
+
+        except Exception as e:
+            raise Exception(f"转换PDF失败: {str(e)}") from e
+        finally:
+            if temp_docx is not None and temp_docx.exists():
+                temp_docx.unlink()
--- a/crazy_functions/review_fns/conversation_doc/word_doc.py
+++ b/crazy_functions/review_fns/conversation_doc/word_doc.py
@@ -0,0 +1,246 @@
+import re
+from docx import Document
+from docx.shared import Cm, Pt
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
+from docx.enum.style import WD_STYLE_TYPE
+from docx.oxml.ns import qn
+from datetime import datetime
+import docx
+from docx.oxml import shared
+from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word
+
+
+class WordFormatter:
+    """Build a Word document holding one question/answer exchange and its references.
+
+    Page geometry, fonts and spacing are meant to follow the Chinese
+    official-document layout standard GB/T 9704-2012.
+    """
+
+    def __init__(self):
+        """Create an empty document, then apply the page setup and custom styles."""
+        self.doc = Document()
+        self._setup_document()
+        self._create_styles()
+
+    def _setup_document(self):
+        """Apply page size, margins and the running header to every section."""
+        for section in self.doc.sections:
+            # A4 page size
+            section.page_width = Cm(21)
+            section.page_height = Cm(29.7)
+            # Page margins
+            section.top_margin = Cm(3.7)  # top 37mm
+            section.bottom_margin = Cm(3.5)  # bottom 35mm
+            section.left_margin = Cm(2.8)  # left 28mm
+            section.right_margin = Cm(2.6)  # right 26mm
+            # Header / footer distance from the page edge
+            section.header_distance = Cm(2.0)
+            section.footer_distance = Cm(2.0)
+
+            # Centered running header
+            header_para = section.header.paragraphs[0]
+            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            header_run = header_para.add_run("GPT-Academic学术对话 (体验地址：https://auth.gpt-academic.top/)")
+            header_run.font.name = '仿宋'
+            # The East Asian font must be set separately for CJK text to use it.
+            header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+            header_run.font.size = Pt(9)
+
+    def _add_paragraph_style(self, name, font_name, size_pt, bold=False, **paragraph_format):
+        """Register one paragraph style and return it.
+
+        Args:
+            name: Style name to register in the document.
+            font_name: Font used for both the Latin and the East Asian slot.
+            size_pt: Font size in points.
+            bold: When true, the style's font is made bold; otherwise the
+                bold setting is left untouched.
+            **paragraph_format: Attribute/value pairs assigned verbatim onto
+                the style's ``paragraph_format``.
+        """
+        style = self.doc.styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
+        style.font.name = font_name
+        style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+        style.font.size = Pt(size_pt)
+        if bold:
+            style.font.bold = True
+        for attribute, value in paragraph_format.items():
+            setattr(style.paragraph_format, attribute, value)
+        return style
+
+    def _create_styles(self):
+        """Register the custom paragraph styles used by ``create_document``."""
+        one_and_half = WD_LINE_SPACING.ONE_POINT_FIVE
+
+        # Body text
+        self._add_paragraph_style(
+            'Normal_Custom', '仿宋', 12,
+            line_spacing_rule=one_and_half,
+            space_after=Pt(0),
+        )
+
+        # Question: 14pt bold, no left indent
+        self._add_paragraph_style(
+            'Question_Style', '黑体', 14, bold=True,
+            space_before=Pt(12),
+            space_after=Pt(6),
+            line_spacing_rule=one_and_half,
+            left_indent=Pt(0),
+        )
+
+        # Answer: 12pt, no left indent
+        self._add_paragraph_style(
+            'Answer_Style', '仿宋', 12,
+            space_before=Pt(6),
+            space_after=Pt(12),
+            line_spacing_rule=one_and_half,
+            left_indent=Pt(0),
+        )
+
+        # Document title: 22pt bold, centered
+        self._add_paragraph_style(
+            'Title_Custom', '黑体', 22, bold=True,
+            alignment=WD_PARAGRAPH_ALIGNMENT.CENTER,
+            space_before=Pt(0),
+            space_after=Pt(24),
+            line_spacing_rule=one_and_half,
+        )
+
+        # Reference entry: smaller font with a 21pt hanging indent
+        self._add_paragraph_style(
+            'Reference_Style', '宋体', 10.5,
+            space_before=Pt(3),
+            space_after=Pt(3),
+            line_spacing_rule=WD_LINE_SPACING.SINGLE,
+            left_indent=Pt(21),
+            first_line_indent=Pt(-21),
+        )
+
+        # Reference section heading: 16pt bold with extra space before
+        self._add_paragraph_style(
+            'Reference_Title_Style', '黑体', 16, bold=True,
+            space_before=Pt(24),
+            space_after=Pt(12),
+            line_spacing_rule=one_and_half,
+        )
+
+    def create_document(self, question: str, answer: str, ranked_papers: list = None):
+        """Write the title, date, question/answer pair and reference list.
+
+        Args:
+            question: The user's question.
+            answer: The AI answer; it is passed through
+                ``convert_markdown_to_word`` before being written.
+            ranked_papers: Optional list of paper objects for the reference
+                section. A paper that cannot be rendered is reported with
+                ``print`` and skipped.
+
+        Returns:
+            The python-docx document held by this formatter.
+
+        Raises:
+            Exception: Any error outside of a single reference entry is
+                printed (with traceback) and re-raised unchanged.
+        """
+        try:
+            self._add_title_and_date()
+            self.doc.add_paragraph()  # blank spacer line
+            self._add_question_and_answer(question, answer)
+            if ranked_papers:
+                self._add_references(ranked_papers)
+            return self.doc
+
+        except Exception as e:
+            print(f"Word文档创建失败: {str(e)}")
+            import traceback
+            print(f"详细错误信息: {traceback.format_exc()}")
+            raise
+
+    def _add_title_and_date(self):
+        """Add the document title followed by a centered date line."""
+        title_para = self.doc.add_paragraph(style='Title_Custom')
+        title_para.add_run('GPT-Academic 对话记录')
+
+        try:
+            date_para = self.doc.add_paragraph()
+            date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            # Build the date without routing the CJK characters through
+            # strftime, whose handling of non-ASCII format text is
+            # locale-dependent on some platforms. The output is the same as
+            # strftime('%Y年%m月%d日').
+            today = datetime.now()
+            date_run = date_para.add_run(f"{today.year:04d}年{today.month:02d}月{today.day:02d}日")
+            date_run.font.name = '仿宋'
+            date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+            date_run.font.size = Pt(16)
+        except Exception as e:
+            print(f"添加日期失败: {str(e)}")
+            raise
+
+    def _add_question_and_answer(self, question, answer):
+        """Add the question paragraph and the answer paragraph."""
+        try:
+            q_para = self.doc.add_paragraph(style='Question_Style')
+            q_para.add_run('问题：').bold = True
+            q_para.add_run(str(question))
+
+            a_para = self.doc.add_paragraph(style='Answer_Style')
+            a_para.add_run('回答：').bold = True
+            a_para.add_run(convert_markdown_to_word(str(answer)))
+        except Exception as e:
+            print(f"添加问答对话失败: {str(e)}")
+            raise
+
+    def _add_references(self, ranked_papers):
+        """Add the reference heading and one numbered entry per paper."""
+        try:
+            ref_title = self.doc.add_paragraph(style='Reference_Title_Style')
+            ref_title.add_run("参考文献")
+
+            for idx, paper in enumerate(ranked_papers, 1):
+                # Best effort: one broken entry must not abort the whole list.
+                try:
+                    self._add_reference(idx, paper)
+                except Exception as e:
+                    print(f"添加第 {idx} 篇参考文献失败: {str(e)}")
+        except Exception as e:
+            print(f"添加参考文献部分失败: {str(e)}")
+            raise
+
+    def _add_reference(self, idx, paper):
+        """Add a single reference entry: number, authors, title, venue, year, DOI."""
+        ref_para = self.doc.add_paragraph(style='Reference_Style')
+        ref_para.add_run(f'[{idx}] ').bold = True
+
+        authors = self._format_authors(getattr(paper, 'authors', None))
+        if authors:
+            ref_para.add_run(f'{authors}. ')
+
+        # The title is written exactly once: as an italic hyperlink when the
+        # paper has a URL, otherwise (or if linking fails) as a plain italic run.
+        url = getattr(paper, 'url', None)
+        linked = False
+        if url:
+            try:
+                self._add_hyperlink(ref_para, paper.title, url, italic=True)
+                linked = True
+            except Exception as e:
+                print(f"添加超链接失败: {str(e)}")
+        if not linked:
+            ref_para.add_run(paper.title).italic = True
+
+        # Journal / conference; fall back to the plain ``venue`` attribute
+        # when ``venue_name`` is missing or empty.
+        venue = getattr(paper, 'venue_name', None) or getattr(paper, 'venue', None)
+        if venue:
+            ref_para.add_run(f'. {venue}')
+
+        year = getattr(paper, 'year', None)
+        if year:
+            ref_para.add_run(f', {year}')
+
+        doi = getattr(paper, 'doi', None)
+        if doi:
+            ref_para.add_run('. ')
+            self._add_hyperlink(ref_para, f'DOI: {doi}', self._resolve_doi_url(doi, url))
+
+        ref_para.add_run('.')
+
+    @staticmethod
+    def _format_authors(authors, limit=3):
+        """Join up to *limit* author names with ', ', adding ' et al.' if more exist.
+
+        Returns an empty string when *authors* is empty or ``None``.
+        """
+        names = [str(author) for author in (authors or [])]
+        joined = ', '.join(names[:limit])
+        if len(names) > limit:
+            joined += ' et al.'
+        return joined
+
+    @staticmethod
+    def _resolve_doi_url(doi, url):
+        """Return the link target for a DOI.
+
+        The DOI value is used as-is when it already is an http(s) URL or when
+        the paper's URL mentions arXiv; otherwise it is prefixed with
+        ``https://doi.org/``. A missing URL is treated as "not arXiv".
+        """
+        doi = str(doi)
+        if doi.lower().startswith(('http://', 'https://')) or (url and 'arxiv' in url):
+            return doi
+        return f'https://doi.org/{doi}'
+
+    def _create_hyperlink_style(self):
+        """Return the 'Hyperlink' character style, creating it if it is missing."""
+        styles = self.doc.styles
+        if 'Hyperlink' not in styles:
+            hyperlink_style = styles.add_style('Hyperlink', WD_STYLE_TYPE.CHARACTER)
+            # Blue #0066CC; the colour must be an RGBColor, not a bare integer.
+            hyperlink_style.font.color.rgb = docx.shared.RGBColor(0x00, 0x66, 0xCC)
+            hyperlink_style.font.underline = True
+        return styles['Hyperlink']
+
+    def _add_hyperlink(self, paragraph, text, url, italic=False):
+        """Append an external hyperlink run to *paragraph*.
+
+        python-docx has no high-level hyperlink API here, so the
+        ``w:hyperlink`` element is assembled at the XML level.
+
+        Args:
+            paragraph: Paragraph the link is appended to.
+            text: Visible link text.
+            url: Link target, registered as an external relationship.
+            italic: When true, the link text is rendered in italics.
+        """
+        # Make sure the character style the run refers to actually exists.
+        style_id = self._create_hyperlink_style().style_id
+
+        part = paragraph.part
+        r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
+
+        make_element = docx.oxml.shared.OxmlElement
+
+        hyperlink = make_element('w:hyperlink')
+        hyperlink.set(qn('r:id'), r_id)
+
+        # Run properties: hyperlink style first, then the optional italic flag.
+        run_properties = make_element('w:rPr')
+        run_style = make_element('w:rStyle')
+        run_style.set(qn('w:val'), style_id)
+        run_properties.append(run_style)
+        if italic:
+            run_properties.append(make_element('w:i'))
+
+        text_element = make_element('w:t')
+        text_element.text = text
+        # Keep leading/trailing spaces of the link text (xml:space="preserve").
+        text_element.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
+
+        new_run = make_element('w:r')
+        new_run.append(run_properties)
+        new_run.append(text_element)
+        hyperlink.append(new_run)
+
+        # Appending last keeps the paragraph untouched if anything above fails.
+        paragraph._p.append(hyperlink)
+