add tex html formatter

2024-12-01 23:26:02 +08:00
parent 3beb22a347
commit 795a6a9333
4 changed files with 390 additions and 14 deletions
--- a/config.py
+++ b/config.py
@@ -39,7 +39,7 @@ AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-p
                    "gemini-1.5-pro", "chatglm3"
                    ]

-EMBEDDING_MODEL = "netease-youdao/bce-embedding-base_v1"
+EMBEDDING_MODEL = "text-embedding-3-small"

 # --- --- --- ---
 # P.S. 其他可用的模型还包括
--- a/crazy_functions/Arxiv_论文对话.py
+++ b/crazy_functions/Arxiv_论文对话.py
@@ -414,6 +414,7 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:
        web_port: Web端口
    """
    # 初始化时，提示用户需要 arxiv ID/URL
+    from toolbox import promote_file_to_downloadzone
    if len(history) == 0 and not txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')):
        chatbot.append((txt, "请先提供Arxiv论文链接或ID。"))
        yield from update_ui(chatbot=chatbot, history=history)
@@ -421,14 +422,16 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:

    user_name = chatbot.get_user()
    arxiv_worker = ArxivRagWorker(user_name, llm_kwargs, arxiv_id=txt)
+    arxiv_id = arxiv_worker.arxiv_id

    # 处理新论文的情况
    if txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')) and not arxiv_worker.loading:
        chatbot.append((txt, "正在处理论文，请稍等..."))
        yield from update_ui(chatbot=chatbot, history=history)
-        arxiv_id = arxiv_worker.arxiv_id
-        fragments, formatted_content, output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter,                                                                  arxiv_worker.arxiv_id)
-        chatbot.append(["论文下载成功，接下来将编码论文，预计等待两分钟，请耐心等待，论文内容如下：", formatted_content])
+        fragments, formatted_content, fragment_output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
+        promote_file_to_downloadzone(fragment_output_dir, chatbot=chatbot)
+        chatbot.append(["论文下载成功，接下来将编码论文，预计等待两分钟，请耐心等待，等待过程中，可以查看论文：", formatted_content])
+        yield from update_ui(chatbot=chatbot, history=history)
        try:
            # 创建新的事件循环
            loop = asyncio.new_event_loop()
@@ -471,8 +474,15 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:
    # 获取用户询问指令
    user_query = plugin_kwargs.get("advanced_arg",
                                   "What is the main research question or problem addressed in this paper?")
+    if len(history)<2:
+        fragments, formatted_content, fragment_output_files = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
+        for file in fragment_output_files:
+            promote_file_to_downloadzone(file, chatbot=chatbot)
+        chatbot.append(["论文的文字内容为：", formatted_content])
+        chatbot.append(["处理完成", f"论文文字内容已保存至下载区"])
+        yield from update_ui(chatbot=chatbot, history=history)
    if not user_query:
-        user_query = "What is the main research question or problem addressed in this paper about graph attention network?"
+        user_query = "What is the main research question or problem addressed in this paper?"
        # chatbot.append((txt, "请提供您的问题。"))
        # yield from update_ui(chatbot=chatbot, history=history)
        # return
--- a/crazy_functions/doc_fns/tex_html_formatter.py
+++ b/crazy_functions/doc_fns/tex_html_formatter.py
@@ -0,0 +1,354 @@
+from pathlib import Path
+from typing import List, Dict
+from dataclasses import dataclass
+from datetime import datetime
+import os
+import re
+
+
+@dataclass
+class SectionFragment:
+    """One fragment of an arXiv paper together with the paper-level metadata."""
+    title: str  # paper title
+    authors: str  # author line
+    abstract: str  # paper abstract
+    catalogs: str  # document structure (outline) text
+    arxiv_id: str = ""  # arXiv identifier; left out of the page when empty
+    current_section: str = "Introduction"  # section the fragment is grouped under
+    content: str = ""  # body text of the fragment
+    bibliography: str = ""  # reference text attached to the fragment
+
+
+class PaperHtmlFormatter:
+    """HTML格式论文文档生成器"""
+
+    def __init__(self, fragments: List[SectionFragment], output_dir: Path) -> None:
+        self.fragments = fragments  # rendered in list order; fragments[0] supplies the paper header
+        self.output_dir = output_dir  # directory save_html() writes the generated .html file into
+        self.css_styles = """
+        :root {
+            --primary-color: #1a73e8;
+            --secondary-color: #34495e;
+            --background-color: #f8f9fa;
+            --text-color: #2c3e50;
+            --border-color: #e0e0e0;
+            --code-bg-color: #f6f8fa;
+        }
+
+        body {
+            font-family: "Source Serif Pro", "Times New Roman", serif;
+            line-height: 1.8;
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 2rem;
+            color: var(--text-color);
+            background-color: var(--background-color);
+            font-size: 16px;
+        }
+
+        .container {
+            background: white;
+            padding: 2rem;
+            border-radius: 8px;
+            box-shadow: 0 2px 12px rgba(0,0,0,0.1);
+        }
+
+        h1 {
+            color: var(--primary-color);
+            font-size: 2.2em;
+            text-align: center;
+            margin: 1.5rem 0;
+            padding-bottom: 1rem;
+            border-bottom: 3px solid var(--primary-color);
+        }
+
+        h2 {
+            color: var(--secondary-color);
+            font-size: 1.8em;
+            margin-top: 2rem;
+            padding-left: 1rem;
+            border-left: 4px solid var(--primary-color);
+        }
+
+        h3 {
+            color: var(--text-color);
+            font-size: 1.5em;
+            margin-top: 1.5rem;
+            border-bottom: 2px solid var(--border-color);
+            padding-bottom: 0.5rem;
+        }
+
+        .authors {
+            text-align: center;
+            color: var(--secondary-color);
+            font-size: 1.1em;
+            margin: 1rem 0 2rem;
+        }
+
+        .abstract-container {
+            background: var(--background-color);
+            padding: 1.5rem;
+            border-radius: 6px;
+            margin: 2rem 0;
+        }
+
+        .abstract-title {
+            font-weight: bold;
+            color: var(--primary-color);
+            margin-bottom: 1rem;
+        }
+
+        .abstract-content {
+            font-style: italic;
+            line-height: 1.7;
+        }
+
+        .toc {
+            background: white;
+            padding: 1.5rem;
+            border-radius: 6px;
+            margin: 2rem 0;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.05);
+        }
+
+        .toc-title {
+            color: var(--primary-color);
+            font-size: 1.4em;
+            margin-bottom: 1rem;
+        }
+
+        .section-content {
+            background: white;
+            padding: 1.5rem;
+            border-radius: 6px;
+            margin: 1.5rem 0;
+            box-shadow: 0 1px 3px rgba(0,0,0,0.05);
+        }
+
+        .fragment {
+            margin: 2rem 0;
+            padding-left: 1rem;
+            border-left: 3px solid var(--border-color);
+        }
+
+        .fragment:hover {
+            border-left-color: var(--primary-color);
+        }
+
+        .bibliography {
+            background: var(--code-bg-color);
+            padding: 1rem;
+            border-radius: 4px;
+            font-family: "Source Code Pro", monospace;
+            font-size: 0.9em;
+            white-space: pre-wrap;
+            margin-top: 1rem;
+        }
+
+        pre {
+            background: var(--code-bg-color);
+            padding: 1rem;
+            border-radius: 4px;
+            overflow-x: auto;
+            font-family: "Source Code Pro", monospace;
+        }
+
+        .paper-info {
+            background: white;
+            padding: 2rem;
+            border-radius: 8px;
+            margin: 2rem 0;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        }
+
+        .arxiv-id {
+            text-align: center;
+            color: #666;
+            font-size: 0.9em;
+            margin: 1rem 0;
+        }
+
+        .section-title {
+            display: flex;
+            align-items: center;
+            gap: 0.5rem;
+            color: var(--secondary-color);
+        }
+
+        .section-icon {
+            color: var(--primary-color);
+        }
+
+        @media print {
+            body {
+                background: white;
+            }
+            .container {
+                box-shadow: none;
+            }
+        }
+        """
+
+    def _sanitize_html(self, text: str) -> str:
+        """Escape the characters that are special in HTML text and attribute values.
+
+        Args:
+            text: Raw string to embed in the page; may be empty or None.
+
+        Returns:
+            ``text`` with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by their
+            entities, or ``""`` when ``text`` is falsy.
+        """
+        if not text:
+            return ""
+
+        # Single pass over the input, so the "&" of an inserted entity is never re-escaped.
+        entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}
+        return text.translate(str.maketrans(entities))
+
+    def _create_section_id(self, section: str) -> str:
+        """Return a lowercase, hyphen-separated anchor id for ``section`` (never empty)."""
+        # Drop punctuation, then turn every whitespace run (tabs/newlines too) into one
+        # hyphen: whitespace is not allowed inside an HTML id or a URL fragment.
+        slug = re.sub(r'\s+', '-', re.sub(r'[^\w\s-]', '', (section or '').lower()).strip())
+        return slug or "uncategorized"
+
+    def format_paper_info(self) -> str:
+        """Render the paper-level header block from the first fragment.
+
+        Returns:
+            The ``<div class="paper-info">`` markup with the title, arXiv id,
+            authors, abstract and document structure (each emitted only when
+            the field is non-empty), or ``""`` when there are no fragments.
+        """
+        if not self.fragments:
+            return ""
+
+        head = self.fragments[0]
+        escape = self._sanitize_html
+        parts = ['<div class="paper-info">']
+
+        # Single-line fields, in display order: title, arXiv id, authors.
+        if head.title:
+            parts.append(f'<h1>{escape(head.title)}</h1>')
+        if head.arxiv_id:
+            parts.append(f'<div class="arxiv-id">arXiv: {escape(head.arxiv_id)}</div>')
+        if head.authors:
+            parts.append(f'<div class="authors">{escape(head.authors)}</div>')
+
+        if head.abstract:
+            parts += [
+                '<div class="abstract-container">',
+                '<div class="abstract-title">Abstract</div>',
+                f'<div class="abstract-content">{escape(head.abstract)}</div>',
+                '</div>',
+            ]
+
+        # The outline is placed inside <pre> so its whitespace is shown as-is.
+        if head.catalogs:
+            parts += ['<h2>Document Structure</h2>', '<pre>', escape(head.catalogs), '</pre>']
+
+        return '\n'.join(parts + ['</div>'])
+
+    def format_table_of_contents(self, sections: Dict[str, List[SectionFragment]]) -> str:
+        """Build the "Table of Contents" block linking to every section heading.
+
+        Only the keys of ``sections`` are read; links follow the dict's order.
+        """
+        links = [
+            f'<div><a href="#{self._create_section_id(name)}">'
+            f'{self._sanitize_html(name.strip() or "Uncategorized")} </a></div>'
+            for name in sections
+        ]
+        return '\n'.join([
+            '<div class="toc">', '<div class="toc-title">Table of Contents</div>', '<nav>',
+            *links,
+            '</nav>', '</div>',
+        ])
+
+    def format_sections(self) -> str:
+        """Render the table of contents followed by every section of the paper.
+
+        Fragments are grouped by ``current_section`` ("Uncategorized" when it is
+        empty). Sections appear in first-seen order and fragments keep their
+        list order inside a section.
+
+        Returns:
+            The ``<div class="content">`` markup as one newline-joined string.
+        """
+        escape = self._sanitize_html
+
+        # A plain dict keeps insertion order, which gives the first-seen section order.
+        grouped = {}
+        for piece in self.fragments:
+            grouped.setdefault(piece.current_section or "Uncategorized", []).append(piece)
+
+        html = ['<div class="content">', self.format_table_of_contents(grouped)]
+
+        for name, pieces in grouped.items():
+            # The id comes from the same helper the table of contents uses for its links.
+            html += [
+                f'<h2 id="{self._create_section_id(name)}">',
+                '<span class="section-title">',
+                '<span class="section-icon">§</span>',
+                escape(name),
+                '</span>',
+                '</h2>',
+                '<div class="section-content">',
+            ]
+
+            # Empty content / bibliography are skipped; the fragment wrapper is always emitted.
+            for piece in pieces:
+                html.append('<div class="fragment">')
+                if piece.content:
+                    html.append(f'<div class="fragment-content">{escape(piece.content)}</div>')
+                if piece.bibliography:
+                    html += ['<div class="bibliography">', escape(piece.bibliography), '</div>']
+                html.append('</div>')  # closes .fragment
+
+            html.append('</div>')  # closes .section-content
+
+        html.append('</div>')  # closes .content
+
+        return '\n'.join(html)
+
+    def save_html(self) -> Path:
+        """Write the paper as one standalone HTML file and return its path.
+
+        ``output_dir`` is created (with parents) when missing; any error is printed and re-raised.
+        """
+        try:
+            # Create the directory up front so saving works even if nothing has written there yet.
+            target_dir = Path(self.output_dir)
+            target_dir.mkdir(parents=True, exist_ok=True)
+            file_path = target_dir / f"paper_content_{datetime.now():%Y%m%d_%H%M%S}.html"
+            html_content = f"""
+            <!DOCTYPE html>
+            <html lang="en">
+            <head>
+                <meta charset="utf-8">
+                <meta name="viewport" content="width=device-width, initial-scale=1">
+                <title>{self._sanitize_html(self.fragments[0].title if self.fragments else 'Paper Content')}</title>
+                <style>
+                {self.css_styles}
+                </style>
+            </head>
+            <body>
+                <div class="container">
+                    {self.format_paper_info()}
+                    {self.format_sections()}
+                </div>
+            </body>
+            </html>
+            """
+            file_path.write_text(html_content, encoding="utf-8")
+            print(f"HTML document saved to: {file_path}")
+            return file_path
+        except Exception as e:
+            # Report and re-raise: the caller decides whether a failed export is fatal.
+            print(f"Error saving HTML document: {str(e)}")
+            raise
+
+# 使用示例：
+# formatter = PaperHtmlFormatter(fragments, output_dir)
+# output_path = formatter.save_html()
--- a/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py
+++ b/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py
@@ -734,7 +734,7 @@ class ArxivSplitter:
        return content.strip()


-def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, Path]:
+def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, List[Path]]:
    """
    同步处理 ArXiv 文档并返回分割后的片段

@@ -746,19 +746,24 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec
        list: 分割后的文档片段列表
    """
    try:
+        from crazy_functions.doc_fns.tex_html_formatter import PaperHtmlFormatter
        # 创建一个异步函数来执行异步操作
        async def _process():
            return await splitter.process(arxiv_id)

        # 使用 asyncio.run() 运行异步函数
+        output_files=[]
        fragments = asyncio.run(_process())
-
+        file_save_path = splitter.root_dir / "arxiv_fragments"
        # 保存片段到文件
-        output_dir = save_fragments_to_file(
+        try:
+            md_output_dir = save_fragments_to_file(
                fragments,
-            output_dir=splitter.root_dir / "arxiv_fragments"
+                output_dir = file_save_path
            )
-        print(f"Output saved to: {output_dir}")
+            output_files.append(md_output_dir)
+        except:
+            pass
        # 创建论文格式化器
        formatter = PaperContentFormatter()

@@ -775,7 +780,14 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec

        # 格式化内容
        formatted_content = formatter.format(fragments, metadata)
-        return fragments, formatted_content, output_dir
+
+        try:
+            html_formatter = PaperHtmlFormatter(fragments, file_save_path)
+            html_output_dir = html_formatter.save_html()
+            output_files.append(html_output_dir)
+        except:
+            pass
+        return fragments, formatted_content, output_files

    except Exception as e:
        print(f"✗ Processing failed for {arxiv_id}: {str(e)}")
@@ -821,4 +833,4 @@ def test_arxiv_splitter():


 if __name__ == "__main__":
-    asyncio.run(test_arxiv_splitter())
+    test_arxiv_splitter()