add tex html formatter

This commit is contained in:
lbykkkk
2024-12-01 23:26:02 +08:00
parent 3beb22a347
commit 795a6a9333
4 changed files with 390 additions and 14 deletions

View File

@@ -39,7 +39,7 @@ AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-p
"gemini-1.5-pro", "chatglm3" "gemini-1.5-pro", "chatglm3"
] ]
EMBEDDING_MODEL = "netease-youdao/bce-embedding-base_v1" EMBEDDING_MODEL = "text-embedding-3-small"
# --- --- --- --- # --- --- --- ---
# P.S. 其他可用的模型还包括 # P.S. 其他可用的模型还包括

View File

@@ -414,6 +414,7 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:
web_port: Web端口 web_port: Web端口
""" """
# 初始化时,提示用户需要 arxiv ID/URL # 初始化时,提示用户需要 arxiv ID/URL
from toolbox import promote_file_to_downloadzone
if len(history) == 0 and not txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')): if len(history) == 0 and not txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')):
chatbot.append((txt, "请先提供Arxiv论文链接或ID。")) chatbot.append((txt, "请先提供Arxiv论文链接或ID。"))
yield from update_ui(chatbot=chatbot, history=history) yield from update_ui(chatbot=chatbot, history=history)
@@ -421,14 +422,16 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:
user_name = chatbot.get_user() user_name = chatbot.get_user()
arxiv_worker = ArxivRagWorker(user_name, llm_kwargs, arxiv_id=txt) arxiv_worker = ArxivRagWorker(user_name, llm_kwargs, arxiv_id=txt)
arxiv_id = arxiv_worker.arxiv_id
# 处理新论文的情况 # 处理新论文的情况
if txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')) and not arxiv_worker.loading: if txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')) and not arxiv_worker.loading:
chatbot.append((txt, "正在处理论文,请稍等...")) chatbot.append((txt, "正在处理论文,请稍等..."))
yield from update_ui(chatbot=chatbot, history=history) yield from update_ui(chatbot=chatbot, history=history)
arxiv_id = arxiv_worker.arxiv_id fragments, formatted_content, fragment_output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
fragments, formatted_content, output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_worker.arxiv_id) promote_file_to_downloadzone(fragment_output_dir, chatbot=chatbot)
chatbot.append(["论文下载成功,接下来将编码论文,预计等待两分钟,请耐心等待,论文内容如下", formatted_content]) chatbot.append(["论文下载成功,接下来将编码论文,预计等待两分钟,请耐心等待,等待过程中,可以查看论文:", formatted_content])
yield from update_ui(chatbot=chatbot, history=history)
try: try:
# 创建新的事件循环 # 创建新的事件循环
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
@@ -471,8 +474,15 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot:
# 获取用户询问指令 # 获取用户询问指令
user_query = plugin_kwargs.get("advanced_arg", user_query = plugin_kwargs.get("advanced_arg",
"What is the main research question or problem addressed in this paper?") "What is the main research question or problem addressed in this paper?")
if len(history)<2:
fragments, formatted_content, fragment_output_files = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
for file in fragment_output_files:
promote_file_to_downloadzone(file, chatbot=chatbot)
chatbot.append(["论文的文字内容为:", formatted_content])
chatbot.append(["处理完成", f"论文文字内容已保存至下载区"])
yield from update_ui(chatbot=chatbot, history=history)
if not user_query: if not user_query:
user_query = "What is the main research question or problem addressed in this paper about graph attention network?" user_query = "What is the main research question or problem addressed in this paper?"
# chatbot.append((txt, "请提供您的问题。")) # chatbot.append((txt, "请提供您的问题。"))
# yield from update_ui(chatbot=chatbot, history=history) # yield from update_ui(chatbot=chatbot, history=history)
# return # return

View File

@@ -0,0 +1,354 @@
from pathlib import Path
from typing import List, Dict
from dataclasses import dataclass
from datetime import datetime
import os
import re
@dataclass
class SectionFragment:
"""Arxiv论文片段数据类"""
title: str
authors: str
abstract: str
catalogs: str
arxiv_id: str = ""
current_section: str = "Introduction"
content: str = ''
bibliography: str = ''
class PaperHtmlFormatter:
"""HTML格式论文文档生成器"""
def __init__(self, fragments: List[SectionFragment], output_dir: Path):
self.fragments = fragments
self.output_dir = output_dir
self.css_styles = """
:root {
--primary-color: #1a73e8;
--secondary-color: #34495e;
--background-color: #f8f9fa;
--text-color: #2c3e50;
--border-color: #e0e0e0;
--code-bg-color: #f6f8fa;
}
body {
font-family: "Source Serif Pro", "Times New Roman", serif;
line-height: 1.8;
max-width: 1000px;
margin: 0 auto;
padding: 2rem;
color: var(--text-color);
background-color: var(--background-color);
font-size: 16px;
}
.container {
background: white;
padding: 2rem;
border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,0.1);
}
h1 {
color: var(--primary-color);
font-size: 2.2em;
text-align: center;
margin: 1.5rem 0;
padding-bottom: 1rem;
border-bottom: 3px solid var(--primary-color);
}
h2 {
color: var(--secondary-color);
font-size: 1.8em;
margin-top: 2rem;
padding-left: 1rem;
border-left: 4px solid var(--primary-color);
}
h3 {
color: var(--text-color);
font-size: 1.5em;
margin-top: 1.5rem;
border-bottom: 2px solid var(--border-color);
padding-bottom: 0.5rem;
}
.authors {
text-align: center;
color: var(--secondary-color);
font-size: 1.1em;
margin: 1rem 0 2rem;
}
.abstract-container {
background: var(--background-color);
padding: 1.5rem;
border-radius: 6px;
margin: 2rem 0;
}
.abstract-title {
font-weight: bold;
color: var(--primary-color);
margin-bottom: 1rem;
}
.abstract-content {
font-style: italic;
line-height: 1.7;
}
.toc {
background: white;
padding: 1.5rem;
border-radius: 6px;
margin: 2rem 0;
box-shadow: 0 2px 8px rgba(0,0,0,0.05);
}
.toc-title {
color: var(--primary-color);
font-size: 1.4em;
margin-bottom: 1rem;
}
.section-content {
background: white;
padding: 1.5rem;
border-radius: 6px;
margin: 1.5rem 0;
box-shadow: 0 1px 3px rgba(0,0,0,0.05);
}
.fragment {
margin: 2rem 0;
padding-left: 1rem;
border-left: 3px solid var(--border-color);
}
.fragment:hover {
border-left-color: var(--primary-color);
}
.bibliography {
background: var(--code-bg-color);
padding: 1rem;
border-radius: 4px;
font-family: "Source Code Pro", monospace;
font-size: 0.9em;
white-space: pre-wrap;
margin-top: 1rem;
}
pre {
background: var(--code-bg-color);
padding: 1rem;
border-radius: 4px;
overflow-x: auto;
font-family: "Source Code Pro", monospace;
}
.paper-info {
background: white;
padding: 2rem;
border-radius: 8px;
margin: 2rem 0;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.arxiv-id {
text-align: center;
color: #666;
font-size: 0.9em;
margin: 1rem 0;
}
.section-title {
display: flex;
align-items: center;
gap: 0.5rem;
color: var(--secondary-color);
}
.section-icon {
color: var(--primary-color);
}
@media print {
body {
background: white;
}
.container {
box-shadow: none;
}
}
"""
def _sanitize_html(self, text: str) -> str:
"""清理HTML特殊字符"""
if not text:
return ""
replacements = {
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
'"': "&quot;",
"'": "&#39;"
}
for old, new in replacements.items():
text = text.replace(old, new)
return text
def _create_section_id(self, section: str) -> str:
"""创建section的ID"""
section = section.strip() or "uncategorized"
# 移除特殊字符,转换为小写并用连字符替换空格
section_id = re.sub(r'[^\w\s-]', '', section.lower())
return section_id.replace(' ', '-')
def format_paper_info(self) -> str:
"""格式化论文基本信息"""
if not self.fragments:
return ""
first_fragment = self.fragments[0]
paper_info = ['<div class="paper-info">']
# 添加标题
if first_fragment.title:
paper_info.append(f'<h1>{self._sanitize_html(first_fragment.title)}</h1>')
# 添加arXiv ID
if first_fragment.arxiv_id:
paper_info.append(f'<div class="arxiv-id">arXiv: {self._sanitize_html(first_fragment.arxiv_id)}</div>')
# 添加作者
if first_fragment.authors:
paper_info.append(f'<div class="authors">{self._sanitize_html(first_fragment.authors)}</div>')
# 添加摘要
if first_fragment.abstract:
paper_info.append('<div class="abstract-container">')
paper_info.append('<div class="abstract-title">Abstract</div>')
paper_info.append(f'<div class="abstract-content">{self._sanitize_html(first_fragment.abstract)}</div>')
paper_info.append('</div>')
# 添加目录结构
if first_fragment.catalogs:
paper_info.append('<h2>Document Structure</h2>')
paper_info.append('<pre>')
paper_info.append(self._sanitize_html(first_fragment.catalogs))
paper_info.append('</pre>')
paper_info.append('</div>')
return '\n'.join(paper_info)
def format_table_of_contents(self, sections: Dict[str, List[SectionFragment]]) -> str:
"""生成目录"""
toc = ['<div class="toc">']
toc.append('<div class="toc-title">Table of Contents</div>')
toc.append('<nav>')
for section in sections:
section_id = self._create_section_id(section)
clean_section = section.strip() or "Uncategorized"
toc.append(f'<div><a href="#{section_id}">{self._sanitize_html(clean_section)} '
f'</a></div>')
toc.append('</nav>')
toc.append('</div>')
return '\n'.join(toc)
def format_sections(self) -> str:
"""格式化论文各部分内容"""
sections = {}
for fragment in self.fragments:
section = fragment.current_section or "Uncategorized"
if section not in sections:
sections[section] = []
sections[section].append(fragment)
formatted_html = ['<div class="content">']
formatted_html.append(self.format_table_of_contents(sections))
# 生成各部分内容
for section, fragments in sections.items():
section_id = self._create_section_id(section)
formatted_html.append(f'<h2 id="{section_id}">')
formatted_html.append(f'<span class="section-title">')
formatted_html.append(f'<span class="section-icon">§</span>')
formatted_html.append(f'{self._sanitize_html(section)}')
formatted_html.append('</span>')
formatted_html.append('</h2>')
formatted_html.append('<div class="section-content">')
for i, fragment in enumerate(fragments, 1):
formatted_html.append('<div class="fragment">')
# 添加内容
if fragment.content:
formatted_html.append(
f'<div class="fragment-content">{self._sanitize_html(fragment.content)}</div>'
)
# 添加参考文献
if fragment.bibliography:
formatted_html.append('<div class="bibliography">')
formatted_html.append(f'{self._sanitize_html(fragment.bibliography)}')
formatted_html.append('</div>')
formatted_html.append('</div>')
formatted_html.append('</div>')
formatted_html.append('</div>')
return '\n'.join(formatted_html)
def save_html(self) -> Path:
"""保存HTML文档"""
try:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"paper_content_{timestamp}.html"
file_path = self.output_dir / filename
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{self._sanitize_html(self.fragments[0].title if self.fragments else 'Paper Content')}</title>
<style>
{self.css_styles}
</style>
</head>
<body>
<div class="container">
{self.format_paper_info()}
{self.format_sections()}
</div>
</body>
</html>
"""
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"HTML document saved to: {file_path}")
return file_path
except Exception as e:
print(f"Error saving HTML document: {str(e)}")
raise
# 使用示例:
# formatter = PaperHtmlFormatter(fragments, output_dir)
# output_path = formatter.save_html()

View File

@@ -734,7 +734,7 @@ class ArxivSplitter:
return content.strip() return content.strip()
def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, Path]: def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, List[Path]]:
""" """
同步处理 ArXiv 文档并返回分割后的片段 同步处理 ArXiv 文档并返回分割后的片段
@@ -746,19 +746,24 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec
list: 分割后的文档片段列表 list: 分割后的文档片段列表
""" """
try: try:
from crazy_functions.doc_fns.tex_html_formatter import PaperHtmlFormatter
# 创建一个异步函数来执行异步操作 # 创建一个异步函数来执行异步操作
async def _process(): async def _process():
return await splitter.process(arxiv_id) return await splitter.process(arxiv_id)
# 使用 asyncio.run() 运行异步函数 # 使用 asyncio.run() 运行异步函数
output_files=[]
fragments = asyncio.run(_process()) fragments = asyncio.run(_process())
file_save_path = splitter.root_dir / "arxiv_fragments"
# 保存片段到文件 # 保存片段到文件
output_dir = save_fragments_to_file( try:
fragments, md_output_dir = save_fragments_to_file(
output_dir=splitter.root_dir / "arxiv_fragments" fragments,
) output_dir = file_save_path
print(f"Output saved to: {output_dir}") )
output_files.append(md_output_dir)
except:
pass
# 创建论文格式化器 # 创建论文格式化器
formatter = PaperContentFormatter() formatter = PaperContentFormatter()
@@ -775,7 +780,14 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec
# 格式化内容 # 格式化内容
formatted_content = formatter.format(fragments, metadata) formatted_content = formatter.format(fragments, metadata)
return fragments, formatted_content, output_dir
try:
html_formatter = PaperHtmlFormatter(fragments, file_save_path)
html_output_dir = html_formatter.save_html()
output_files.append(html_output_dir)
except:
pass
return fragments, formatted_content, output_files
except Exception as e: except Exception as e:
print(f"✗ Processing failed for {arxiv_id}: {str(e)}") print(f"✗ Processing failed for {arxiv_id}: {str(e)}")
@@ -821,4 +833,4 @@ def test_arxiv_splitter():
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(test_arxiv_splitter()) test_arxiv_splitter()