Files
gpt_academic/crazy_functions/paper_fns/file2file_doc/html_doc.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

300 lines
9.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

class HtmlFormatter:
    """HTML document generator that preserves the source document's structure.

    The formatter converts Markdown-style text to an HTML fragment and wraps
    it in a self-contained, styled HTML page.

    Attributes:
        processing_type: Label describing how the document was processed; it
            is HTML-escaped and shown in the page header.
        css_styles: Stylesheet text embedded in the page's ``<style>`` tag.
    """

    def __init__(self, processing_type="文本处理"):
        """Store the processing label and build the embedded stylesheet.

        Args:
            processing_type: Label shown in the generated page header.
        """
        self.processing_type = processing_type
        self.css_styles = """
        :root {
            --primary-color: #2563eb;
            --primary-light: #eff6ff;
            --secondary-color: #1e293b;
            --background-color: #f8fafc;
            --text-color: #334155;
            --border-color: #e2e8f0;
            --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
        }
        body {
            font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.8;
            margin: 0;
            padding: 2rem;
            color: var(--text-color);
            background-color: var(--background-color);
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 2rem;
            border-radius: 16px;
            box-shadow: var(--card-shadow);
        }
        ::selection {
            background: var(--primary-light);
            color: var(--primary-color);
        }
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(20px); }
            to { opacity: 1; transform: translateY(0); }
        }
        .container {
            animation: fadeIn 0.6s ease-out;
        }
        .document-title {
            color: var(--primary-color);
            font-size: 2em;
            text-align: center;
            margin: 1rem 0 2rem;
            padding-bottom: 1rem;
            border-bottom: 2px solid var(--primary-color);
        }
        .document-body {
            display: flex;
            flex-direction: column;
            gap: 1.5rem;
            margin: 2rem 0;
        }
        .document-header {
            display: flex;
            flex-direction: column;
            align-items: center;
            margin-bottom: 2rem;
        }
        .processing-type {
            color: var(--secondary-color);
            font-size: 1.2em;
            margin: 0.5rem 0;
        }
        .processing-date {
            color: var(--text-color);
            font-size: 0.9em;
            opacity: 0.8;
        }
        .document-content {
            background: white;
            padding: 1.5rem;
            border-radius: 8px;
            border-left: 4px solid var(--primary-color);
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }
        /* 保留文档结构的样式 */
        h1, h2, h3, h4, h5, h6 {
            color: var(--secondary-color);
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }
        h1 { font-size: 1.8em; }
        h2 { font-size: 1.5em; }
        h3 { font-size: 1.3em; }
        h4 { font-size: 1.1em; }
        p {
            margin: 0.8em 0;
        }
        ul, ol {
            margin: 1em 0;
            padding-left: 2em;
        }
        li {
            margin: 0.5em 0;
        }
        blockquote {
            margin: 1em 0;
            padding: 0.5em 1em;
            border-left: 4px solid var(--primary-light);
            background: rgba(0,0,0,0.02);
        }
        code {
            font-family: monospace;
            background: rgba(0,0,0,0.05);
            padding: 0.2em 0.4em;
            border-radius: 3px;
        }
        pre {
            background: rgba(0,0,0,0.05);
            padding: 1em;
            border-radius: 5px;
            overflow-x: auto;
        }
        pre code {
            background: transparent;
            padding: 0;
        }
        @media (prefers-color-scheme: dark) {
            :root {
                --background-color: #0f172a;
                --text-color: #e2e8f0;
                --border-color: #1e293b;
            }
            .container, .document-content {
                background: #1e293b;
            }
            blockquote {
                background: rgba(255,255,255,0.05);
            }
            code, pre {
                background: rgba(255,255,255,0.05);
            }
        }
        """

    def _escape_html(self, text):
        """Return ``text`` with HTML special characters escaped."""
        import html

        return html.escape(text)

    def _markdown_to_html(self, text):
        """Convert Markdown text to an HTML fragment, keeping its structure.

        Uses the Python-Markdown package when it can be imported; otherwise
        falls back to the small built-in converter.

        Args:
            text: Markdown source.

        Returns:
            The HTML fragment as a string.
        """
        try:
            import markdown

            # The call stays inside the ``try`` so that an ImportError raised
            # while Python-Markdown loads something also triggers the fallback,
            # exactly as before.
            return markdown.markdown(
                text,
                extensions=['tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', 'smarty', 'extra'],
            )
        except ImportError:
            return self._fallback_markdown_to_html(text)

    def _fallback_markdown_to_html(self, text):
        """Convert a small Markdown subset to HTML without third-party packages.

        Handled constructs: ``#`` to ``######`` headings, flat numbered lists
        (``1. item``), flat bullet lists (``•``, ``*`` or ``-`` markers) and
        blank-line separated paragraphs. Nested lists and inline markup are
        not interpreted, and the text is not HTML-escaped.

        Args:
            text: Markdown source.

        Returns:
            The HTML fragment as a string.
        """
        import re

        # Headings: the number of leading '#' characters selects the level.
        def heading_to_html(match):
            level = len(match.group(1))
            return f'<h{level}>{match.group(2)}</h{level}>'

        text = re.sub(r'^(#{1,6}) (.+)$', heading_to_html, text, flags=re.MULTILINE)

        numbered_item = re.compile(r'^(\d+)\.\s+(.+)$')
        bullet_item = re.compile(r'^[•\*-]\s+(.+)$')

        output_lines = []
        list_html = []       # tags of the list currently being assembled
        list_type = None     # 'ol', 'ul', or None when outside a list
        pending_blanks = []  # blank lines seen after the most recent list item

        for line in text.split('\n'):
            numbered = numbered_item.match(line)
            bullet = None if numbered else bullet_item.match(line)
            if numbered:
                kind, item = 'ol', numbered.group(2)
            elif bullet:
                kind, item = 'ul', bullet.group(1)
            else:
                kind, item = None, None

            # A blank line inside a list may only be separating two items of
            # the same list, so hold it back until the next line is known.
            if kind is None and list_type is not None and not line.strip():
                pending_blanks.append(line)
                continue

            # The open list ends at any non-list line, or at a list item of the
            # other type that is separated from it by blank lines.
            list_ended = list_type is not None and (
                kind is None or (pending_blanks and kind != list_type)
            )
            if list_ended:
                list_html.append(f'</{list_type}>')
                output_lines.append(''.join(list_html))
                output_lines.extend(pending_blanks)
                list_html = []
                list_type = None
            # Blank lines between items of the same list are dropped so the
            # items stay in a single <ol>/<ul>.
            pending_blanks = []

            if kind is None:
                output_lines.append(line)
                continue

            if kind != list_type:
                if list_type is not None:
                    # Directly adjacent list of the other type: close the
                    # previous one and open the new one in the same buffer.
                    list_html.append(f'</{list_type}>')
                list_html.append(f'<{kind}>')
                list_type = kind
            list_html.append(f'<li>{item}</li>')

        # Close a list that runs to the end of the text.
        if list_type is not None:
            list_html.append(f'</{list_type}>')
            output_lines.append(''.join(list_html))
            output_lines.extend(pending_blanks)

        text = '\n'.join(output_lines)

        # Wrap plain paragraphs in <p>, leaving chunks that are already HTML
        # (headings, lists) and empty chunks untouched.
        paragraphs = text.split('\n\n')
        for index, paragraph in enumerate(paragraphs):
            stripped = paragraph.strip()
            already_html = stripped.startswith('<') and stripped.endswith('>')
            if stripped and not already_html:
                paragraphs[index] = f'<p>{paragraph}</p>'
        return '\n'.join(paragraphs)

    def create_document(self, content: str) -> str:
        """Build the complete HTML page for the processed document.

        Args:
            content: Processed document content in Markdown format.

        Returns:
            str: The full HTML document, including the embedded stylesheet,
            the escaped processing label and the current local timestamp.
        """
        from datetime import datetime

        # Convert the Markdown content to an HTML fragment first.
        html_content = self._markdown_to_html(content)
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>文档处理结果</title>
    <style>{self.css_styles}</style>
</head>
<body>
    <div class="container">
        <h1 class="document-title">文档处理结果</h1>
        <div class="document-header">
            <div class="processing-type">处理方式: {self._escape_html(self.processing_type)}</div>
            <div class="processing-date">处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
        </div>
        <div class="document-content">
            {html_content}
        </div>
    </div>
</body>
</html>
"""