Master 4.0 (#2210)
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
This commit is contained in:
4
crazy_functions/paper_fns/file2file_doc/__init__.py
Normal file
4
crazy_functions/paper_fns/file2file_doc/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .txt_doc import TxtFormatter
|
||||
from .markdown_doc import MarkdownFormatter
|
||||
from .html_doc import HtmlFormatter
|
||||
from .word_doc import WordFormatter
|
||||
300
crazy_functions/paper_fns/file2file_doc/html_doc.py
Normal file
300
crazy_functions/paper_fns/file2file_doc/html_doc.py
Normal file
@@ -0,0 +1,300 @@
|
||||
class HtmlFormatter:
    """HTML document generator that preserves the structure of the source text.

    The processed content (Markdown) is converted to HTML and embedded in a
    self-contained page that carries its own stylesheet.
    """

    def __init__(self, processing_type="文本处理"):
        """Store the processing label and the stylesheet used by every page.

        Args:
            processing_type: Label describing how the document was processed;
                it is HTML-escaped when rendered into the page header.
        """
        self.processing_type = processing_type
        self.css_styles = """
        :root {
            --primary-color: #2563eb;
            --primary-light: #eff6ff;
            --secondary-color: #1e293b;
            --background-color: #f8fafc;
            --text-color: #334155;
            --border-color: #e2e8f0;
            --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
        }

        body {
            font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.8;
            margin: 0;
            padding: 2rem;
            color: var(--text-color);
            background-color: var(--background-color);
        }

        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 2rem;
            border-radius: 16px;
            box-shadow: var(--card-shadow);
        }
        ::selection {
            background: var(--primary-light);
            color: var(--primary-color);
        }
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(20px); }
            to { opacity: 1; transform: translateY(0); }
        }

        .container {
            animation: fadeIn 0.6s ease-out;
        }

        .document-title {
            color: var(--primary-color);
            font-size: 2em;
            text-align: center;
            margin: 1rem 0 2rem;
            padding-bottom: 1rem;
            border-bottom: 2px solid var(--primary-color);
        }

        .document-body {
            display: flex;
            flex-direction: column;
            gap: 1.5rem;
            margin: 2rem 0;
        }

        .document-header {
            display: flex;
            flex-direction: column;
            align-items: center;
            margin-bottom: 2rem;
        }

        .processing-type {
            color: var(--secondary-color);
            font-size: 1.2em;
            margin: 0.5rem 0;
        }

        .processing-date {
            color: var(--text-color);
            font-size: 0.9em;
            opacity: 0.8;
        }

        .document-content {
            background: white;
            padding: 1.5rem;
            border-radius: 8px;
            border-left: 4px solid var(--primary-color);
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }

        /* 保留文档结构的样式 */
        h1, h2, h3, h4, h5, h6 {
            color: var(--secondary-color);
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }

        h1 { font-size: 1.8em; }
        h2 { font-size: 1.5em; }
        h3 { font-size: 1.3em; }
        h4 { font-size: 1.1em; }

        p {
            margin: 0.8em 0;
        }

        ul, ol {
            margin: 1em 0;
            padding-left: 2em;
        }

        li {
            margin: 0.5em 0;
        }

        blockquote {
            margin: 1em 0;
            padding: 0.5em 1em;
            border-left: 4px solid var(--primary-light);
            background: rgba(0,0,0,0.02);
        }

        code {
            font-family: monospace;
            background: rgba(0,0,0,0.05);
            padding: 0.2em 0.4em;
            border-radius: 3px;
        }

        pre {
            background: rgba(0,0,0,0.05);
            padding: 1em;
            border-radius: 5px;
            overflow-x: auto;
        }

        pre code {
            background: transparent;
            padding: 0;
        }

        @media (prefers-color-scheme: dark) {
            :root {
                --background-color: #0f172a;
                --text-color: #e2e8f0;
                --border-color: #1e293b;
            }

            .container, .document-content {
                background: #1e293b;
            }

            blockquote {
                background: rgba(255,255,255,0.05);
            }

            code, pre {
                background: rgba(255,255,255,0.05);
            }
        }
        """

    def _escape_html(self, text):
        """Return *text* with HTML special characters escaped."""
        import html
        return html.escape(text)

    def _markdown_to_html(self, text):
        """Convert Markdown to HTML while keeping the document structure.

        Uses the Python-Markdown package when it can be imported; otherwise
        falls back to the small built-in converter below.
        """
        try:
            import markdown
        except ImportError:
            return self._basic_markdown_to_html(text)
        # Extra extensions are enabled so tables, fenced code and lists render.
        return markdown.markdown(
            text,
            extensions=['tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', 'smarty', 'extra'],
        )

    @staticmethod
    def _basic_markdown_to_html(text):
        """Minimal Markdown-to-HTML conversion used when `markdown` is missing.

        Handles ATX headings (levels 1-6), flat ordered lists, flat bullet
        lists (`•`, `*`, `-`) and paragraphs. Consecutive list items stay in a
        single list, and headings/lists are emitted as their own blocks so they
        are never wrapped in `<p>`. Text is inserted as-is (not escaped), so
        raw HTML in the input passes through.
        """
        import re

        heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
        numbered_re = re.compile(r'^\d+\.\s+(.+)$')
        bullet_re = re.compile(r'^[•*-]\s+(.+)$')

        blocks = []            # finished top-level HTML blocks, in order
        paragraph_lines = []   # plain lines of the paragraph being collected
        list_items = []        # <li> elements of the list being collected
        list_tag = None        # 'ol' / 'ul' while a list is open, else None

        def flush_paragraph():
            if not paragraph_lines:
                return
            body = '\n'.join(paragraph_lines)
            paragraph_lines.clear()
            stripped = body.strip()
            # A chunk that already looks like an HTML element is passed through.
            if stripped.startswith('<') and stripped.endswith('>'):
                blocks.append(body)
            else:
                blocks.append(f'<p>{body}</p>')

        def flush_list():
            nonlocal list_tag
            if list_tag is None:
                return
            items = ''.join(list_items)
            blocks.append(f'<{list_tag}>{items}</{list_tag}>')
            list_items.clear()
            list_tag = None

        for line in text.split('\n'):
            heading = heading_re.match(line)
            numbered = numbered_re.match(line)
            bullet = bullet_re.match(line)

            if heading:
                flush_paragraph()
                flush_list()
                level = len(heading.group(1))
                blocks.append(f'<h{level}>{heading.group(2)}</h{level}>')
            elif numbered or bullet:
                flush_paragraph()
                wanted_tag = 'ol' if numbered else 'ul'
                if list_tag != wanted_tag:
                    # Switching list kind closes the previous list first.
                    flush_list()
                    list_tag = wanted_tag
                item_text = (numbered or bullet).group(1)
                list_items.append(f'<li>{item_text}</li>')
            elif line.strip():
                flush_list()
                paragraph_lines.append(line)
            else:
                # A blank line ends whatever block is open.
                flush_paragraph()
                flush_list()

        flush_paragraph()
        flush_list()
        return '\n'.join(blocks)

    def create_document(self, content: str) -> str:
        """Build the complete HTML page for the processed content.

        Args:
            content: Processed document content in Markdown.

        Returns:
            str: The full HTML document, including stylesheet, processing
            label and the current local timestamp.
        """
        from datetime import datetime

        # Convert the Markdown content to HTML before embedding it.
        html_content = self._markdown_to_html(content)

        return f"""
        <!DOCTYPE html>
        <html lang="zh-CN">
        <head>
            <meta charset="utf-8">
            <meta name="viewport" content="width=device-width, initial-scale=1">
            <title>文档处理结果</title>
            <style>{self.css_styles}</style>
        </head>
        <body>
            <div class="container">
                <h1 class="document-title">文档处理结果</h1>

                <div class="document-header">
                    <div class="processing-type">处理方式: {self._escape_html(self.processing_type)}</div>
                    <div class="processing-date">处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
                </div>

                <div class="document-content">
                    {html_content}
                </div>
            </div>
        </body>
        </html>
        """
|
||||
40
crazy_functions/paper_fns/file2file_doc/markdown_doc.py
Normal file
40
crazy_functions/paper_fns/file2file_doc/markdown_doc.py
Normal file
@@ -0,0 +1,40 @@
|
||||
class MarkdownFormatter:
    """Markdown document generator that keeps the source structure intact."""

    def __init__(self):
        # Ordered fragments of the document most recently built.
        self.content = []

    def _add_content(self, text: str):
        """Append a body fragment surrounded by newlines; falsy text is skipped."""
        if not text:
            return
        self.content.append(f"\n{text}\n")

    def create_document(self, content: str, processing_type: str = "文本处理") -> str:
        """Assemble the full Markdown document around the processed content.

        Args:
            content: Processed document content, inserted unchanged.
            processing_type: Kind of processing applied (polishing,
                translation, ...), shown in the second-level heading.

        Returns:
            str: The generated Markdown text.
        """
        from datetime import datetime

        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Title, processing label, timestamp, rule, body, closing rule.
        self.content = [
            "# 文档处理结果\n",
            f"## 处理方式: {processing_type}\n",
            f"*处理时间: {timestamp}*\n",
            "---\n",
            content,
            "\n---\n",
        ]
        return "\n".join(self.content)
|
||||
69
crazy_functions/paper_fns/file2file_doc/txt_doc.py
Normal file
69
crazy_functions/paper_fns/file2file_doc/txt_doc.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import re
|
||||
|
||||
def convert_markdown_to_txt(markdown_text):
    """Convert Markdown to plain text while keeping its visible structure.

    Headings keep their `#` markers, list bullets become `•` (indentation
    kept), emphasis markers are removed, and links are rewritten as
    `text (url)`.

    Args:
        markdown_text: Markdown source text.

    Returns:
        str: Plain-text rendering with surrounding whitespace stripped.
    """
    # Standardize line endings
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Headers: keep the marker, normalise the spacing after it.
    #    [ \t] instead of \s so a match can never run across a line break.
    text = re.sub(r'^(#{1,6})[ \t]+(.+)$', r'\1 \2', text, flags=re.MULTILINE)

    # 2. Lists: done BEFORE emphasis so a leading '*' bullet is not taken for
    #    an italic delimiter. Indentation is captured and re-emitted, and
    #    [ \t]* keeps the pattern from swallowing blank lines before a list.
    text = re.sub(r'^([ \t]*)[-*+][ \t]+(.+)$', r'\1• \2', text, flags=re.MULTILINE)

    # 3. Bold and italic: drop the markers. Italic delimiters must hug
    #    non-space text, so expressions like "2 * 3 * 4" are left alone.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)

    # 4. Markdown links: "text (url)"
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 (\2)', text)

    # 5. HTML links: same user-friendly "text (url)" form
    text = re.sub(
        r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>',
        r'\2 (\1)',
        text,
    )

    # 6. Preserve paragraph breaks: 3+ newlines collapse to one blank line
    text = re.sub(r'\n{3,}', '\n\n', text)

    # 7. Collapse runs of spaces inside a line; leading indentation is kept
    #    because the run must follow a non-space character.
    text = re.sub(r'(?<=\S) {2,}', ' ', text)

    return text.strip()
|
||||
|
||||
|
||||
class TxtFormatter:
    """Plain-text formatter that keeps the structure of the source document."""

    def __init__(self):
        # Lines/fragments of the document most recently built.
        self.content = []
        self._setup_document()

    def _setup_document(self):
        """Append the title banner to ``self.content``."""
        self.content.append("=" * 50)
        self.content.append("处理后文档".center(48))
        self.content.append("=" * 50)

    def _format_header(self):
        """Return the header lines: the centred current date and a spacer."""
        from datetime import datetime
        date_str = datetime.now().strftime('%Y年%m月%d日')
        return [
            date_str.center(48),
            "\n"  # spacer between the header and the body
        ]

    def create_document(self, content):
        """Build the text document for *content* (Markdown) and return it.

        The buffer is rebuilt on every call, so reusing one formatter for
        several documents does not leak earlier output into later ones.
        """
        # Start from a fresh banner instead of appending to the previous result.
        self.content = []
        self._setup_document()

        # Header information
        self.content.extend(self._format_header())

        # Body, converted to plain text with its structure preserved
        self.content.append(convert_markdown_to_txt(content))

        return "\n".join(self.content)
|
||||
125
crazy_functions/paper_fns/file2file_doc/word2pdf.py
Normal file
125
crazy_functions/paper_fns/file2file_doc/word2pdf.py
Normal file
@@ -0,0 +1,125 @@
|
||||
from docx2pdf import convert
|
||||
import os
|
||||
import platform
|
||||
from typing import Union
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
class WordToPdfConverter:
    """Word-to-PDF converter (LibreOffice on Linux, docx2pdf elsewhere)."""

    @staticmethod
    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
        """Convert one Word document to PDF.

        Args:
            word_path: Path of the Word document.
            pdf_path: Optional output path. Defaults to the Word document's
                path with a ``.pdf`` suffix.

        Returns:
            The path of the generated PDF, as a string.

        Raises:
            RuntimeError: If the conversion fails for any reason; the
                original error is chained as ``__cause__``.
        """
        # Local imports: these names are not part of the module's import block.
        import shutil
        import subprocess

        try:
            word_path = Path(word_path)
            pdf_path = word_path.with_suffix('.pdf') if pdf_path is None else Path(pdf_path)

            if platform.system() == 'Linux':
                # Linux relies on a LibreOffice installation.
                if shutil.which('libreoffice') is None:
                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")

                # Argument list (no shell) so odd characters in file names
                # cannot be interpreted as shell syntax; check=True surfaces
                # a failing conversion instead of ignoring the exit status.
                subprocess.run(
                    [
                        'libreoffice', '--headless', '--convert-to', 'pdf',
                        str(word_path), '--outdir', str(pdf_path.parent),
                    ],
                    check=True,
                )

                # LibreOffice writes "<stem>.pdf" into --outdir, i.e. next to
                # the requested output, not next to the Word file.
                produced_pdf = pdf_path.parent / f"{word_path.stem}.pdf"
                if produced_pdf != pdf_path:
                    os.replace(produced_pdf, pdf_path)
            else:
                # Windows and macOS use docx2pdf.
                convert(word_path, pdf_path)

            return str(pdf_path)

        except Exception as e:
            raise RuntimeError(f"转换PDF失败: {str(e)}") from e

    @staticmethod
    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
        """Convert every ``*.docx`` file directly inside a directory.

        Args:
            word_dir: Directory containing the Word documents.
            pdf_dir: Optional output directory (created if missing). Defaults
                to writing each PDF next to its Word document.

        Returns:
            List of generated PDF paths (strings). Files that fail to convert
            are reported on stdout and skipped.
        """
        word_dir = Path(word_dir)
        if pdf_dir:
            pdf_dir = Path(pdf_dir)
            pdf_dir.mkdir(parents=True, exist_ok=True)

        converted_files = []

        for word_file in word_dir.glob("*.docx"):
            sibling_pdf = word_file.with_suffix('.pdf')
            pdf_path = pdf_dir / sibling_pdf.name if pdf_dir else sibling_pdf
            try:
                converted_files.append(WordToPdfConverter.convert_to_pdf(word_file, pdf_path))
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"转换 {word_file} 失败: {str(e)}")

        return converted_files

    @staticmethod
    def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
        """Convert an in-memory docx object to PDF.

        Args:
            doc: Object with a ``save(path)`` method (a python-docx Document).
            output_dir: Optional output directory (created if missing).
                Defaults to the current working directory.

        Returns:
            The path of the generated PDF, as a string.

        Raises:
            RuntimeError: If saving or converting fails; the original error
                is chained as ``__cause__``.
        """
        # Bound before the try block so cleanup never hits an unbound name
        # when the failure happens before the temp file path is computed.
        temp_docx = None
        try:
            output_dir = Path(output_dir) if output_dir else Path.cwd()
            output_dir.mkdir(parents=True, exist_ok=True)

            # Write the document to a temporary Word file.
            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
            doc.save(temp_docx)

            pdf_path = temp_docx.with_suffix('.pdf')
            WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path)

            return str(pdf_path)

        except Exception as e:
            raise RuntimeError(f"转换PDF失败: {str(e)}") from e
        finally:
            # Remove the temporary Word file on success and on failure alike.
            if temp_docx is not None and temp_docx.exists():
                temp_docx.unlink()
|
||||
236
crazy_functions/paper_fns/file2file_doc/word_doc.py
Normal file
236
crazy_functions/paper_fns/file2file_doc/word_doc.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import re
|
||||
from docx import Document
|
||||
from docx.shared import Cm, Pt
|
||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
|
||||
from docx.enum.style import WD_STYLE_TYPE
|
||||
from docx.oxml.ns import qn
|
||||
from datetime import datetime
|
||||
|
||||
def convert_markdown_to_word(markdown_text):
    """Simplify Markdown into the line-oriented markup WordFormatter consumes.

    Result conventions:
      * headings keep their leading ``#`` markers (trailing ``#`` removed);
      * emphasis markers are removed;
      * fenced code becomes ``[代码块]`` ... ``[/代码块]``, inline code becomes
        ``[代码]`` ... ``[/代码]``; code content is left untouched;
      * list bullets become ``•`` with indentation kept;
      * images become ``[图片:alt]``; links become ``text (url)``.

    Args:
        markdown_text: Markdown source text.

    Returns:
        str: The simplified text.
    """
    # 0. Normalise all line endings to \n
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Code first: convert each code span to its marker form and park it
    #    behind a NUL-delimited placeholder, so none of the rules below can
    #    rewrite characters inside code (e.g. a*b*c, my_var, "- x", "# x").
    parked_code = []

    def _park(match):
        fenced, inline = match.group(1), match.group(2)
        if fenced is not None:
            parked_code.append(f'[代码块]\n{fenced}[/代码块]')
        else:
            parked_code.append(f'[代码]{inline}[/代码]')
        return f'\x00{len(parked_code) - 1}\x00'

    text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```|`([^`]+)`', _park, text)

    # 2. Headings: keep the level marker so later stages can still detect it,
    #    drop optional closing hashes. [ \t] keeps matches on one line.
    text = re.sub(r'^(#{1,6})[ \t]+(.+?)(?:[ \t]+#+)?$', r'\1 \2', text, flags=re.MULTILINE)

    # 3. Bold / italic. Longest delimiters first; single delimiters must hug
    #    non-space text, and underscore emphasis is never intraword, so
    #    "2 * 3 * 4" and "my_var_name" survive.
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)                       # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)                           # bold
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)                  # italic
    text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)     # underscore bold
    text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)       # underscore italic

    # 4. Unordered lists: keep indentation, unify the bullet
    text = re.sub(r'^([ \t]*)[-*+][ \t]+(.+?)$', r'\1• \2', text, flags=re.MULTILINE)

    # 5. Images BEFORE links: the link pattern would otherwise consume the
    #    "[alt](url)" part of "![alt](url)".
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', text)

    # 6. Markdown links (optional "title" dropped)
    text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', text)

    # 7. HTML links
    text = re.sub(
        r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>',
        r'\2 (\1)',
        text,
    )

    # 8. Put the converted code back. Unknown indices (a literal NUL sequence
    #    in the input) are left as they are.
    def _restore(match):
        index = int(match.group(1))
        return parked_code[index] if index < len(parked_code) else match.group(0)

    return re.sub(r'\x00(\d+)\x00', _restore, text)
|
||||
|
||||
|
||||
class WordFormatter:
    """Word (docx) formatter that keeps the structure of the source document."""

    def __init__(self):
        self.doc = Document()
        self._setup_document()
        self._create_styles()

    @staticmethod
    def _set_run_font(run, font_name, size_pt):
        """Apply a font (including the East Asian font slot) and size to a run."""
        run.font.name = font_name
        # font.name alone does not cover CJK text; set w:eastAsia explicitly.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        run.font.size = Pt(size_pt)

    def _add_paragraph_style(self, name, font_name, size_pt, east_asian=True):
        """Register a paragraph style with the given font and return it."""
        style = self.doc.styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
        style.font.name = font_name
        if east_asian:
            style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        style.font.size = Pt(size_pt)
        return style

    def _setup_document(self):
        """Set page geometry and the page header for every section."""
        for section in self.doc.sections:
            # A4 page
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)
            # Margins
            section.top_margin = Cm(3.7)      # 37 mm
            section.bottom_margin = Cm(3.5)   # 35 mm
            section.left_margin = Cm(2.8)     # 28 mm
            section.right_margin = Cm(2.6)    # 26 mm
            # Header / footer distance
            section.header_distance = Cm(2.0)
            section.footer_distance = Cm(2.0)

            # Right-aligned running header
            header_para = section.header.paragraphs[0]
            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            header_run = header_para.add_run("文档处理结果")
            self._set_run_font(header_run, '仿宋', 9)

    def _create_styles(self):
        """Create the custom paragraph styles used by create_document."""
        # Body text
        body_style = self._add_paragraph_style('Normal_Custom', '仿宋', 12)
        body_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        body_style.paragraph_format.space_after = Pt(0)

        # Document title
        title_style = self._add_paragraph_style('Title_Custom', '黑体', 22)
        title_style.font.bold = True
        title_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        title_style.paragraph_format.space_before = Pt(0)
        title_style.paragraph_format.space_after = Pt(24)
        title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE

        # Heading levels 1-3: (style name, font size, space before, space after)
        heading_specs = (
            ('Heading1_Custom', 18, 12, 6),
            ('Heading2_Custom', 16, 10, 6),
            ('Heading3_Custom', 14, 8, 4),
        )
        for style_name, size_pt, before_pt, after_pt in heading_specs:
            heading_style = self._add_paragraph_style(style_name, '黑体', size_pt)
            heading_style.font.bold = True
            heading_style.paragraph_format.space_before = Pt(before_pt)
            heading_style.paragraph_format.space_after = Pt(after_pt)

        # Code blocks (Latin monospace font only, no East Asian override)
        code_style = self._add_paragraph_style('Code_Custom', 'Courier New', 11, east_asian=False)
        code_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
        code_style.paragraph_format.space_before = Pt(6)
        code_style.paragraph_format.space_after = Pt(6)
        code_style.paragraph_format.left_indent = Pt(36)
        code_style.paragraph_format.right_indent = Pt(36)

        # List items (hanging indent for the bullet)
        list_style = self._add_paragraph_style('List_Custom', '仿宋', 12)
        list_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        list_style.paragraph_format.left_indent = Pt(21)
        list_style.paragraph_format.first_line_indent = Pt(-21)

    def _add_code_paragraph(self, code_lines):
        """Emit one Code_Custom paragraph; '\\n' separates the code lines."""
        self.doc.add_paragraph('\n'.join(code_lines), style='Code_Custom')

    def create_document(self, content: str, processing_type: str = "文本处理"):
        """Append the formatted document to ``self.doc`` and return it.

        Args:
            content: Processed document content in Markdown.
            processing_type: Label describing how the document was processed.

        Returns:
            The python-docx document held by this formatter.
        """
        # Title
        title_para = self.doc.add_paragraph(style='Title_Custom')
        title_para.add_run('文档处理结果')

        # Centred processing label and date
        info_lines = (
            f"处理方式: {processing_type}",
            f"处理时间: {datetime.now().strftime('%Y年%m月%d日')}",
        )
        for info_text in info_lines:
            info_para = self.doc.add_paragraph()
            info_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            self._set_run_font(info_para.add_run(info_text), '仿宋', 14)

        self.doc.add_paragraph()  # spacer line

        # Reduce Markdown to the simplified, line-oriented markup
        processed_content = convert_markdown_to_word(content)

        code_lines = None          # list of lines while inside a code block
        current_paragraph = None   # body paragraph being extended, if any

        for line in processed_content.split('\n'):
            # Code-block state wins over every other rule, so code lines that
            # start with '#' or '•' are not mistaken for headings or bullets
            # and blank lines inside code are kept.
            if code_lines is not None:
                if '[/代码块]' in line:
                    tail = line.replace('[/代码块]', '').strip()
                    if tail:
                        code_lines.append(tail)
                    self._add_code_paragraph(code_lines)
                    code_lines = None
                else:
                    code_lines.append(line)
                continue

            header_match = re.match(r'^(#{1,6})\s+(.+)$', line)

            if header_match:
                # The number of '#' picks the style; levels beyond 3 share level 3.
                level = len(header_match.group(1))
                if level == 1:
                    style = 'Heading1_Custom'
                elif level == 2:
                    style = 'Heading2_Custom'
                else:
                    style = 'Heading3_Custom'
                self.doc.add_paragraph(header_match.group(2), style=style)
                current_paragraph = None

            elif '[代码块]' in line:
                head = line.replace('[代码块]', '')
                current_paragraph = None
                if '[/代码块]' in head:
                    # Opening and closing marker on the same line.
                    self._add_code_paragraph([head.replace('[/代码块]', '').strip()])
                else:
                    head = head.strip()
                    code_lines = [head] if head else []

            elif '[/代码块]' in line:
                # Stray closing marker: keep any remaining text, end the paragraph.
                tail = line.replace('[/代码块]', '').strip()
                if tail and current_paragraph is not None:
                    current_paragraph.add_run(tail)
                current_paragraph = None

            elif line.strip().startswith('•'):
                list_para = self.doc.add_paragraph(style='List_Custom')
                list_para.add_run(line.strip())
                current_paragraph = None

            elif line.strip():
                if current_paragraph is None or not current_paragraph.text:
                    current_paragraph = self.doc.add_paragraph(line, style='Normal_Custom')
                else:
                    # Consecutive text lines stay in one paragraph, separated
                    # by a line break.
                    current_paragraph.add_run('\n' + line)

            else:
                # A blank line ends the current paragraph.
                current_paragraph = None

        # An unterminated code block is still emitted rather than lost.
        if code_lines is not None:
            self._add_code_paragraph(code_lines)

        return self.doc
|
||||
|
||||
Reference in New Issue
Block a user