Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
This commit is contained in:
binary-husky
2025-08-23 15:59:22 +08:00
committed by GitHub
parent 65a4cf59c2
commit 8042750d41
79 changed files with 20850 additions and 57 deletions

View File

@@ -0,0 +1,4 @@
from .txt_doc import TxtFormatter
from .markdown_doc import MarkdownFormatter
from .html_doc import HtmlFormatter
from .word_doc import WordFormatter

View File

@@ -0,0 +1,300 @@
class HtmlFormatter:
    """HTML document generator that preserves the structure of the source text.

    The processed text is treated as Markdown, converted to an HTML fragment
    and embedded in a self-contained page (inline stylesheet, header showing
    the processing type and the generation time).
    """

    def __init__(self, processing_type="文本处理"):
        """Store the processing label and the stylesheet embedded in each page.

        Args:
            processing_type: Label shown in the page header. It is HTML-escaped
                when the page is rendered.
        """
        self.processing_type = processing_type
        self.css_styles = """
:root {
--primary-color: #2563eb;
--primary-light: #eff6ff;
--secondary-color: #1e293b;
--background-color: #f8fafc;
--text-color: #334155;
--border-color: #e2e8f0;
--card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
}
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.8;
margin: 0;
padding: 2rem;
color: var(--text-color);
background-color: var(--background-color);
}
.container {
max-width: 1200px;
margin: 0 auto;
background: white;
padding: 2rem;
border-radius: 16px;
box-shadow: var(--card-shadow);
}
::selection {
background: var(--primary-light);
color: var(--primary-color);
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(20px); }
to { opacity: 1; transform: translateY(0); }
}
.container {
animation: fadeIn 0.6s ease-out;
}
.document-title {
color: var(--primary-color);
font-size: 2em;
text-align: center;
margin: 1rem 0 2rem;
padding-bottom: 1rem;
border-bottom: 2px solid var(--primary-color);
}
.document-body {
display: flex;
flex-direction: column;
gap: 1.5rem;
margin: 2rem 0;
}
.document-header {
display: flex;
flex-direction: column;
align-items: center;
margin-bottom: 2rem;
}
.processing-type {
color: var(--secondary-color);
font-size: 1.2em;
margin: 0.5rem 0;
}
.processing-date {
color: var(--text-color);
font-size: 0.9em;
opacity: 0.8;
}
.document-content {
background: white;
padding: 1.5rem;
border-radius: 8px;
border-left: 4px solid var(--primary-color);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
/* 保留文档结构的样式 */
h1, h2, h3, h4, h5, h6 {
color: var(--secondary-color);
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 { font-size: 1.8em; }
h2 { font-size: 1.5em; }
h3 { font-size: 1.3em; }
h4 { font-size: 1.1em; }
p {
margin: 0.8em 0;
}
ul, ol {
margin: 1em 0;
padding-left: 2em;
}
li {
margin: 0.5em 0;
}
blockquote {
margin: 1em 0;
padding: 0.5em 1em;
border-left: 4px solid var(--primary-light);
background: rgba(0,0,0,0.02);
}
code {
font-family: monospace;
background: rgba(0,0,0,0.05);
padding: 0.2em 0.4em;
border-radius: 3px;
}
pre {
background: rgba(0,0,0,0.05);
padding: 1em;
border-radius: 5px;
overflow-x: auto;
}
pre code {
background: transparent;
padding: 0;
}
@media (prefers-color-scheme: dark) {
:root {
--background-color: #0f172a;
--text-color: #e2e8f0;
--border-color: #1e293b;
}
.container, .document-content {
background: #1e293b;
}
blockquote {
background: rgba(255,255,255,0.05);
}
code, pre {
background: rgba(255,255,255,0.05);
}
}
"""

    def _escape_html(self, text):
        """Return *text* with HTML special characters escaped."""
        import html
        return html.escape(text)

    def _markdown_to_html(self, text):
        """Convert Markdown *text* to an HTML fragment.

        Uses the Python-Markdown package when it can be imported; otherwise
        falls back to the small built-in converter.
        """
        try:
            import markdown
            # Extra extensions are enabled so tables, fenced code and nested
            # lists render properly.
            return markdown.markdown(
                text,
                extensions=['tables', 'fenced_code', 'codehilite', 'nl2br',
                            'sane_lists', 'smarty', 'extra'],
            )
        except ImportError:
            return self._fallback_markdown_to_html(text)

    def _fallback_markdown_to_html(self, text):
        """Minimal Markdown-to-HTML conversion used without Python-Markdown.

        Supports level 1-3 ATX headings, flat numbered lists, flat bullet
        lists (``•``, ``*`` or ``-`` markers) and blank-line separated
        paragraphs. Consecutive list items are emitted as ONE ``<ol>``/``<ul>``
        element so numbering is not restarted for every item.
        """
        import re

        # Headings.
        text = re.sub(r'^# (.+)$', r'<h1>\1</h1>', text, flags=re.MULTILINE)
        text = re.sub(r'^## (.+)$', r'<h2>\1</h2>', text, flags=re.MULTILINE)
        text = re.sub(r'^### (.+)$', r'<h3>\1</h3>', text, flags=re.MULTILINE)

        output_lines = []
        list_html = []    # fragments of the list run currently being built
        list_tag = None   # 'ol', 'ul' or None when not inside a list

        for line in text.split('\n'):
            numbered = re.match(r'^(\d+)\.\s+(.+)$', line)
            bullet = re.match(r'^[•\*-]\s+(.+)$', line)

            if numbered or bullet:
                wanted_tag = 'ol' if numbered else 'ul'
                item_text = numbered.group(2) if numbered else bullet.group(1)
                if list_tag != wanted_tag:
                    # Switching list kind: close the previous list first.
                    if list_tag is not None:
                        list_html.append(f'</{list_tag}>')
                    list_html.append(f'<{wanted_tag}>')
                    list_tag = wanted_tag
                list_html.append(f'<li>{item_text}</li>')
                continue

            if list_tag is not None:
                # A non-list line ends the list; emit it as a single line.
                list_html.append(f'</{list_tag}>')
                output_lines.append(''.join(list_html))
                list_html = []
                list_tag = None
            output_lines.append(line)

        # Close a list that runs to the end of the text.
        if list_tag is not None:
            list_html.append(f'</{list_tag}>')
            output_lines.append(''.join(list_html))

        # Wrap blank-line separated chunks in <p>, skipping chunks that are
        # already a complete HTML element (headings, lists).
        paragraphs = '\n'.join(output_lines).split('\n\n')
        for idx, para in enumerate(paragraphs):
            stripped = para.strip()
            if stripped and not (stripped.startswith('<') and stripped.endswith('>')):
                paragraphs[idx] = f'<p>{para}</p>'
        return '\n'.join(paragraphs)

    def create_document(self, content: str) -> str:
        """Build the complete HTML page for the processed text.

        Args:
            content: Processed document text, interpreted as Markdown.

        Returns:
            str: A full HTML document as a string.
        """
        from datetime import datetime

        # Convert the Markdown content to an HTML fragment.
        html_content = self._markdown_to_html(content)
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>文档处理结果</title>
<style>{self.css_styles}</style>
</head>
<body>
<div class="container">
<h1 class="document-title">文档处理结果</h1>
<div class="document-header">
<div class="processing-type">处理方式: {self._escape_html(self.processing_type)}</div>
<div class="processing-date">处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
</div>
<div class="document-content">
{html_content}
</div>
</div>
</body>
</html>
"""

View File

@@ -0,0 +1,40 @@
class MarkdownFormatter:
    """Markdown document generator that keeps the source structure intact."""

    def __init__(self):
        # Pieces of the most recently generated document, in output order.
        self.content = []

    def _add_content(self, text: str):
        """Append *text* as a body section; empty text is ignored."""
        if not text:
            return
        self.content.append(f"\n{text}\n")

    def create_document(self, content: str, processing_type: str = "文本处理") -> str:
        """Assemble the complete Markdown document.

        The result consists of a title, the processing type, a timestamp, a
        rule, the untouched ``content`` and a closing rule.

        Args:
            content: Processed document text, inserted verbatim.
            processing_type: Kind of processing applied (polish, translate, ...).

        Returns:
            str: The generated Markdown text.
        """
        from datetime import datetime

        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Start from a fresh list so repeated calls do not accumulate output.
        self.content = [
            "# 文档处理结果\n",                  # title
            f"## 处理方式: {processing_type}\n",  # processing type
            f"*处理时间: {timestamp}*\n",         # generation time
            "---\n",                              # opening rule
            content,                              # original content, unchanged
            "\n---\n",                            # closing rule
        ]
        return "\n".join(self.content)

View File

@@ -0,0 +1,69 @@
import re
def convert_markdown_to_txt(markdown_text):
    """Convert Markdown to plain text while keeping its overall layout.

    Headings keep their ``#`` prefix, list markers and emphasis markers are
    removed, links become ``text (url)``, runs of three or more newlines are
    reduced to one blank line and runs of spaces are collapsed to one space.

    Args:
        markdown_text: Markdown source text.

    Returns:
        str: The plain-text rendering, stripped of leading/trailing whitespace.
    """
    # Standardize line endings.
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Headings (levels 1-3): keep the marker, normalize the spacing after it.
    text = re.sub(r'^(#{1,3})\s+(.+)$', r'\1 \2', text, flags=re.MULTILINE)

    # 2. List items: drop the marker. This runs BEFORE emphasis handling so a
    #    leading "* " bullet is not mistaken for an opening italic marker, and
    #    it only consumes spaces/tabs (not newlines) so the blank line in front
    #    of a list survives.
    text = re.sub(r'^[ \t]*[-*+][ \t]+(.+?)(?=\n|$)', r'\1', text, flags=re.MULTILINE)

    # 3. Bold and italic: simply remove the markers.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)

    # 4. Markdown links: "text (url)".
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 (\2)', text)

    # 5. HTML links: same user-friendly "text (url)" form.
    text = re.sub(
        r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>',
        r'\2 (\1)',
        text,
    )

    # 6. Preserve paragraph breaks: at most one blank line in a row.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # 7. Collapse runs of spaces (this also collapses leading indentation).
    text = re.sub(r' +', ' ', text)
    return text.strip()
class TxtFormatter:
    """Plain-text formatter that keeps the structure of the source document."""

    def __init__(self):
        # Lines/sections of the document, joined with newlines on output.
        self.content = []
        self._setup_document()

    def _setup_document(self):
        """Append the banner (rule, centered title, rule) to ``self.content``."""
        self.content.append("=" * 50)
        self.content.append("处理后文档".center(48))
        self.content.append("=" * 50)

    def _format_header(self):
        """Return the header lines: the centered current date and a spacer.

        The date is assembled from its numeric parts rather than passed through
        ``strftime`` with non-ASCII characters in the format string, and it now
        ends with the ``日`` suffix that was missing before.
        """
        from datetime import datetime

        now = datetime.now()
        date_str = f"{now.year}年{now.month:02d}月{now.day:02d}日"
        return [
            date_str.center(48),
            "\n",  # spacer before the body
        ]

    def create_document(self, content):
        """Generate the text document for *content*.

        Each call rebuilds the banner and header from scratch, so calling the
        method repeatedly on one instance no longer appends the new document
        after the previous one.

        Args:
            content: Markdown text to convert.

        Returns:
            str: Banner, date header and converted body joined by newlines.
        """
        self.content = []
        self._setup_document()
        # Header information.
        self.content.extend(self._format_header())
        # Body converted to plain text with its structure preserved.
        self.content.append(convert_markdown_to_txt(content))
        return "\n".join(self.content)

View File

@@ -0,0 +1,125 @@
from docx2pdf import convert
import os
import platform
from typing import Union
from pathlib import Path
from datetime import datetime
class WordToPdfConverter:
    """Converter from Word (.docx) documents to PDF."""

    @staticmethod
    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
        """Convert one Word document to PDF.

        On Linux the conversion is delegated to a headless LibreOffice process;
        on other platforms ``docx2pdf.convert`` is used.

        Args:
            word_path: Path of the Word document.
            pdf_path: Optional output path. Defaults to the Word path with a
                ``.pdf`` suffix.

        Returns:
            str: Path of the generated PDF.

        Raises:
            Exception: If the conversion fails for any reason; the original
                error is chained as ``__cause__``.
        """
        try:
            word_path = Path(word_path)
            # Default to the same name and location as the Word document.
            pdf_path = word_path.with_suffix('.pdf') if pdf_path is None else Path(pdf_path)

            if platform.system() == 'Linux':
                import shutil
                import subprocess

                # LibreOffice must be installed on Linux.
                if shutil.which('libreoffice') is None:
                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")

                # Argument list (no shell) so paths containing quotes or other
                # shell metacharacters are passed through safely; check=True
                # surfaces a failed conversion instead of ignoring it.
                subprocess.run(
                    ['libreoffice', '--headless', '--convert-to', 'pdf',
                     str(word_path), '--outdir', str(pdf_path.parent)],
                    check=True,
                )

                # LibreOffice writes "<stem>.pdf" INTO the --outdir directory,
                # which is not necessarily the directory of the Word file.
                generated_pdf = pdf_path.parent / f"{word_path.stem}.pdf"
                if not generated_pdf.exists():
                    raise RuntimeError(f"LibreOffice未生成PDF文件: {generated_pdf}")
                if generated_pdf != pdf_path:
                    os.replace(generated_pdf, pdf_path)
            else:
                # Windows and macOS use docx2pdf.
                convert(word_path, pdf_path)
            return str(pdf_path)
        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}") from e

    @staticmethod
    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
        """Convert every ``*.docx`` file directly inside a directory.

        Files that fail to convert are reported on stdout and skipped.

        Args:
            word_dir: Directory containing the Word documents.
            pdf_dir: Optional output directory (created if missing). Defaults
                to placing each PDF next to its Word document.

        Returns:
            list: Paths (as strings) of the PDFs that were generated.
        """
        word_dir = Path(word_dir)
        if pdf_dir:
            pdf_dir = Path(pdf_dir)
            pdf_dir.mkdir(parents=True, exist_ok=True)

        converted_files = []
        for word_file in word_dir.glob("*.docx"):
            try:
                if pdf_dir:
                    pdf_path = pdf_dir / word_file.with_suffix('.pdf').name
                else:
                    pdf_path = word_file.with_suffix('.pdf')
                converted_files.append(WordToPdfConverter.convert_to_pdf(word_file, pdf_path))
            except Exception as e:
                # Best effort: report and continue with the remaining files.
                print(f"转换 {word_file} 失败: {str(e)}")
        return converted_files

    @staticmethod
    def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
        """Convert an in-memory document object to PDF.

        The document is saved to a temporary ``.docx`` file in ``output_dir``,
        converted, and the temporary file is removed whether or not the
        conversion succeeds.

        Args:
            doc: Object exposing ``save(path)`` (a python-docx ``Document``).
            output_dir: Optional output directory (created if missing).
                Defaults to the current working directory.

        Returns:
            str: Path of the generated PDF.

        Raises:
            Exception: If saving or converting fails; the original error is
                chained as ``__cause__``.
        """
        # Bound up front so the cleanup below is safe even when the failure
        # happens before the temporary path has been computed.
        temp_docx = None
        try:
            output_dir = Path(output_dir) if output_dir else Path.cwd()
            output_dir.mkdir(parents=True, exist_ok=True)

            # Temporary Word file named after the current timestamp.
            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
            doc.save(temp_docx)

            pdf_path = temp_docx.with_suffix('.pdf')
            WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path)
            return str(pdf_path)
        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}") from e
        finally:
            # Always remove the temporary Word file.
            if temp_docx is not None and temp_docx.exists():
                temp_docx.unlink()

View File

@@ -0,0 +1,236 @@
import re
from docx import Document
from docx.shared import Cm, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from datetime import datetime
def convert_markdown_to_word(markdown_text):
    """Simplify Markdown into the line-oriented markup used for Word output.

    The result keeps ``#`` heading prefixes, turns list markers into ``•``,
    removes emphasis markers, rewrites links as ``text (url)``, images as
    ``[图片:alt]``, fenced code as ``[代码块]...[/代码块]`` and inline code as
    ``[代码]...[/代码]``.

    Code is extracted first and restored last, so emphasis/list/link rewriting
    never alters the contents of code blocks or inline code.

    Args:
        markdown_text: Markdown source text.

    Returns:
        str: The simplified text.
    """
    # 0. Normalize all line endings to "\n".
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Replace code with NUL-delimited placeholders so later steps cannot
    #    touch it ("a * b * c" or "snake_case_name" inside code must survive).
    protected = []

    def _stash(rendered):
        protected.append(rendered)
        return f'\x00{len(protected) - 1}\x00'

    # Fenced code blocks.
    text = re.sub(
        r'```(?:\w+)?\n([\s\S]*?)```',
        lambda m: _stash(f'[代码块]\n{m.group(1)}[/代码块]'),
        text,
    )
    # Inline code; NUL is excluded so a span can never swallow a placeholder.
    text = re.sub(
        r'`([^`\x00]+)`',
        lambda m: _stash(f'[代码]{m.group(1)}[/代码]'),
        text,
    )

    # 2. Headings (levels 1-6): keep the marker so the level can still be
    #    recognized later; drop optional closing "#" characters.
    text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', text, flags=re.MULTILINE)

    # 3. Unordered lists: keep the structure with a "•" marker. Done before
    #    emphasis so a leading "* " bullet is not read as an italic marker.
    text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', text, flags=re.MULTILINE)

    # 4. Emphasis: longest markers first; underscore forms only at word
    #    boundaries so identifiers and URLs containing "_" are left alone.
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)         # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)             # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)                 # italic
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)    # underscore bold
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)      # underscore italic

    # 5. Images, before links: the link pattern would otherwise consume the
    #    "[alt](url)" part of "![alt](url)".
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', text)

    # 6. Markdown links (optional quoted title is discarded).
    text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', text)

    # 7. HTML links.
    text = re.sub(
        r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>',
        r'\2 (\1)',
        text,
    )

    # 8. Put the protected code back.
    def _restore(match):
        index = int(match.group(1))
        # Leave NUL sequences that were already in the input untouched.
        return protected[index] if index < len(protected) else match.group(0)

    return re.sub(r'\x00(\d+)\x00', _restore, text)
class WordFormatter:
    """Word (.docx) formatter that keeps the structure of the source document."""

    def __init__(self):
        self.doc = Document()
        self._setup_document()
        self._create_styles()

    def _setup_document(self):
        """Apply page geometry and add a right-aligned page header to each section."""
        for section in self.doc.sections:
            # A4 page size.
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)
            # Page margins.
            section.top_margin = Cm(3.7)
            section.bottom_margin = Cm(3.5)
            section.left_margin = Cm(2.8)
            section.right_margin = Cm(2.6)
            # Header/footer distance from the page edge.
            section.header_distance = Cm(2.0)
            section.footer_distance = Cm(2.0)
            # Page header text.
            header_para = section.header.paragraphs[0]
            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            header_run = header_para.add_run("文档处理结果")
            header_run.font.name = '仿宋'
            # The East Asian font must be set separately for CJK text.
            header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
            header_run.font.size = Pt(9)

    def _add_paragraph_style(self, name, font_name, size_pt, east_asian=True):
        """Create a paragraph style with the given font and size and return it."""
        style = self.doc.styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
        style.font.name = font_name
        if east_asian:
            style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        style.font.size = Pt(size_pt)
        return style

    def _create_styles(self):
        """Register the custom paragraph styles used by ``create_document``."""
        # Body text.
        body = self._add_paragraph_style('Normal_Custom', '仿宋', 12)
        body.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        body.paragraph_format.space_after = Pt(0)

        # Document title.
        title = self._add_paragraph_style('Title_Custom', '黑体', 22)
        title.font.bold = True
        title.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        title.paragraph_format.space_before = Pt(0)
        title.paragraph_format.space_after = Pt(24)
        title.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE

        # Headings 1-3: (style name, font size, space before, space after).
        for name, size, before, after in (
            ('Heading1_Custom', 18, 12, 6),
            ('Heading2_Custom', 16, 10, 6),
            ('Heading3_Custom', 14, 8, 4),
        ):
            heading = self._add_paragraph_style(name, '黑体', size)
            heading.font.bold = True
            heading.paragraph_format.space_before = Pt(before)
            heading.paragraph_format.space_after = Pt(after)

        # Code blocks (Latin monospace font only, no East Asian override).
        code = self._add_paragraph_style('Code_Custom', 'Courier New', 11, east_asian=False)
        code.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
        code.paragraph_format.space_before = Pt(6)
        code.paragraph_format.space_after = Pt(6)
        code.paragraph_format.left_indent = Pt(36)
        code.paragraph_format.right_indent = Pt(36)

        # List items (hanging indent).
        list_style = self._add_paragraph_style('List_Custom', '仿宋', 12)
        list_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        list_style.paragraph_format.left_indent = Pt(21)
        list_style.paragraph_format.first_line_indent = Pt(-21)

    def _add_centered_line(self, text):
        """Add a centered 14pt paragraph containing *text*."""
        para = self.doc.add_paragraph()
        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        run = para.add_run(text)
        run.font.name = '仿宋'
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
        run.font.size = Pt(14)

    @staticmethod
    def _append_code_line(paragraph, text):
        """Append *text* to a code paragraph, on a new line unless it is empty."""
        paragraph.add_run('\n' + text if paragraph.text else text)

    def create_document(self, content: str, processing_type: str = "文本处理"):
        """Fill the Word document from Markdown *content*.

        Args:
            content: Processed document text in Markdown.
            processing_type: Label describing the processing that was applied.

        Returns:
            The python-docx ``Document`` held by this formatter.
        """
        # Title, processing type, date and a spacer paragraph.
        self.doc.add_paragraph(style='Title_Custom').add_run('文档处理结果')
        self._add_centered_line(f"处理方式: {processing_type}")
        now = datetime.now()
        self._add_centered_line(f"处理时间: {now.year}年{now.month:02d}月{now.day:02d}日")
        self.doc.add_paragraph()

        # Simplify the Markdown, then handle it line by line.
        in_code_block = False
        current_paragraph = None
        for line in convert_markdown_to_word(content).split('\n'):
            # Code block markers are checked first so that nothing inside a
            # code block (e.g. "# comment" lines) is treated as a heading.
            if '[代码块]' in line:
                in_code_block = True
                current_paragraph = self.doc.add_paragraph(style='Code_Custom')
                code_line = line.replace('[代码块]', '').strip()
                if code_line:
                    current_paragraph.add_run(code_line)
                continue

            if '[/代码块]' in line:
                in_code_block = False
                code_line = line.replace('[/代码块]', '').strip()
                if code_line and current_paragraph is not None:
                    self._append_code_line(current_paragraph, code_line)
                current_paragraph = None
                continue

            if in_code_block:
                # Code lines are kept verbatim, including blank lines.
                if current_paragraph is None:
                    current_paragraph = self.doc.add_paragraph(line, style='Code_Custom')
                else:
                    self._append_code_line(current_paragraph, line)
                continue

            header_match = re.match(r'^(#{1,6})\s+(.+)$', line)
            if header_match:
                # The number of "#" gives the level; levels >= 3 share a style.
                level = len(header_match.group(1))
                if level == 1:
                    style = 'Heading1_Custom'
                elif level == 2:
                    style = 'Heading2_Custom'
                else:
                    style = 'Heading3_Custom'
                self.doc.add_paragraph(header_match.group(2), style=style)
                current_paragraph = None
            elif line.strip().startswith('•'):
                # List item ("•" is the marker convert_markdown_to_word emits).
                item = self.doc.add_paragraph(style='List_Custom')
                item.add_run(line.strip())
                current_paragraph = None
            elif line.strip():
                # Normal text: consecutive lines share one paragraph.
                if current_paragraph is None or not current_paragraph.text:
                    current_paragraph = self.doc.add_paragraph(line, style='Normal_Custom')
                else:
                    current_paragraph.add_run('\n' + line)
            else:
                # A blank line ends the current paragraph.
                current_paragraph = None
        return self.doc