diff --git a/crazy_functions/doc_fns/AI_review_doc.py b/crazy_functions/doc_fns/AI_review_doc.py
new file mode 100644
index 00000000..483ffc63
--- /dev/null
+++ b/crazy_functions/doc_fns/AI_review_doc.py
@@ -0,0 +1,812 @@
+import os
+import time
+from abc import ABC, abstractmethod
+from datetime import datetime
+from docx import Document
+from docx.enum.style import WD_STYLE_TYPE
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
+from docx.oxml.ns import qn
+from docx.shared import Pt, RGBColor, Inches, Cm
+from typing import Dict, List, Tuple
+import markdown
+from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word
+
+
+
+class DocumentFormatter(ABC):
+ """文档格式化基类,定义文档格式化的基本接口"""
+
+ def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]):
+ self.final_summary = final_summary
+ self.file_summaries_map = file_summaries_map
+ self.failed_files = failed_files
+
+ @abstractmethod
+ def format_failed_files(self) -> str:
+ """格式化失败文件列表"""
+ pass
+
+ @abstractmethod
+ def format_file_summaries(self) -> str:
+ """格式化文件总结内容"""
+ pass
+
+ @abstractmethod
+ def create_document(self) -> str:
+ """创建完整文档"""
+ pass
+
+
+class WordFormatter(DocumentFormatter):
+ """Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.doc = Document()
+ self._setup_document()
+ self._create_styles()
+ # 初始化三级标题编号系统
+ self.numbers = {
+ 1: 0, # 一级标题编号
+ 2: 0, # 二级标题编号
+ 3: 0 # 三级标题编号
+ }
+
+ def _setup_document(self):
+ """设置文档基本格式,包括页面设置和页眉"""
+ sections = self.doc.sections
+ for section in sections:
+ # 设置页面大小为A4
+ section.page_width = Cm(21)
+ section.page_height = Cm(29.7)
+ # 设置页边距
+ section.top_margin = Cm(3.7) # 上边距37mm
+ section.bottom_margin = Cm(3.5) # 下边距35mm
+ section.left_margin = Cm(2.8) # 左边距28mm
+ section.right_margin = Cm(2.6) # 右边距26mm
+ # 设置页眉页脚距离
+ section.header_distance = Cm(2.0)
+ section.footer_distance = Cm(2.0)
+
+ # 添加页眉
+ header = section.header
+ header_para = header.paragraphs[0]
+ header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
+ header_run = header_para.add_run("该文档由GPT-academic生成")
+ header_run.font.name = '仿宋'
+ header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ header_run.font.size = Pt(9)
+
+ def _create_styles(self):
+ """创建文档样式"""
+ # 创建正文样式
+ style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
+ style.font.name = '仿宋'
+ style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ style.font.size = Pt(14)
+ style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ style.paragraph_format.space_after = Pt(0)
+ style.paragraph_format.first_line_indent = Pt(28)
+
+ # 创建各级标题样式
+ self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER)
+ self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT)
+ self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT)
+ self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT)
+
+ def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment):
+ """创建标题样式"""
+ style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)
+ style.font.name = font_name
+ style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+ style.font.size = Pt(font_size)
+ style.font.bold = True
+ style.paragraph_format.alignment = alignment
+ style.paragraph_format.space_before = Pt(12)
+ style.paragraph_format.space_after = Pt(12)
+ style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ return style
+
+ def _get_heading_number(self, level: int) -> str:
+ """
+ 生成标题编号
+
+ Args:
+ level: 标题级别 (0-3)
+
+ Returns:
+ str: 格式化的标题编号
+ """
+ if level == 0: # 主标题不需要编号
+ return ""
+
+ self.numbers[level] += 1 # 增加当前级别的编号
+
+ # 重置下级标题编号
+ for i in range(level + 1, 4):
+ self.numbers[i] = 0
+
+ # 根据级别返回不同格式的编号
+ if level == 1:
+ return f"{self.numbers[1]}. "
+ elif level == 2:
+ return f"{self.numbers[1]}.{self.numbers[2]} "
+ elif level == 3:
+ return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} "
+ return ""
+
+ def _add_heading(self, text: str, level: int):
+ """
+ 添加带编号的标题
+
+ Args:
+ text: 标题文本
+ level: 标题级别 (0-3)
+ """
+ style_map = {
+ 0: 'Title_Custom',
+ 1: 'Heading1_Custom',
+ 2: 'Heading2_Custom',
+ 3: 'Heading3_Custom'
+ }
+
+ number = self._get_heading_number(level)
+ paragraph = self.doc.add_paragraph(style=style_map[level])
+
+ if number:
+ number_run = paragraph.add_run(number)
+ font_size = 22 if level == 1 else (18 if level == 2 else 16)
+ self._get_run_style(number_run, '黑体', font_size, True)
+
+        text_run = paragraph.add_run(text)
+        font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16))
+        # 主标题按 Title_Custom 样式使用方正小标宋简体,其余级别使用黑体
+        font_name = '方正小标宋简体' if level == 0 else '黑体'
+        self._get_run_style(text_run, font_name, font_size, True)
+
+ # 主标题添加日期
+ if level == 0:
+ date_paragraph = self.doc.add_paragraph()
+ date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+ date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d日'))
+ self._get_run_style(date_run, '仿宋', 16, False)
+
+ return paragraph
+
+ def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False):
+ """设置文本运行对象的样式"""
+ run.font.name = font_name
+ run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+ run.font.size = Pt(font_size)
+ run.font.bold = bold
+
+ def format_failed_files(self) -> str:
+ """格式化失败文件列表"""
+ result = []
+ if not self.failed_files:
+ return "\n".join(result)
+
+ result.append("处理失败文件:")
+ for fp, reason in self.failed_files:
+ result.append(f"• {os.path.basename(fp)}: {reason}")
+
+ self._add_heading("处理失败文件", 1)
+ for fp, reason in self.failed_files:
+ self._add_content(f"• {os.path.basename(fp)}: {reason}", indent=False)
+ self.doc.add_paragraph()
+
+ return "\n".join(result)
+
+ def _add_content(self, text: str, indent: bool = True):
+ """添加正文内容,使用convert_markdown_to_word处理文本"""
+ # 使用convert_markdown_to_word处理markdown文本
+ processed_text = convert_markdown_to_word(text)
+ paragraph = self.doc.add_paragraph(processed_text, style='Normal_Custom')
+ if not indent:
+ paragraph.paragraph_format.first_line_indent = Pt(0)
+ return paragraph
+
+ def format_file_summaries(self) -> str:
+ """
+ 格式化文件总结内容,确保正确的标题层级并处理markdown文本
+ """
+ result = []
+ # 首先对文件路径进行分组整理
+ file_groups = {}
+ for path in sorted(self.file_summaries_map.keys()):
+ dir_path = os.path.dirname(path)
+ if dir_path not in file_groups:
+ file_groups[dir_path] = []
+ file_groups[dir_path].append(path)
+
+ # 处理没有目录的文件
+ root_files = file_groups.get("", [])
+ if root_files:
+ for path in sorted(root_files):
+ file_name = os.path.basename(path)
+ result.append(f"\n📄 {file_name}")
+ result.append(self.file_summaries_map[path])
+ # 无目录的文件作为二级标题
+ self._add_heading(f"📄 {file_name}", 2)
+                # _add_content 内部已调用 convert_markdown_to_word,直接传入原文,避免二次转换
+                self._add_content(self.file_summaries_map[path])
+ self.doc.add_paragraph()
+
+ # 处理有目录的文件
+ for dir_path in sorted(file_groups.keys()):
+ if dir_path == "": # 跳过已处理的根目录文件
+ continue
+
+ # 添加目录作为二级标题
+ result.append(f"\n📁 {dir_path}")
+ self._add_heading(f"📁 {dir_path}", 2)
+
+ # 该目录下的所有文件作为三级标题
+ for path in sorted(file_groups[dir_path]):
+ file_name = os.path.basename(path)
+ result.append(f"\n📄 {file_name}")
+ result.append(self.file_summaries_map[path])
+
+ # 添加文件名作为三级标题
+ self._add_heading(f"📄 {file_name}", 3)
+                # _add_content 内部已调用 convert_markdown_to_word,直接传入原文,避免二次转换
+                self._add_content(self.file_summaries_map[path])
+ self.doc.add_paragraph()
+
+ return "\n".join(result)
+
+
+ def create_document(self):
+ """创建完整Word文档并返回文档对象"""
+ # 重置所有编号
+ for level in self.numbers:
+ self.numbers[level] = 0
+
+ # 添加主标题
+ self._add_heading("文档总结报告", 0)
+ self.doc.add_paragraph()
+
+        # 添加总体摘要(_add_content 内部会调用 convert_markdown_to_word,无需重复转换)
+        self._add_heading("总体摘要", 1)
+        self._add_content(self.final_summary)
+ self.doc.add_paragraph()
+
+ # 添加失败文件列表(如果有)
+ if self.failed_files:
+ self.format_failed_files()
+
+ # 添加文件详细总结
+ self._add_heading("各文件详细总结", 1)
+ self.format_file_summaries()
+
+ return self.doc
+
+ def save_as_pdf(self, word_path, pdf_path=None):
+ """将生成的Word文档转换为PDF
+
+ 参数:
+ word_path: Word文档的路径
+ pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置
+
+ 返回:
+ 生成的PDF文件路径,如果转换失败则返回None
+ """
+ from crazy_functions.doc_fns.conversation_doc.word2pdf import WordToPdfConverter
+ try:
+ pdf_path = WordToPdfConverter.convert_to_pdf(word_path, pdf_path)
+ return pdf_path
+ except Exception as e:
+ print(f"PDF转换失败: {str(e)}")
+ return None
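+
+# 用法示意(仅作说明,文件名为假设):
+#   formatter = WordFormatter(final_summary, file_summaries_map, failed_files)
+#   doc = formatter.create_document()
+#   doc.save("summary_report.docx")
+#   formatter.save_as_pdf("summary_report.docx")  # 可选:再导出PDF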
+
+
+class MarkdownFormatter(DocumentFormatter):
+ """Markdown格式文档生成器"""
+
+ def format_failed_files(self) -> str:
+ if not self.failed_files:
+ return ""
+
+ formatted_text = ["\n## ⚠️ 处理失败的文件"]
+ for fp, reason in self.failed_files:
+ formatted_text.append(f"- {os.path.basename(fp)}: {reason}")
+ formatted_text.append("\n---")
+ return "\n".join(formatted_text)
+
+ def format_file_summaries(self) -> str:
+ formatted_text = []
+ sorted_paths = sorted(self.file_summaries_map.keys())
+ current_dir = ""
+
+ for path in sorted_paths:
+ dir_path = os.path.dirname(path)
+ if dir_path != current_dir:
+ if dir_path:
+ formatted_text.append(f"\n## 📁 {dir_path}")
+ current_dir = dir_path
+
+ file_name = os.path.basename(path)
+ formatted_text.append(f"\n### 📄 {file_name}")
+ formatted_text.append(self.file_summaries_map[path])
+ formatted_text.append("\n---")
+
+ return "\n".join(formatted_text)
+
+ def create_document(self) -> str:
+ document = [
+ "# 📑 文档总结报告",
+ "\n## 总体摘要",
+ self.final_summary
+ ]
+
+ if self.failed_files:
+ document.append(self.format_failed_files())
+
+ document.extend([
+ "\n# 📚 各文件详细总结",
+ self.format_file_summaries()
+ ])
+
+ return "\n".join(document)
+
+
+
+class HtmlFormatter(DocumentFormatter):
+ """HTML格式文档生成器 - 优化版"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.md = markdown.Markdown(extensions=['extra','codehilite', 'tables','nl2br'])
+ self.css_styles = """
+ @keyframes fadeIn {
+ from { opacity: 0; transform: translateY(20px); }
+ to { opacity: 1; transform: translateY(0); }
+ }
+
+ @keyframes slideIn {
+ from { transform: translateX(-20px); opacity: 0; }
+ to { transform: translateX(0); opacity: 1; }
+ }
+
+ @keyframes pulse {
+ 0% { transform: scale(1); }
+ 50% { transform: scale(1.05); }
+ 100% { transform: scale(1); }
+ }
+
+ :root {
+ /* Enhanced color palette */
+ --primary-color: #2563eb;
+ --primary-light: #eff6ff;
+ --secondary-color: #1e293b;
+ --background-color: #f8fafc;
+ --text-color: #334155;
+ --text-light: #64748b;
+ --border-color: #e2e8f0;
+ --error-color: #ef4444;
+ --error-light: #fef2f2;
+ --success-color: #22c55e;
+ --warning-color: #f59e0b;
+ --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
+ --hover-shadow: 0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1);
+
+ /* Typography */
+ --heading-font: "Plus Jakarta Sans", system-ui, sans-serif;
+ --body-font: "Inter", system-ui, sans-serif;
+ }
+
+ body {
+ font-family: var(--body-font);
+ line-height: 1.8;
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 2rem;
+ color: var(--text-color);
+ background-color: var(--background-color);
+ font-size: 16px;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ .container {
+ background: white;
+ padding: 3rem;
+ border-radius: 24px;
+ box-shadow: var(--card-shadow);
+ transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+ animation: fadeIn 0.6s ease-out;
+ border: 1px solid var(--border-color);
+ }
+
+ .container:hover {
+ box-shadow: var(--hover-shadow);
+ transform: translateY(-2px);
+ }
+
+ h1, h2, h3 {
+ font-family: var(--heading-font);
+ font-weight: 600;
+ }
+
+ h1 {
+ color: var(--primary-color);
+ font-size: 2.8em;
+ text-align: center;
+ margin: 2rem 0 3rem;
+ padding-bottom: 1.5rem;
+ border-bottom: 3px solid var(--primary-color);
+ letter-spacing: -0.03em;
+ position: relative;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ gap: 1rem;
+ }
+
+ h1::after {
+ content: '';
+ position: absolute;
+ bottom: -3px;
+ left: 50%;
+ transform: translateX(-50%);
+ width: 120px;
+ height: 3px;
+ background: linear-gradient(90deg, var(--primary-color), var(--primary-light));
+ border-radius: 3px;
+ transition: width 0.3s ease;
+ }
+
+ h1:hover::after {
+ width: 180px;
+ }
+
+ h2 {
+ color: var(--secondary-color);
+ font-size: 1.9em;
+ margin: 2.5rem 0 1.5rem;
+ padding-left: 1.2rem;
+ border-left: 4px solid var(--primary-color);
+ letter-spacing: -0.02em;
+ display: flex;
+ align-items: center;
+ gap: 1rem;
+ transition: all 0.3s ease;
+ }
+
+ h2:hover {
+ color: var(--primary-color);
+ transform: translateX(5px);
+ }
+
+ h3 {
+ color: var(--text-color);
+ font-size: 1.5em;
+ margin: 2rem 0 1rem;
+ padding-bottom: 0.8rem;
+ border-bottom: 2px solid var(--border-color);
+ transition: all 0.3s ease;
+ display: flex;
+ align-items: center;
+ gap: 0.8rem;
+ }
+
+ h3:hover {
+ color: var(--primary-color);
+ border-bottom-color: var(--primary-color);
+ }
+
+ .summary {
+ background: var(--primary-light);
+ padding: 2.5rem;
+ border-radius: 16px;
+ margin: 2.5rem 0;
+ box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.1);
+ position: relative;
+ overflow: hidden;
+ transition: transform 0.3s ease, box-shadow 0.3s ease;
+ animation: slideIn 0.5s ease-out;
+ }
+
+ .summary:hover {
+ transform: translateY(-3px);
+ box-shadow: 0 8px 12px -2px rgba(37, 99, 235, 0.15);
+ }
+
+ .summary::before {
+ content: '';
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 4px;
+ height: 100%;
+ background: linear-gradient(to bottom, var(--primary-color), rgba(37, 99, 235, 0.6));
+ }
+
+ .summary p {
+ margin: 1.2rem 0;
+ line-height: 1.9;
+ color: var(--text-color);
+ transition: color 0.3s ease;
+ }
+
+ .summary:hover p {
+ color: var(--secondary-color);
+ }
+
+ .details {
+ margin-top: 3.5rem;
+ padding-top: 2.5rem;
+ border-top: 2px dashed var(--border-color);
+ animation: fadeIn 0.8s ease-out;
+ }
+
+ .failed-files {
+ background: var(--error-light);
+ padding: 2rem;
+ border-radius: 16px;
+ margin: 3rem 0;
+ border-left: 4px solid var(--error-color);
+ position: relative;
+ transition: all 0.3s ease;
+ animation: slideIn 0.5s ease-out;
+ }
+
+ .failed-files:hover {
+ transform: translateX(5px);
+ box-shadow: 0 8px 15px -3px rgba(239, 68, 68, 0.1);
+ }
+
+ .failed-files h2 {
+ color: var(--error-color);
+ border-left: none;
+ padding-left: 0;
+ }
+
+ .failed-files ul {
+ margin: 1.8rem 0;
+ padding-left: 1.2rem;
+ list-style-type: none;
+ }
+
+ .failed-files li {
+ margin: 1.2rem 0;
+ padding: 1.2rem 1.8rem;
+ background: rgba(239, 68, 68, 0.08);
+ border-radius: 12px;
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+ }
+
+ .failed-files li:hover {
+ transform: translateX(8px);
+ background: rgba(239, 68, 68, 0.12);
+ }
+
+ .directory-section {
+ margin: 3.5rem 0;
+ padding: 2rem;
+ background: var(--background-color);
+ border-radius: 16px;
+ position: relative;
+ transition: all 0.3s ease;
+ animation: fadeIn 0.6s ease-out;
+ }
+
+ .directory-section:hover {
+ background: white;
+ box-shadow: var(--card-shadow);
+ }
+
+ .file-summary {
+ background: white;
+ padding: 2rem;
+ margin: 1.8rem 0;
+ border-radius: 16px;
+ box-shadow: var(--card-shadow);
+ border-left: 4px solid var(--border-color);
+ transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+ position: relative;
+ overflow: hidden;
+ }
+
+ .file-summary:hover {
+ border-left-color: var(--primary-color);
+ transform: translateX(8px) translateY(-2px);
+ box-shadow: var(--hover-shadow);
+ }
+
+ .icon {
+ display: inline-flex;
+ align-items: center;
+ justify-content: center;
+ width: 32px;
+ height: 32px;
+ border-radius: 8px;
+ background: var(--primary-light);
+ color: var(--primary-color);
+ font-size: 1.2em;
+ transition: all 0.3s ease;
+ }
+
+ .file-summary:hover .icon,
+ .directory-section:hover .icon {
+ transform: scale(1.1);
+ background: var(--primary-color);
+ color: white;
+ }
+
+ /* Smooth scrolling */
+ html {
+ scroll-behavior: smooth;
+ }
+
+ /* Selection style */
+ ::selection {
+ background: var(--primary-light);
+ color: var(--primary-color);
+ }
+
+ /* Print styles */
+ @media print {
+ body {
+ background: white;
+ }
+ .container {
+ box-shadow: none;
+ padding: 0;
+ }
+ .file-summary, .failed-files {
+ break-inside: avoid;
+ box-shadow: none;
+ }
+ .icon {
+ display: none;
+ }
+ }
+
+ /* Responsive design */
+ @media (max-width: 768px) {
+ body {
+ padding: 1rem;
+ font-size: 15px;
+ }
+
+ .container {
+ padding: 1.5rem;
+ }
+
+ h1 {
+ font-size: 2.2em;
+ margin: 1.5rem 0 2rem;
+ }
+
+ h2 {
+ font-size: 1.7em;
+ }
+
+ h3 {
+ font-size: 1.4em;
+ }
+
+ .summary, .failed-files, .directory-section {
+ padding: 1.5rem;
+ }
+
+ .file-summary {
+ padding: 1.2rem;
+ }
+
+ .icon {
+ width: 28px;
+ height: 28px;
+ }
+ }
+
+ /* Dark mode support */
+ @media (prefers-color-scheme: dark) {
+ :root {
+ --primary-light: rgba(37, 99, 235, 0.15);
+ --background-color: #0f172a;
+ --text-color: #e2e8f0;
+ --text-light: #94a3b8;
+ --border-color: #1e293b;
+ --error-light: rgba(239, 68, 68, 0.15);
+ }
+
+ .container, .file-summary {
+ background: #1e293b;
+ }
+
+ .directory-section {
+ background: #0f172a;
+ }
+
+ .directory-section:hover {
+ background: #1e293b;
+ }
+ }
+ """
+
+ def format_failed_files(self) -> str:
+ if not self.failed_files:
+ return ""
+
+        failed_files_html = ['<div class="failed-files">']
+        failed_files_html.append('<h2><span class="icon">⚠️</span> 处理失败的文件</h2>')
+        failed_files_html.append("<ul>")
+        for fp, reason in self.failed_files:
+            failed_files_html.append(
+                f'<li>📄 <strong>{os.path.basename(fp)}</strong><br>{reason}</li>'
+            )
+        failed_files_html.append("</ul></div>")
+        return "\n".join(failed_files_html)
+
+ def format_file_summaries(self) -> str:
+ formatted_html = []
+ sorted_paths = sorted(self.file_summaries_map.keys())
+ current_dir = ""
+
+ for path in sorted_paths:
+ dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_html.append('<div class="directory-section">')
+                    formatted_html.append(f'<h2><span class="icon">📁</span> {dir_path}</h2>')
+                    formatted_html.append('</div>')
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_html.append('<div class="file-summary">')
+            formatted_html.append(f'<h3><span class="icon">📄</span> {file_name}</h3>')
+            formatted_html.append(self.md.convert(self.file_summaries_map[path]))
+            formatted_html.append('</div>')
+
+ return "\n".join(formatted_html)
+
+ def create_document(self) -> str:
+ """生成HTML文档
+ Returns:
+ str: 完整的HTML文档字符串
+ """
+ return f"""
+
+
+
+
+
+ 文档总结报告
+
+
+
+
+
+
+
📑 文档总结报告
+
+
📋 总体摘要
+
{self.md.convert(self.final_summary)}
+
+ {self.format_failed_files()}
+
+
📚 各文件详细总结
+ {self.format_file_summaries()}
+
+
+
+
+
+ """
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/__init__.py b/crazy_functions/doc_fns/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crazy_functions/doc_fns/batch_file_query_doc.py b/crazy_functions/doc_fns/batch_file_query_doc.py
new file mode 100644
index 00000000..a2a11e37
--- /dev/null
+++ b/crazy_functions/doc_fns/batch_file_query_doc.py
@@ -0,0 +1,812 @@
+import os
+import time
+from abc import ABC, abstractmethod
+from datetime import datetime
+from docx import Document
+from docx.enum.style import WD_STYLE_TYPE
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
+from docx.oxml.ns import qn
+from docx.shared import Pt, RGBColor, Inches, Cm
+from typing import Dict, List, Tuple
+import markdown
+from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word
+
+
+
+class DocumentFormatter(ABC):
+ """文档格式化基类,定义文档格式化的基本接口"""
+
+ def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]):
+ self.final_summary = final_summary
+ self.file_summaries_map = file_summaries_map
+ self.failed_files = failed_files
+
+ @abstractmethod
+ def format_failed_files(self) -> str:
+ """格式化失败文件列表"""
+ pass
+
+ @abstractmethod
+ def format_file_summaries(self) -> str:
+ """格式化文件总结内容"""
+ pass
+
+ @abstractmethod
+ def create_document(self) -> str:
+ """创建完整文档"""
+ pass
+
+
+class WordFormatter(DocumentFormatter):
+ """Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.doc = Document()
+ self._setup_document()
+ self._create_styles()
+ # 初始化三级标题编号系统
+ self.numbers = {
+ 1: 0, # 一级标题编号
+ 2: 0, # 二级标题编号
+ 3: 0 # 三级标题编号
+ }
+
+ def _setup_document(self):
+ """设置文档基本格式,包括页面设置和页眉"""
+ sections = self.doc.sections
+ for section in sections:
+ # 设置页面大小为A4
+ section.page_width = Cm(21)
+ section.page_height = Cm(29.7)
+ # 设置页边距
+ section.top_margin = Cm(3.7) # 上边距37mm
+ section.bottom_margin = Cm(3.5) # 下边距35mm
+ section.left_margin = Cm(2.8) # 左边距28mm
+ section.right_margin = Cm(2.6) # 右边距26mm
+ # 设置页眉页脚距离
+ section.header_distance = Cm(2.0)
+ section.footer_distance = Cm(2.0)
+
+ # 添加页眉
+ header = section.header
+ header_para = header.paragraphs[0]
+ header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
+ header_run = header_para.add_run("该文档由GPT-academic生成")
+ header_run.font.name = '仿宋'
+ header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ header_run.font.size = Pt(9)
+
+ def _create_styles(self):
+ """创建文档样式"""
+ # 创建正文样式
+ style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
+ style.font.name = '仿宋'
+ style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ style.font.size = Pt(14)
+ style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ style.paragraph_format.space_after = Pt(0)
+ style.paragraph_format.first_line_indent = Pt(28)
+
+ # 创建各级标题样式
+ self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER)
+ self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT)
+ self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT)
+ self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT)
+
+ def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment):
+ """创建标题样式"""
+ style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)
+ style.font.name = font_name
+ style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+ style.font.size = Pt(font_size)
+ style.font.bold = True
+ style.paragraph_format.alignment = alignment
+ style.paragraph_format.space_before = Pt(12)
+ style.paragraph_format.space_after = Pt(12)
+ style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ return style
+
+ def _get_heading_number(self, level: int) -> str:
+ """
+ 生成标题编号
+
+ Args:
+ level: 标题级别 (0-3)
+
+ Returns:
+ str: 格式化的标题编号
+ """
+ if level == 0: # 主标题不需要编号
+ return ""
+
+ self.numbers[level] += 1 # 增加当前级别的编号
+
+ # 重置下级标题编号
+ for i in range(level + 1, 4):
+ self.numbers[i] = 0
+
+ # 根据级别返回不同格式的编号
+ if level == 1:
+ return f"{self.numbers[1]}. "
+ elif level == 2:
+ return f"{self.numbers[1]}.{self.numbers[2]} "
+ elif level == 3:
+ return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} "
+ return ""
+
+ def _add_heading(self, text: str, level: int):
+ """
+ 添加带编号的标题
+
+ Args:
+ text: 标题文本
+ level: 标题级别 (0-3)
+ """
+ style_map = {
+ 0: 'Title_Custom',
+ 1: 'Heading1_Custom',
+ 2: 'Heading2_Custom',
+ 3: 'Heading3_Custom'
+ }
+
+ number = self._get_heading_number(level)
+ paragraph = self.doc.add_paragraph(style=style_map[level])
+
+ if number:
+ number_run = paragraph.add_run(number)
+ font_size = 22 if level == 1 else (18 if level == 2 else 16)
+ self._get_run_style(number_run, '黑体', font_size, True)
+
+        text_run = paragraph.add_run(text)
+        font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16))
+        # 主标题按 Title_Custom 样式使用方正小标宋简体,其余级别使用黑体
+        font_name = '方正小标宋简体' if level == 0 else '黑体'
+        self._get_run_style(text_run, font_name, font_size, True)
+
+ # 主标题添加日期
+ if level == 0:
+ date_paragraph = self.doc.add_paragraph()
+ date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+ date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d日'))
+ self._get_run_style(date_run, '仿宋', 16, False)
+
+ return paragraph
+
+ def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False):
+ """设置文本运行对象的样式"""
+ run.font.name = font_name
+ run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+ run.font.size = Pt(font_size)
+ run.font.bold = bold
+
+ def format_failed_files(self) -> str:
+ """格式化失败文件列表"""
+ result = []
+ if not self.failed_files:
+ return "\n".join(result)
+
+ result.append("处理失败文件:")
+ for fp, reason in self.failed_files:
+ result.append(f"• {os.path.basename(fp)}: {reason}")
+
+ self._add_heading("处理失败文件", 1)
+ for fp, reason in self.failed_files:
+ self._add_content(f"• {os.path.basename(fp)}: {reason}", indent=False)
+ self.doc.add_paragraph()
+
+ return "\n".join(result)
+
+ def _add_content(self, text: str, indent: bool = True):
+ """添加正文内容,使用convert_markdown_to_word处理文本"""
+ # 使用convert_markdown_to_word处理markdown文本
+ processed_text = convert_markdown_to_word(text)
+ paragraph = self.doc.add_paragraph(processed_text, style='Normal_Custom')
+ if not indent:
+ paragraph.paragraph_format.first_line_indent = Pt(0)
+ return paragraph
+
+ def format_file_summaries(self) -> str:
+ """
+ 格式化文件总结内容,确保正确的标题层级并处理markdown文本
+ """
+ result = []
+ # 首先对文件路径进行分组整理
+ file_groups = {}
+ for path in sorted(self.file_summaries_map.keys()):
+ dir_path = os.path.dirname(path)
+ if dir_path not in file_groups:
+ file_groups[dir_path] = []
+ file_groups[dir_path].append(path)
+
+ # 处理没有目录的文件
+ root_files = file_groups.get("", [])
+ if root_files:
+ for path in sorted(root_files):
+ file_name = os.path.basename(path)
+ result.append(f"\n📄 {file_name}")
+ result.append(self.file_summaries_map[path])
+ # 无目录的文件作为二级标题
+ self._add_heading(f"📄 {file_name}", 2)
+                # _add_content 内部已调用 convert_markdown_to_word,直接传入原文,避免二次转换
+                self._add_content(self.file_summaries_map[path])
+ self.doc.add_paragraph()
+
+ # 处理有目录的文件
+ for dir_path in sorted(file_groups.keys()):
+ if dir_path == "": # 跳过已处理的根目录文件
+ continue
+
+ # 添加目录作为二级标题
+ result.append(f"\n📁 {dir_path}")
+ self._add_heading(f"📁 {dir_path}", 2)
+
+ # 该目录下的所有文件作为三级标题
+ for path in sorted(file_groups[dir_path]):
+ file_name = os.path.basename(path)
+ result.append(f"\n📄 {file_name}")
+ result.append(self.file_summaries_map[path])
+
+ # 添加文件名作为三级标题
+ self._add_heading(f"📄 {file_name}", 3)
+                # _add_content 内部已调用 convert_markdown_to_word,直接传入原文,避免二次转换
+                self._add_content(self.file_summaries_map[path])
+ self.doc.add_paragraph()
+
+ return "\n".join(result)
+
+
+ def create_document(self):
+ """创建完整Word文档并返回文档对象"""
+ # 重置所有编号
+ for level in self.numbers:
+ self.numbers[level] = 0
+
+ # 添加主标题
+ self._add_heading("文档总结报告", 0)
+ self.doc.add_paragraph()
+
+        # 添加总体摘要(_add_content 内部会调用 convert_markdown_to_word,无需重复转换)
+        self._add_heading("总体摘要", 1)
+        self._add_content(self.final_summary)
+ self.doc.add_paragraph()
+
+ # 添加失败文件列表(如果有)
+ if self.failed_files:
+ self.format_failed_files()
+
+ # 添加文件详细总结
+ self._add_heading("各文件详细总结", 1)
+ self.format_file_summaries()
+
+ return self.doc
+
+ def save_as_pdf(self, word_path, pdf_path=None):
+ """将生成的Word文档转换为PDF
+
+ 参数:
+ word_path: Word文档的路径
+ pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置
+
+ 返回:
+ 生成的PDF文件路径,如果转换失败则返回None
+ """
+ from crazy_functions.doc_fns.conversation_doc.word2pdf import WordToPdfConverter
+ try:
+ pdf_path = WordToPdfConverter.convert_to_pdf(word_path, pdf_path)
+ return pdf_path
+ except Exception as e:
+ print(f"PDF转换失败: {str(e)}")
+ return None
+
+
+class MarkdownFormatter(DocumentFormatter):
+ """Markdown格式文档生成器"""
+
+ def format_failed_files(self) -> str:
+ if not self.failed_files:
+ return ""
+
+ formatted_text = ["\n## ⚠️ 处理失败的文件"]
+ for fp, reason in self.failed_files:
+ formatted_text.append(f"- {os.path.basename(fp)}: {reason}")
+ formatted_text.append("\n---")
+ return "\n".join(formatted_text)
+
+ def format_file_summaries(self) -> str:
+ formatted_text = []
+ sorted_paths = sorted(self.file_summaries_map.keys())
+ current_dir = ""
+
+ for path in sorted_paths:
+ dir_path = os.path.dirname(path)
+ if dir_path != current_dir:
+ if dir_path:
+ formatted_text.append(f"\n## 📁 {dir_path}")
+ current_dir = dir_path
+
+ file_name = os.path.basename(path)
+ formatted_text.append(f"\n### 📄 {file_name}")
+ formatted_text.append(self.file_summaries_map[path])
+ formatted_text.append("\n---")
+
+ return "\n".join(formatted_text)
+
+ def create_document(self) -> str:
+ document = [
+ "# 📑 文档总结报告",
+ "\n## 总体摘要",
+ self.final_summary
+ ]
+
+ if self.failed_files:
+ document.append(self.format_failed_files())
+
+ document.extend([
+ "\n# 📚 各文件详细总结",
+ self.format_file_summaries()
+ ])
+
+ return "\n".join(document)
+
+
+
+class HtmlFormatter(DocumentFormatter):
+ """HTML格式文档生成器 - 优化版"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.md = markdown.Markdown(extensions=['extra','codehilite', 'tables','nl2br'])
+ self.css_styles = """
+ @keyframes fadeIn {
+ from { opacity: 0; transform: translateY(20px); }
+ to { opacity: 1; transform: translateY(0); }
+ }
+
+ @keyframes slideIn {
+ from { transform: translateX(-20px); opacity: 0; }
+ to { transform: translateX(0); opacity: 1; }
+ }
+
+ @keyframes pulse {
+ 0% { transform: scale(1); }
+ 50% { transform: scale(1.05); }
+ 100% { transform: scale(1); }
+ }
+
+ :root {
+ /* Enhanced color palette */
+ --primary-color: #2563eb;
+ --primary-light: #eff6ff;
+ --secondary-color: #1e293b;
+ --background-color: #f8fafc;
+ --text-color: #334155;
+ --text-light: #64748b;
+ --border-color: #e2e8f0;
+ --error-color: #ef4444;
+ --error-light: #fef2f2;
+ --success-color: #22c55e;
+ --warning-color: #f59e0b;
+ --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
+ --hover-shadow: 0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1);
+
+ /* Typography */
+ --heading-font: "Plus Jakarta Sans", system-ui, sans-serif;
+ --body-font: "Inter", system-ui, sans-serif;
+ }
+
+ body {
+ font-family: var(--body-font);
+ line-height: 1.8;
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 2rem;
+ color: var(--text-color);
+ background-color: var(--background-color);
+ font-size: 16px;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ .container {
+ background: white;
+ padding: 3rem;
+ border-radius: 24px;
+ box-shadow: var(--card-shadow);
+ transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+ animation: fadeIn 0.6s ease-out;
+ border: 1px solid var(--border-color);
+ }
+
+ .container:hover {
+ box-shadow: var(--hover-shadow);
+ transform: translateY(-2px);
+ }
+
+ h1, h2, h3 {
+ font-family: var(--heading-font);
+ font-weight: 600;
+ }
+
+ h1 {
+ color: var(--primary-color);
+ font-size: 2.8em;
+ text-align: center;
+ margin: 2rem 0 3rem;
+ padding-bottom: 1.5rem;
+ border-bottom: 3px solid var(--primary-color);
+ letter-spacing: -0.03em;
+ position: relative;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ gap: 1rem;
+ }
+
+ h1::after {
+ content: '';
+ position: absolute;
+ bottom: -3px;
+ left: 50%;
+ transform: translateX(-50%);
+ width: 120px;
+ height: 3px;
+ background: linear-gradient(90deg, var(--primary-color), var(--primary-light));
+ border-radius: 3px;
+ transition: width 0.3s ease;
+ }
+
+ h1:hover::after {
+ width: 180px;
+ }
+
+ h2 {
+ color: var(--secondary-color);
+ font-size: 1.9em;
+ margin: 2.5rem 0 1.5rem;
+ padding-left: 1.2rem;
+ border-left: 4px solid var(--primary-color);
+ letter-spacing: -0.02em;
+ display: flex;
+ align-items: center;
+ gap: 1rem;
+ transition: all 0.3s ease;
+ }
+
+ h2:hover {
+ color: var(--primary-color);
+ transform: translateX(5px);
+ }
+
+ h3 {
+ color: var(--text-color);
+ font-size: 1.5em;
+ margin: 2rem 0 1rem;
+ padding-bottom: 0.8rem;
+ border-bottom: 2px solid var(--border-color);
+ transition: all 0.3s ease;
+ display: flex;
+ align-items: center;
+ gap: 0.8rem;
+ }
+
+ h3:hover {
+ color: var(--primary-color);
+ border-bottom-color: var(--primary-color);
+ }
+
+ .summary {
+ background: var(--primary-light);
+ padding: 2.5rem;
+ border-radius: 16px;
+ margin: 2.5rem 0;
+ box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.1);
+ position: relative;
+ overflow: hidden;
+ transition: transform 0.3s ease, box-shadow 0.3s ease;
+ animation: slideIn 0.5s ease-out;
+ }
+
+ .summary:hover {
+ transform: translateY(-3px);
+ box-shadow: 0 8px 12px -2px rgba(37, 99, 235, 0.15);
+ }
+
+ .summary::before {
+ content: '';
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 4px;
+ height: 100%;
+ background: linear-gradient(to bottom, var(--primary-color), rgba(37, 99, 235, 0.6));
+ }
+
+ .summary p {
+ margin: 1.2rem 0;
+ line-height: 1.9;
+ color: var(--text-color);
+ transition: color 0.3s ease;
+ }
+
+ .summary:hover p {
+ color: var(--secondary-color);
+ }
+
+ .details {
+ margin-top: 3.5rem;
+ padding-top: 2.5rem;
+ border-top: 2px dashed var(--border-color);
+ animation: fadeIn 0.8s ease-out;
+ }
+
+ .failed-files {
+ background: var(--error-light);
+ padding: 2rem;
+ border-radius: 16px;
+ margin: 3rem 0;
+ border-left: 4px solid var(--error-color);
+ position: relative;
+ transition: all 0.3s ease;
+ animation: slideIn 0.5s ease-out;
+ }
+
+ .failed-files:hover {
+ transform: translateX(5px);
+ box-shadow: 0 8px 15px -3px rgba(239, 68, 68, 0.1);
+ }
+
+ .failed-files h2 {
+ color: var(--error-color);
+ border-left: none;
+ padding-left: 0;
+ }
+
+ .failed-files ul {
+ margin: 1.8rem 0;
+ padding-left: 1.2rem;
+ list-style-type: none;
+ }
+
+ .failed-files li {
+ margin: 1.2rem 0;
+ padding: 1.2rem 1.8rem;
+ background: rgba(239, 68, 68, 0.08);
+ border-radius: 12px;
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+ }
+
+ .failed-files li:hover {
+ transform: translateX(8px);
+ background: rgba(239, 68, 68, 0.12);
+ }
+
+ .directory-section {
+ margin: 3.5rem 0;
+ padding: 2rem;
+ background: var(--background-color);
+ border-radius: 16px;
+ position: relative;
+ transition: all 0.3s ease;
+ animation: fadeIn 0.6s ease-out;
+ }
+
+ .directory-section:hover {
+ background: white;
+ box-shadow: var(--card-shadow);
+ }
+
+ .file-summary {
+ background: white;
+ padding: 2rem;
+ margin: 1.8rem 0;
+ border-radius: 16px;
+ box-shadow: var(--card-shadow);
+ border-left: 4px solid var(--border-color);
+ transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+ position: relative;
+ overflow: hidden;
+ }
+
+ .file-summary:hover {
+ border-left-color: var(--primary-color);
+ transform: translateX(8px) translateY(-2px);
+ box-shadow: var(--hover-shadow);
+ }
+
+ .icon {
+ display: inline-flex;
+ align-items: center;
+ justify-content: center;
+ width: 32px;
+ height: 32px;
+ border-radius: 8px;
+ background: var(--primary-light);
+ color: var(--primary-color);
+ font-size: 1.2em;
+ transition: all 0.3s ease;
+ }
+
+ .file-summary:hover .icon,
+ .directory-section:hover .icon {
+ transform: scale(1.1);
+ background: var(--primary-color);
+ color: white;
+ }
+
+ /* Smooth scrolling */
+ html {
+ scroll-behavior: smooth;
+ }
+
+ /* Selection style */
+ ::selection {
+ background: var(--primary-light);
+ color: var(--primary-color);
+ }
+
+ /* Print styles */
+ @media print {
+ body {
+ background: white;
+ }
+ .container {
+ box-shadow: none;
+ padding: 0;
+ }
+ .file-summary, .failed-files {
+ break-inside: avoid;
+ box-shadow: none;
+ }
+ .icon {
+ display: none;
+ }
+ }
+
+ /* Responsive design */
+ @media (max-width: 768px) {
+ body {
+ padding: 1rem;
+ font-size: 15px;
+ }
+
+ .container {
+ padding: 1.5rem;
+ }
+
+ h1 {
+ font-size: 2.2em;
+ margin: 1.5rem 0 2rem;
+ }
+
+ h2 {
+ font-size: 1.7em;
+ }
+
+ h3 {
+ font-size: 1.4em;
+ }
+
+ .summary, .failed-files, .directory-section {
+ padding: 1.5rem;
+ }
+
+ .file-summary {
+ padding: 1.2rem;
+ }
+
+ .icon {
+ width: 28px;
+ height: 28px;
+ }
+ }
+
+ /* Dark mode support */
+ @media (prefers-color-scheme: dark) {
+ :root {
+ --primary-light: rgba(37, 99, 235, 0.15);
+ --background-color: #0f172a;
+ --text-color: #e2e8f0;
+ --text-light: #94a3b8;
+ --border-color: #1e293b;
+ --error-light: rgba(239, 68, 68, 0.15);
+ }
+
+ .container, .file-summary {
+ background: #1e293b;
+ }
+
+ .directory-section {
+ background: #0f172a;
+ }
+
+ .directory-section:hover {
+ background: #1e293b;
+ }
+ }
+ """
+
+ def format_failed_files(self) -> str:
+ if not self.failed_files:
+ return ""
+
+        failed_files_html = ['<div class="failed-files">']
+        failed_files_html.append('<h2><span class="icon">⚠️</span> 处理失败的文件</h2>')
+        failed_files_html.append("<ul>")
+        for fp, reason in self.failed_files:
+            failed_files_html.append(
+                f'<li>📄 <strong>{os.path.basename(fp)}</strong><br>{reason}</li>'
+            )
+        failed_files_html.append("</ul></div>")
+        return "\n".join(failed_files_html)
+
+ def format_file_summaries(self) -> str:
+ formatted_html = []
+ sorted_paths = sorted(self.file_summaries_map.keys())
+ current_dir = ""
+
+ for path in sorted_paths:
+ dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_html.append('<div class="directory-section">')
+                    formatted_html.append(f'<h2><span class="icon">📁</span> {dir_path}</h2>')
+                    formatted_html.append('</div>')
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_html.append('<div class="file-summary">')
+            formatted_html.append(f'<h3><span class="icon">📄</span> {file_name}</h3>')
+            formatted_html.append(self.md.convert(self.file_summaries_map[path]))
+            formatted_html.append('</div>')
+
+ return "\n".join(formatted_html)
+
+ def create_document(self) -> str:
+ """生成HTML文档
+ Returns:
+ str: 完整的HTML文档字符串
+ """
+ return f"""
+
+
+
+
+
+ 文档总结报告
+
+
+
+
+
+
+
📑 文档总结报告
+
+
📋 总体摘要
+
{self.md.convert(self.final_summary)}
+
+ {self.format_failed_files()}
+
+
📚 各文件详细总结
+ {self.format_file_summaries()}
+
+
+
+
+
+ """
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/content_folder.py b/crazy_functions/doc_fns/content_folder.py
new file mode 100644
index 00000000..26f991c7
--- /dev/null
+++ b/crazy_functions/doc_fns/content_folder.py
@@ -0,0 +1,237 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, TypeVar, Generic
+
+from dataclasses import dataclass
+from enum import Enum, auto
+import logging
+from datetime import datetime
+
+# 设置日志
+logger = logging.getLogger(__name__)
+
+
+# 自定义异常类定义
+class FoldingError(Exception):
+ """折叠相关的自定义异常基类"""
+ pass
+
+
+class FormattingError(FoldingError):
+ """格式化过程中的错误"""
+ pass
+
+
+class MetadataError(FoldingError):
+ """元数据相关的错误"""
+ pass
+
+
+class ValidationError(FoldingError):
+ """验证错误"""
+ pass
+
+
+class FoldingStyle(Enum):
+ """折叠样式枚举"""
+ SIMPLE = auto() # 简单折叠
+ DETAILED = auto() # 详细折叠(带有额外信息)
+ NESTED = auto() # 嵌套折叠
+
+
+@dataclass
+class FoldingOptions:
+ """折叠选项配置"""
+ style: FoldingStyle = FoldingStyle.DETAILED
+ code_language: Optional[str] = None # 代码块的语言
+ show_timestamp: bool = False # 是否显示时间戳
+ indent_level: int = 0 # 缩进级别
+ custom_css: Optional[str] = None # 自定义CSS类
+
+
+T = TypeVar('T') # 用于泛型类型
+
+
+class BaseMetadata(ABC):
+ """元数据基类"""
+
+ @abstractmethod
+ def validate(self) -> bool:
+ """验证元数据的有效性"""
+ pass
+
+ def _validate_non_empty_str(self, value: Optional[str]) -> bool:
+ """验证字符串非空"""
+ return bool(value and value.strip())
+
+
+@dataclass
+class FileMetadata(BaseMetadata):
+ """文件元数据"""
+ rel_path: str
+ size: float
+ last_modified: Optional[datetime] = None
+ mime_type: Optional[str] = None
+ encoding: str = 'utf-8'
+
+ def validate(self) -> bool:
+ """验证文件元数据的有效性"""
+ try:
+ if not self._validate_non_empty_str(self.rel_path):
+ return False
+ if self.size < 0:
+ return False
+ return True
+ except Exception as e:
+ logger.error(f"File metadata validation error: {str(e)}")
+ return False
+
+
+
+
+class ContentFormatter(ABC, Generic[T]):
+ """内容格式化抽象基类
+
+ 支持泛型类型参数,可以指定具体的元数据类型。
+ """
+
+ @abstractmethod
+ def format(self,
+ content: str,
+ metadata: T,
+ options: Optional[FoldingOptions] = None) -> str:
+ """格式化内容
+
+ Args:
+ content: 需要格式化的内容
+ metadata: 类型化的元数据
+ options: 折叠选项
+
+ Returns:
+ str: 格式化后的内容
+
+ Raises:
+ FormattingError: 格式化过程中的错误
+ """
+ pass
+
+ def _create_summary(self, metadata: T) -> str:
+ """创建折叠摘要,可被子类重写"""
+ return str(metadata)
+
+ def _format_content_block(self,
+ content: str,
+ options: Optional[FoldingOptions]) -> str:
+ """格式化内容块,处理代码块等特殊格式"""
+ if not options:
+ return content
+
+ if options.code_language:
+ return f"```{options.code_language}\n{content}\n```"
+ return content
+
+ def _add_indent(self, text: str, level: int) -> str:
+ """添加缩进"""
+ if level <= 0:
+ return text
+ indent = " " * level
+ return "\n".join(indent + line for line in text.splitlines())
+
+
+class FileContentFormatter(ContentFormatter[FileMetadata]):
+ """文件内容格式化器"""
+
+ def format(self,
+ content: str,
+ metadata: FileMetadata,
+ options: Optional[FoldingOptions] = None) -> str:
+ """格式化文件内容"""
+ if not metadata.validate():
+ raise MetadataError("Invalid file metadata")
+
+ try:
+ options = options or FoldingOptions()
+
+ # 构建摘要信息
+ summary_parts = [
+ f"{metadata.rel_path} ({metadata.size:.2f}MB)",
+ f"Type: {metadata.mime_type}" if metadata.mime_type else None,
+ (f"Modified: {metadata.last_modified.strftime('%Y-%m-%d %H:%M:%S')}"
+ if metadata.last_modified and options.show_timestamp else None)
+ ]
+ summary = " | ".join(filter(None, summary_parts))
+
+ # 构建HTML类
+ css_class = f' class="{options.custom_css}"' if options.custom_css else ''
+
+ # 格式化内容
+ formatted_content = self._format_content_block(content, options)
+
+ # 组装最终结果
+            result = (
+                f'<details{css_class}><summary>{summary}</summary>\n\n'
+                f'{formatted_content}\n\n'
+                f'</details>\n\n'
+            )
+
+ return self._add_indent(result, options.indent_level)
+
+ except Exception as e:
+ logger.error(f"Error formatting file content: {str(e)}")
+ raise FormattingError(f"Failed to format file content: {str(e)}")
+
+
+class ContentFoldingManager:
+ """内容折叠管理器"""
+
+ def __init__(self):
+ """初始化折叠管理器"""
+ self._formatters: Dict[str, ContentFormatter] = {}
+ self._register_default_formatters()
+
+ def _register_default_formatters(self) -> None:
+ """注册默认的格式化器"""
+ self.register_formatter('file', FileContentFormatter())
+
+ def register_formatter(self, name: str, formatter: ContentFormatter) -> None:
+ """注册新的格式化器"""
+ if not isinstance(formatter, ContentFormatter):
+ raise TypeError("Formatter must implement ContentFormatter interface")
+ self._formatters[name] = formatter
+
+ def _guess_language(self, extension: str) -> Optional[str]:
+ """根据文件扩展名猜测编程语言"""
+ extension = extension.lower().lstrip('.')
+ language_map = {
+ 'py': 'python',
+ 'js': 'javascript',
+ 'java': 'java',
+ 'cpp': 'cpp',
+ 'cs': 'csharp',
+ 'html': 'html',
+ 'css': 'css',
+ 'md': 'markdown',
+ 'json': 'json',
+ 'xml': 'xml',
+ 'sql': 'sql',
+ 'sh': 'bash',
+ 'yaml': 'yaml',
+ 'yml': 'yaml',
+ 'txt': None # 纯文本不需要语言标识
+ }
+ return language_map.get(extension)
+
+    def format_content(self,
+                       content: str,
+                       formatter_type: str,
+                       metadata: FileMetadata,
+                       options: Optional[FoldingOptions] = None) -> str:
+ """格式化内容"""
+ formatter = self._formatters.get(formatter_type)
+ if not formatter:
+ raise KeyError(f"No formatter registered for type: {formatter_type}")
+
+ if not isinstance(metadata, FileMetadata):
+ raise TypeError("Invalid metadata type")
+
+ return formatter.format(content, metadata, options)
+
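+# 用法示意(仅作说明,路径与文件大小为假设值):
+#   manager = ContentFoldingManager()
+#   meta = FileMetadata(rel_path="src/app.py", size=0.02, mime_type="text/x-python")
+#   opts = FoldingOptions(code_language=manager._guess_language(".py"))
+#   folded = manager.format_content(source_code, "file", meta, opts)
+#   # folded 为 <details>/<summary> 折叠块,代码包裹在 ```python 围栏中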
diff --git a/crazy_functions/doc_fns/conversation_doc/excel_doc.py b/crazy_functions/doc_fns/conversation_doc/excel_doc.py
new file mode 100644
index 00000000..ee19e162
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/excel_doc.py
@@ -0,0 +1,211 @@
+import re
+import os
+from datetime import datetime
+from openpyxl import Workbook
+
+
+class ExcelTableFormatter:
+ """聊天记录中Markdown表格转Excel生成器"""
+
+ def __init__(self):
+ """初始化Excel文档对象"""
+ self.workbook = Workbook()
+ self._table_count = 0
+ self._current_sheet = None
+
+ def _normalize_table_row(self, row):
+ """标准化表格行,处理不同的分隔符情况"""
+ row = row.strip()
+ if row.startswith('|'):
+ row = row[1:]
+ if row.endswith('|'):
+ row = row[:-1]
+ return [cell.strip() for cell in row.split('|')]
+
+ def _is_separator_row(self, row):
+ """检查是否是分隔行(由 - 或 : 组成)"""
+ clean_row = re.sub(r'[\s|]', '', row)
+ return bool(re.match(r'^[-:]+$', clean_row))
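+        # 例如 "|---|:---:|" 去除空白和竖线后为 "---:---:",匹配成功;数据行则不会匹配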
+
+ def _extract_tables_from_text(self, text):
+ """从文本中提取所有表格内容"""
+ if not isinstance(text, str):
+ return []
+
+ tables = []
+ current_table = []
+ is_in_table = False
+
+ for line in text.split('\n'):
+ line = line.strip()
+ if not line:
+ if is_in_table and current_table:
+ if len(current_table) >= 2:
+ tables.append(current_table)
+ current_table = []
+ is_in_table = False
+ continue
+
+ if '|' in line:
+ if not is_in_table:
+ is_in_table = True
+ current_table.append(line)
+ else:
+ if is_in_table and current_table:
+ if len(current_table) >= 2:
+ tables.append(current_table)
+ current_table = []
+ is_in_table = False
+
+ if is_in_table and current_table and len(current_table) >= 2:
+ tables.append(current_table)
+
+ return tables
+
+ def _parse_table(self, table_lines):
+ """解析表格内容为结构化数据"""
+ try:
+ headers = self._normalize_table_row(table_lines[0])
+
+ separator_index = next(
+ (i for i, line in enumerate(table_lines) if self._is_separator_row(line)),
+ 1
+ )
+
+ data_rows = []
+ for line in table_lines[separator_index + 1:]:
+ cells = self._normalize_table_row(line)
+ # 确保单元格数量与表头一致
+ while len(cells) < len(headers):
+ cells.append('')
+ cells = cells[:len(headers)]
+ data_rows.append(cells)
+
+ if headers and data_rows:
+ return {
+ 'headers': headers,
+ 'data': data_rows
+ }
+ except Exception as e:
+ print(f"解析表格时发生错误: {str(e)}")
+
+ return None
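+    # 例如,["| A | B |", "|---|---|", "| 1 | 2 |"] 会被解析为
+    # {'headers': ['A', 'B'], 'data': [['1', '2']]}(假设输入)。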
+
+ def _create_sheet(self, question_num, table_num):
+ """创建新的工作表"""
+ sheet_name = f'Q{question_num}_T{table_num}'
+ if len(sheet_name) > 31:
+ sheet_name = f'Table{self._table_count}'
+
+ if sheet_name in self.workbook.sheetnames:
+ sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}'
+
+ return self.workbook.create_sheet(title=sheet_name)
+
+ def create_document(self, history):
+ """
+ 处理聊天历史中的所有表格并创建Excel文档
+
+ Args:
+ history: 聊天历史列表
+
+ Returns:
+ Workbook: 处理完成的Excel工作簿对象,如果没有表格则返回None
+ """
+ has_tables = False
+
+ # 删除默认创建的工作表
+ default_sheet = self.workbook['Sheet']
+ self.workbook.remove(default_sheet)
+
+ # 遍历所有回答
+ for i in range(1, len(history), 2):
+ answer = history[i]
+ tables = self._extract_tables_from_text(answer)
+
+ for table_lines in tables:
+ parsed_table = self._parse_table(table_lines)
+ if parsed_table:
+ self._table_count += 1
+ sheet = self._create_sheet(i // 2 + 1, self._table_count)
+
+ # 写入表头
+ for col, header in enumerate(parsed_table['headers'], 1):
+ sheet.cell(row=1, column=col, value=header)
+
+ # 写入数据
+ for row_idx, row_data in enumerate(parsed_table['data'], 2):
+ for col_idx, value in enumerate(row_data, 1):
+ sheet.cell(row=row_idx, column=col_idx, value=value)
+
+ has_tables = True
+
+ return self.workbook if has_tables else None
+
+
+def save_chat_tables(history, save_dir, base_name):
+ """
+ 保存聊天历史中的表格到Excel文件
+
+ Args:
+ history: 聊天历史列表
+ save_dir: 保存目录
+ base_name: 基础文件名
+
+ Returns:
+ list: 保存的文件路径列表
+ """
+ result_files = []
+
+ try:
+ # 创建Excel格式
+ excel_formatter = ExcelTableFormatter()
+ workbook = excel_formatter.create_document(history)
+
+ if workbook is not None:
+ # 确保保存目录存在
+ os.makedirs(save_dir, exist_ok=True)
+
+ # 生成Excel文件路径
+ excel_file = os.path.join(save_dir, base_name + '.xlsx')
+
+ # 保存Excel文件
+ workbook.save(excel_file)
+ result_files.append(excel_file)
+ print(f"已保存表格到Excel文件: {excel_file}")
+ except Exception as e:
+ print(f"保存Excel格式失败: {str(e)}")
+
+ return result_files
+
+
+# 使用示例
+if __name__ == "__main__":
+ # 示例聊天历史
+ history = [
+ "问题1",
+ """这是第一个表格:
+ | A | B | C |
+ |---|---|---|
+ | 1 | 2 | 3 |""",
+
+ "问题2",
+ "这是没有表格的回答",
+
+ "问题3",
+ """回答包含多个表格:
+ | Name | Age |
+ |------|-----|
+ | Tom | 20 |
+
+ 第二个表格:
+ | X | Y |
+ |---|---|
+ | 1 | 2 |"""
+ ]
+
+ # 保存表格
+ save_dir = "output"
+ base_name = "chat_tables"
+ saved_files = save_chat_tables(history, save_dir, base_name)
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/conversation_doc/html_doc.py b/crazy_functions/doc_fns/conversation_doc/html_doc.py
new file mode 100644
index 00000000..49e8becb
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/html_doc.py
@@ -0,0 +1,190 @@
+
+
+class HtmlFormatter:
+ """聊天记录HTML格式生成器"""
+
+ def __init__(self, chatbot, history):
+ self.chatbot = chatbot
+ self.history = history
+ self.css_styles = """
+ :root {
+ --primary-color: #2563eb;
+ --primary-light: #eff6ff;
+ --secondary-color: #1e293b;
+ --background-color: #f8fafc;
+ --text-color: #334155;
+ --border-color: #e2e8f0;
+ --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
+ }
+
+ body {
+ font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ line-height: 1.8;
+ margin: 0;
+ padding: 2rem;
+ color: var(--text-color);
+ background-color: var(--background-color);
+ }
+
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ background: white;
+ padding: 2rem;
+ border-radius: 16px;
+ box-shadow: var(--card-shadow);
+ }
+ ::selection {
+ background: var(--primary-light);
+ color: var(--primary-color);
+ }
+ @keyframes fadeIn {
+ from { opacity: 0; transform: translateY(20px); }
+ to { opacity: 1; transform: translateY(0); }
+ }
+
+ @keyframes slideIn {
+ from { transform: translateX(-20px); opacity: 0; }
+ to { transform: translateX(0); opacity: 1; }
+ }
+
+ .container {
+ animation: fadeIn 0.6s ease-out;
+ }
+
+ .QaBox {
+ animation: slideIn 0.5s ease-out;
+ transition: all 0.3s ease;
+ }
+
+ .QaBox:hover {
+ transform: translateX(5px);
+ }
+ .Question, .Answer, .historyBox {
+ transition: all 0.3s ease;
+ }
+ .chat-title {
+ color: var(--primary-color);
+ font-size: 2em;
+ text-align: center;
+ margin: 1rem 0 2rem;
+ padding-bottom: 1rem;
+ border-bottom: 2px solid var(--primary-color);
+ }
+
+ .chat-body {
+ display: flex;
+ flex-direction: column;
+ gap: 1.5rem;
+ margin: 2rem 0;
+ }
+
+ .QaBox {
+ background: white;
+ padding: 1.5rem;
+ border-radius: 8px;
+ border-left: 4px solid var(--primary-color);
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+ margin-bottom: 1.5rem;
+ }
+
+ .Question {
+ color: var(--secondary-color);
+ font-weight: 500;
+ margin-bottom: 1rem;
+ }
+
+ .Answer {
+ color: var(--text-color);
+ background: var(--primary-light);
+ padding: 1rem;
+ border-radius: 6px;
+ }
+
+ .history-section {
+ margin-top: 3rem;
+ padding-top: 2rem;
+ border-top: 2px solid var(--border-color);
+ }
+
+ .history-title {
+ color: var(--secondary-color);
+ font-size: 1.5em;
+ margin-bottom: 1.5rem;
+ text-align: center;
+ }
+
+ .historyBox {
+ background: white;
+ padding: 1rem;
+ margin: 0.5rem 0;
+ border-radius: 6px;
+ border: 1px solid var(--border-color);
+ }
+
+ @media (prefers-color-scheme: dark) {
+ :root {
+ --background-color: #0f172a;
+ --text-color: #e2e8f0;
+ --border-color: #1e293b;
+ }
+
+ .container, .QaBox {
+ background: #1e293b;
+ }
+ }
+ """
+
+ def format_chat_content(self) -> str:
+ """格式化聊天内容"""
+ chat_content = []
+ for q, a in self.chatbot:
+ question = str(q) if q is not None else ""
+ answer = str(a) if a is not None else ""
+            chat_content.append(f'''
+            <div class="QaBox">
+                <div class="Question">{question}</div>
+                <div class="Answer">{answer}</div>
+            </div>
+            ''')
+ return "\n".join(chat_content)
+
+ def format_history_content(self) -> str:
+ """格式化历史记录内容"""
+ if not self.history:
+ return ""
+
+ history_content = []
+ for entry in self.history:
+            history_content.append(f'''
+            <div class="historyBox">{entry}</div>
+            ''')
+ return "\n".join(history_content)
+
+ def create_document(self) -> str:
+ """生成完整的HTML文档
+
+ Returns:
+ str: 完整的HTML文档字符串
+ """
+ return f"""
+
+
+
+
+
+ 对话存档
+
+
+
+
+
对话存档
+
+ {self.format_chat_content()}
+
+
+
+
+ """
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/conversation_doc/markdown_doc.py b/crazy_functions/doc_fns/conversation_doc/markdown_doc.py
new file mode 100644
index 00000000..15441073
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/markdown_doc.py
@@ -0,0 +1,39 @@
+
+class MarkdownFormatter:
+ """Markdown格式文档生成器 - 用于生成对话记录的markdown文档"""
+
+ def __init__(self):
+ self.content = []
+
+ def _add_content(self, text: str):
+ """添加正文内容"""
+ if text:
+ self.content.append(f"\n{text}\n")
+
+ def create_document(self, history: list) -> str:
+ """
+ 创建完整的Markdown文档
+ Args:
+ history: 历史记录列表,偶数位置为问题,奇数位置为答案
+ Returns:
+ str: 生成的Markdown文本
+ """
+ self.content = []
+
+ # 处理问答对
+        for i in range(0, len(history), 2):
+            question = history[i]
+            # 防止 history 长度为奇数时越界
+            answer = history[i + 1] if i + 1 < len(history) else ""
+
+ # 添加问题
+ self.content.append(f"\n### 问题 {i//2 + 1}")
+ self._add_content(question)
+
+ # 添加回答
+ self.content.append(f"\n### 回答 {i//2 + 1}")
+ self._add_content(answer)
+
+ # 添加分隔线
+ self.content.append("\n---\n")
+
+ return "\n".join(self.content)
diff --git a/crazy_functions/doc_fns/conversation_doc/pdf_doc.py b/crazy_functions/doc_fns/conversation_doc/pdf_doc.py
new file mode 100644
index 00000000..2b7d15c6
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/pdf_doc.py
@@ -0,0 +1,172 @@
+from datetime import datetime
+import os
+import re
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+
+def convert_markdown_to_pdf(markdown_text):
+ """将Markdown文本转换为PDF格式的纯文本"""
+ if not markdown_text:
+ return ""
+
+ # 标准化换行符
+ markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')
+
+ # 处理标题、粗体、斜体
+    markdown_text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', markdown_text, flags=re.MULTILINE)
+ markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text)
+ markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text)
+
+ # 处理列表
+ markdown_text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', markdown_text, flags=re.MULTILINE)
+ markdown_text = re.sub(r'^\s*\d+\.\s+(.+?)(?=\n|$)', r'\1', markdown_text, flags=re.MULTILINE)
+
+ # 处理链接
+ markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', markdown_text)
+
+ # 处理段落
+ markdown_text = re.sub(r'\n{2,}', '\n', markdown_text)
+ markdown_text = re.sub(r'(?([^<]+)', r'\2 (\1)',
+ markdown_text)
+
+ # 6. Preserve paragraph breaks
+ markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text) # normalize multiple newlines to double newlines
+
+ # 7. Clean up extra spaces but maintain indentation
+ markdown_text = re.sub(r' +', ' ', markdown_text)
+
+ return markdown_text.strip()
+
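+# 例如,"# 标题\n**加粗** [链接](http://a.b)" 经上述替换后变为
+# "标题\n加粗 链接"(假设输入)。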
+
+class TxtFormatter:
+ """Chat history TXT document generator"""
+
+ def __init__(self):
+ self.content = []
+ self._setup_document()
+
+ def _setup_document(self):
+ """Initialize document with header"""
+ self.content.append("=" * 50)
+ self.content.append("GPT-Academic对话记录".center(48))
+ self.content.append("=" * 50)
+
+ def _format_header(self):
+ """Create document header with current date"""
+ from datetime import datetime
+ date_str = datetime.now().strftime('%Y年%m月%d日')
+ return [
+ date_str.center(48),
+ "\n" # Add blank line after date
+ ]
+
+ def create_document(self, history):
+ """Generate document from chat history"""
+ # Add header with date
+ self.content.extend(self._format_header())
+
+ # Add conversation content
+ for i in range(0, len(history), 2):
+ question = history[i]
+ answer = convert_markdown_to_txt(history[i + 1]) if i + 1 < len(history) else ""
+
+ if question:
+ self.content.append(f"问题 {i // 2 + 1}:{str(question)}")
+ self.content.append("") # Add blank line
+
+ if answer:
+ self.content.append(f"回答 {i // 2 + 1}:{str(answer)}")
+ self.content.append("") # Add blank line
+
+ # Join all content with newlines
+ return "\n".join(self.content)
diff --git a/crazy_functions/doc_fns/conversation_doc/word2pdf.py b/crazy_functions/doc_fns/conversation_doc/word2pdf.py
new file mode 100644
index 00000000..253ecd25
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/word2pdf.py
@@ -0,0 +1,155 @@
+from docx2pdf import convert
+import os
+import platform
+import subprocess
+from typing import Union
+from pathlib import Path
+from datetime import datetime
+
+class WordToPdfConverter:
+ """Word文档转PDF转换器"""
+
+ @staticmethod
+ def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
+ """
+ 将Word文档转换为PDF
+
+ 参数:
+ word_path: Word文档的路径
+ pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置
+
+ 返回:
+ 生成的PDF文件路径
+
+ 异常:
+ 如果转换失败,将抛出相应异常
+ """
+ try:
+ # 确保输入路径是Path对象
+ word_path = Path(word_path)
+
+ # 如果未指定pdf_path,则使用与word文档相同的名称
+ if pdf_path is None:
+ pdf_path = word_path.with_suffix('.pdf')
+ else:
+ pdf_path = Path(pdf_path)
+
+ # 检查操作系统
+ if platform.system() == 'Linux':
+ # Linux系统需要安装libreoffice
+ which_result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True)
+ if which_result.returncode != 0:
+ raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")
+
+ print(f"开始转换Word文档: {word_path} 到 PDF")
+
+ # 使用subprocess代替os.system
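+                # --headless 无界面运行,--convert-to 指定Writer的PDF导出过滤器,--outdir 指定输出目录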
+ result = subprocess.run(
+ ['libreoffice', '--headless', '--convert-to', 'pdf:writer_pdf_Export',
+ str(word_path), '--outdir', str(pdf_path.parent)],
+ capture_output=True, text=True
+ )
+
+ if result.returncode != 0:
+ error_msg = result.stderr or "未知错误"
+ print(f"LibreOffice转换失败,错误信息: {error_msg}")
+ raise RuntimeError(f"LibreOffice转换失败: {error_msg}")
+
+ print(f"LibreOffice转换输出: {result.stdout}")
+
+ # 如果输出路径与默认生成的不同,则重命名
+ default_pdf = word_path.with_suffix('.pdf')
+ if default_pdf != pdf_path and default_pdf.exists():
+ os.rename(default_pdf, pdf_path)
+ print(f"已将PDF从 {default_pdf} 重命名为 {pdf_path}")
+
+ # 验证PDF是否成功生成
+ if not pdf_path.exists() or pdf_path.stat().st_size == 0:
+ raise RuntimeError("PDF生成失败或文件为空")
+
+ print(f"PDF转换成功,文件大小: {pdf_path.stat().st_size} 字节")
+ else:
+                # Windows和MacOS使用docx2pdf(延迟导入,避免Linux环境缺少该依赖时模块导入失败)
+                from docx2pdf import convert
+                print(f"使用docx2pdf转换 {word_path} 到 {pdf_path}")
+                convert(word_path, pdf_path)
+
+ # 验证PDF是否成功生成
+ if not pdf_path.exists() or pdf_path.stat().st_size == 0:
+ raise RuntimeError("PDF生成失败或文件为空")
+
+ print(f"PDF转换成功,文件大小: {pdf_path.stat().st_size} 字节")
+
+ return str(pdf_path)
+
+        except Exception as e:
+            print(f"PDF转换异常: {str(e)}")
+            raise Exception(f"转换PDF失败: {str(e)}") from e
+
+ @staticmethod
+ def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
+ """
+ 批量转换目录下的所有Word文档
+
+ 参数:
+ word_dir: 包含Word文档的目录路径
+ pdf_dir: 可选,PDF文件的输出目录。如果未指定,将使用与Word文档相同的目录
+
+ 返回:
+ 生成的PDF文件路径列表
+ """
+ word_dir = Path(word_dir)
+ if pdf_dir:
+ pdf_dir = Path(pdf_dir)
+ pdf_dir.mkdir(parents=True, exist_ok=True)
+
+ converted_files = []
+
+ for word_file in word_dir.glob("*.docx"):
+ try:
+ if pdf_dir:
+ pdf_path = pdf_dir / word_file.with_suffix('.pdf').name
+ else:
+ pdf_path = word_file.with_suffix('.pdf')
+
+ pdf_file = WordToPdfConverter.convert_to_pdf(word_file, pdf_path)
+ converted_files.append(pdf_file)
+
+ except Exception as e:
+ print(f"转换 {word_file} 失败: {str(e)}")
+
+ return converted_files
+
+ @staticmethod
+ def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
+ """
+ 将docx对象直接转换为PDF
+
+ 参数:
+ doc: python-docx的Document对象
+ output_dir: 可选,输出目录。如果未指定,将使用当前目录
+
+ 返回:
+ 生成的PDF文件路径
+ """
+        temp_docx = None  # 预先初始化,保证异常分支中可安全引用
+        try:
+ # 设置临时文件路径和输出路径
+ output_dir = Path(output_dir) if output_dir else Path.cwd()
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # 生成临时word文件
+ temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
+ doc.save(temp_docx)
+
+ # 转换为PDF
+ pdf_path = temp_docx.with_suffix('.pdf')
+ WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path)
+
+ # 删除临时word文件
+ temp_docx.unlink()
+
+ return str(pdf_path)
+
+        except Exception as e:
+            # 清理可能已生成的临时Word文件
+            if temp_docx is not None and temp_docx.exists():
+                temp_docx.unlink()
+            raise Exception(f"转换PDF失败: {str(e)}") from e
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/conversation_doc/word_doc.py b/crazy_functions/doc_fns/conversation_doc/word_doc.py
new file mode 100644
index 00000000..73556888
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/word_doc.py
@@ -0,0 +1,177 @@
+import re
+from docx import Document
+from docx.shared import Cm, Pt
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
+from docx.enum.style import WD_STYLE_TYPE
+from docx.oxml.ns import qn
+from datetime import datetime
+
+
+def convert_markdown_to_word(markdown_text):
+ # 0. 首先标准化所有换行符为\n
+ markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')
+
+ # 1. 处理标题 - 支持更多级别的标题,使用更精确的正则
+ # 保留标题标记,以便后续处理时还能识别出标题级别
+ markdown_text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', markdown_text, flags=re.MULTILINE)
+
+ # 2. 处理粗体、斜体和加粗斜体
+    markdown_text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', markdown_text)  # 加粗斜体
+    markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text)  # 加粗
+    markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text)  # 斜体
+    markdown_text = re.sub(r'__(.+?)__', r'\1', markdown_text)  # 下划线加粗(须先于单下划线处理)
+    markdown_text = re.sub(r'_(.+?)_', r'\1', markdown_text)  # 下划线斜体
+
+ # 3. 处理代码块 - 不移除,而是简化格式
+ # 多行代码块
+ markdown_text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```', r'[代码块]\n\1[/代码块]', markdown_text)
+ # 单行代码
+ markdown_text = re.sub(r'`([^`]+)`', r'[代码]\1[/代码]', markdown_text)
+
+ # 4. 处理列表 - 保留列表结构
+ # 匹配无序列表
+ markdown_text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', markdown_text, flags=re.MULTILINE)
+
+ # 5. 处理Markdown链接
+ markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', markdown_text)
+
+ # 6. 处理HTML链接
+    markdown_text = re.sub(r'<a\s+href="([^"]+)"[^>]*>([^<]+)</a>', r'\2 (\1)',
+                           markdown_text)
+
+ # 7. 处理图片
+ markdown_text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', markdown_text)
+
+ return markdown_text
+
+
+class WordFormatter:
+ """聊天记录Word文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012)"""
+
+ def __init__(self):
+ self.doc = Document()
+ self._setup_document()
+ self._create_styles()
+
+ def _setup_document(self):
+ """设置文档基本格式,包括页面设置和页眉"""
+ sections = self.doc.sections
+ for section in sections:
+ # 设置页面大小为A4
+ section.page_width = Cm(21)
+ section.page_height = Cm(29.7)
+ # 设置页边距
+ section.top_margin = Cm(3.7) # 上边距37mm
+ section.bottom_margin = Cm(3.5) # 下边距35mm
+ section.left_margin = Cm(2.8) # 左边距28mm
+ section.right_margin = Cm(2.6) # 右边距26mm
+ # 设置页眉页脚距离
+ section.header_distance = Cm(2.0)
+ section.footer_distance = Cm(2.0)
+
+ # 添加页眉
+ header = section.header
+ header_para = header.paragraphs[0]
+ header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
+ header_run = header_para.add_run("GPT-Academic对话记录")
+ header_run.font.name = '仿宋'
+ header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ header_run.font.size = Pt(9)
+
+ def _create_styles(self):
+ """创建文档样式"""
+ # 创建正文样式
+ style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
+ style.font.name = '仿宋'
+ style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ style.font.size = Pt(12) # 调整为12磅
+ style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ style.paragraph_format.space_after = Pt(0)
+
+ # 创建问题样式
+ question_style = self.doc.styles.add_style('Question_Style', WD_STYLE_TYPE.PARAGRAPH)
+ question_style.font.name = '黑体'
+ question_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
+ question_style.font.size = Pt(14) # 调整为14磅
+ question_style.font.bold = True
+ question_style.paragraph_format.space_before = Pt(12) # 减小段前距
+ question_style.paragraph_format.space_after = Pt(6)
+ question_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ question_style.paragraph_format.left_indent = Pt(0) # 移除左缩进
+
+ # 创建回答样式
+ answer_style = self.doc.styles.add_style('Answer_Style', WD_STYLE_TYPE.PARAGRAPH)
+ answer_style.font.name = '仿宋'
+ answer_style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ answer_style.font.size = Pt(12) # 调整为12磅
+ answer_style.paragraph_format.space_before = Pt(6)
+ answer_style.paragraph_format.space_after = Pt(12)
+ answer_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+ answer_style.paragraph_format.left_indent = Pt(0) # 移除左缩进
+
+ # 创建标题样式
+ title_style = self.doc.styles.add_style('Title_Custom', WD_STYLE_TYPE.PARAGRAPH)
+ title_style.font.name = '黑体' # 改用黑体
+ title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
+ title_style.font.size = Pt(22) # 调整为22磅
+ title_style.font.bold = True
+ title_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+ title_style.paragraph_format.space_before = Pt(0)
+ title_style.paragraph_format.space_after = Pt(24)
+ title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+
+ # 添加参考文献样式
+ ref_style = self.doc.styles.add_style('Reference_Style', WD_STYLE_TYPE.PARAGRAPH)
+ ref_style.font.name = '宋体'
+ ref_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
+ ref_style.font.size = Pt(10.5) # 参考文献使用小号字体
+ ref_style.paragraph_format.space_before = Pt(3)
+ ref_style.paragraph_format.space_after = Pt(3)
+ ref_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
+ ref_style.paragraph_format.left_indent = Pt(21)
+ ref_style.paragraph_format.first_line_indent = Pt(-21)
+
+ # 添加参考文献标题样式
+ ref_title_style = self.doc.styles.add_style('Reference_Title_Style', WD_STYLE_TYPE.PARAGRAPH)
+ ref_title_style.font.name = '黑体'
+ ref_title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
+ ref_title_style.font.size = Pt(16)
+ ref_title_style.font.bold = True
+ ref_title_style.paragraph_format.space_before = Pt(24)
+ ref_title_style.paragraph_format.space_after = Pt(12)
+ ref_title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+
+ def create_document(self, history):
+ """写入聊天历史"""
+ # 添加标题
+ title_para = self.doc.add_paragraph(style='Title_Custom')
+ title_run = title_para.add_run('GPT-Academic 对话记录')
+
+ # 添加日期
+ date_para = self.doc.add_paragraph()
+ date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+ date_run = date_para.add_run(datetime.now().strftime('%Y年%m月%d日'))
+ date_run.font.name = '仿宋'
+ date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+ date_run.font.size = Pt(16)
+
+ self.doc.add_paragraph() # 添加空行
+
+ # 添加对话内容
+ for i in range(0, len(history), 2):
+ question = history[i]
+            answer = convert_markdown_to_word(history[i + 1]) if i + 1 < len(history) else ""
+
+ if question:
+ q_para = self.doc.add_paragraph(style='Question_Style')
+ q_para.add_run(f'问题 {i//2 + 1}:').bold = True
+ q_para.add_run(str(question))
+
+ if answer:
+ a_para = self.doc.add_paragraph(style='Answer_Style')
+ a_para.add_run(f'回答 {i//2 + 1}:').bold = True
+ a_para.add_run(str(answer))
+
+        return self.doc
+
diff --git a/crazy_functions/doc_fns/read_fns/__init__.py b/crazy_functions/doc_fns/read_fns/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crazy_functions/doc_fns/read_fns/docx_reader.py b/crazy_functions/doc_fns/read_fns/docx_reader.py
new file mode 100644
index 00000000..9308940b
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/docx_reader.py
@@ -0,0 +1,6 @@
+import os
+
+import nltk
+
+# 显式展开 ~ 为用户主目录,避免其被当作字面路径
+nltk_data_dir = os.path.expanduser('~/nltk_data')
+nltk.data.path.append(nltk_data_dir)
+nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
+nltk.download('punkt', download_dir=nltk_data_dir)
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/excel_reader.py b/crazy_functions/doc_fns/read_fns/excel_reader.py
new file mode 100644
index 00000000..d70e9d53
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/excel_reader.py
@@ -0,0 +1,286 @@
+from __future__ import annotations
+
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import Any, Optional, List, Set, Dict, Union, Iterator, Tuple
+from dataclasses import dataclass, field
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import chardet
+from functools import lru_cache
+import os
+
+
+@dataclass
+class ExtractorConfig:
+ """提取器配置类"""
+ encoding: str = 'auto'
+ na_filter: bool = True
+ skip_blank_lines: bool = True
+ chunk_size: int = 10000
+ max_workers: int = 4
+ preserve_format: bool = True
+ read_all_sheets: bool = True # 新增:是否读取所有工作表
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': False,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+
+
+class ExcelTextExtractor:
+ """增强的Excel格式文件文本内容提取器"""
+
+ SUPPORTED_EXTENSIONS: Set[str] = {
+ '.xlsx', '.xls', '.csv', '.tsv', '.xlsm', '.xltx', '.xltm', '.ods'
+ }
+
+ def __init__(self, config: Optional[ExtractorConfig] = None):
+ self.config = config or ExtractorConfig()
+ self._setup_logging()
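+        # 用 lru_cache 包装实例方法,缓存同一路径的编码检测结果,避免重复读取文件头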
+ self._detect_encoding = lru_cache(maxsize=128)(self._detect_encoding)
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+ fh = logging.FileHandler('excel_extractor.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _detect_encoding(self, file_path: Path) -> str:
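+        """检测文件编码:配置非 'auto' 时直接返回;否则读取前10KB交由chardet推断,失败则回退utf-8"""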
+ if self.config.encoding != 'auto':
+ return self.config.encoding
+
+ try:
+ with open(file_path, 'rb') as f:
+ raw_data = f.read(10000)
+ result = chardet.detect(raw_data)
+ return result['encoding'] or 'utf-8'
+ except Exception as e:
+ self.logger.warning(f"Encoding detection failed: {e}. Using utf-8")
+ return 'utf-8'
+
+ def _validate_file(self, file_path: Union[str, Path]) -> Path:
+ path = Path(file_path).resolve()
+
+ if not path.exists():
+ raise ValueError(f"File not found: {path}")
+
+ if not path.is_file():
+ raise ValueError(f"Not a file: {path}")
+
+ if not os.access(path, os.R_OK):
+ raise PermissionError(f"No read permission: {path}")
+
+ if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
+ raise ValueError(
+ f"Unsupported format: {path.suffix}. "
+ f"Supported: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}"
+ )
+
+ return path
+
+ def _format_value(self, value: Any) -> str:
+ if pd.isna(value) or value is None:
+ return ''
+ if isinstance(value, (int, float)):
+ return str(value)
+ return str(value).strip()
+
+ def _process_chunk(self, chunk: pd.DataFrame, columns: Optional[List[str]] = None, sheet_name: str = '') -> str:
+ """处理数据块,新增sheet_name参数"""
+ try:
+ if columns:
+ chunk = chunk[columns]
+
+ if self.config.preserve_format:
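+                # applymap 对每个单元格逐一格式化(pandas 2.1+ 中该方法更名为 DataFrame.map)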
+ formatted_chunk = chunk.applymap(self._format_value)
+ rows = []
+
+ # 添加工作表名称作为标题
+ if sheet_name:
+ rows.append(f"[Sheet: {sheet_name}]")
+
+ # 添加表头
+ headers = [str(col) for col in formatted_chunk.columns]
+ rows.append('\t'.join(headers))
+
+ # 添加数据行
+ for _, row in formatted_chunk.iterrows():
+ rows.append('\t'.join(row.values))
+
+ return '\n'.join(rows)
+ else:
+ flat_values = (
+ chunk.astype(str)
+ .replace({'nan': '', 'None': '', 'NaN': ''})
+ .values.flatten()
+ )
+ return ' '.join(v for v in flat_values if v)
+
+ except Exception as e:
+ self.logger.error(f"Error processing chunk: {e}")
+ raise
+
+ def _read_file(self, file_path: Path) -> Union[pd.DataFrame, Iterator[pd.DataFrame], Dict[str, pd.DataFrame]]:
+ """读取文件,支持多工作表"""
+ try:
+ encoding = self._detect_encoding(file_path)
+
+ if file_path.suffix.lower() in {'.csv', '.tsv'}:
+ sep = '\t' if file_path.suffix.lower() == '.tsv' else ','
+
+ # 对大文件使用分块读取
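+                # 粗略阈值:文件大于 chunk_size KB 时,按每块 chunk_size 行流式读取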
+ if file_path.stat().st_size > self.config.chunk_size * 1024:
+ return pd.read_csv(
+ file_path,
+ encoding=encoding,
+ na_filter=self.config.na_filter,
+ skip_blank_lines=self.config.skip_blank_lines,
+ sep=sep,
+ chunksize=self.config.chunk_size,
+ on_bad_lines='warn'
+ )
+ else:
+ return pd.read_csv(
+ file_path,
+ encoding=encoding,
+ na_filter=self.config.na_filter,
+ skip_blank_lines=self.config.skip_blank_lines,
+ sep=sep
+ )
+            else:
+                # Excel文件处理,支持多工作表
+                # 不强制指定engine,由pandas按扩展名自动选择
+                # (.xlsx/.xlsm 用 openpyxl,.xls 需 xlrd,.ods 需 odf)
+                return pd.read_excel(
+                    file_path,
+                    na_filter=self.config.na_filter,
+                    keep_default_na=self.config.na_filter,
+                    sheet_name=None if self.config.read_all_sheets else 0  # None表示读取所有工作表
+                )
+
+ except Exception as e:
+ self.logger.error(f"Error reading file {file_path}: {e}")
+ raise
+
+ def extract_text(
+ self,
+ file_path: Union[str, Path],
+ columns: Optional[List[str]] = None,
+ separator: str = '\n'
+ ) -> str:
+ """提取文本,支持多工作表"""
+ try:
+ path = self._validate_file(file_path)
+ self.logger.info(f"Processing: {path}")
+
+ reader = self._read_file(path)
+ texts = []
+
+ # 处理Excel多工作表
+ if isinstance(reader, dict):
+ for sheet_name, df in reader.items():
+ sheet_text = self._process_chunk(df, columns, sheet_name)
+ if sheet_text:
+ texts.append(sheet_text)
+ return separator.join(texts)
+
+ # 处理单个DataFrame
+ elif isinstance(reader, pd.DataFrame):
+ return self._process_chunk(reader, columns)
+
+ # 处理DataFrame迭代器
+ else:
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+ futures = {
+ executor.submit(self._process_chunk, chunk, columns): i
+ for i, chunk in enumerate(reader)
+ }
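+                    # 各块并发处理,完成顺序不定;记录 (块序号, 文本) 以便随后按原顺序重组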
+
+ chunk_texts = []
+ for future in as_completed(futures):
+ try:
+ text = future.result()
+ if text:
+ chunk_texts.append((futures[future], text))
+ except Exception as e:
+ self.logger.error(f"Error in chunk {futures[future]}: {e}")
+
+ # 按块的顺序排序
+ chunk_texts.sort(key=lambda x: x[0])
+ texts = [text for _, text in chunk_texts]
+
+ # 合并文本,保留格式
+ if texts and self.config.preserve_format:
+ result = texts[0] # 第一块包含表头
+ if len(texts) > 1:
+ # 跳过后续块的表头行
+ for text in texts[1:]:
+ result += '\n' + '\n'.join(text.split('\n')[1:])
+ return result
+ else:
+ return separator.join(texts)
+
+ except Exception as e:
+ self.logger.error(f"Extraction failed: {e}")
+ raise
+
+ @staticmethod
+ def get_supported_formats() -> List[str]:
+ """获取支持的文件格式列表"""
+ return sorted(ExcelTextExtractor.SUPPORTED_EXTENSIONS)
+
+
+def main():
+ """主函数:演示用法"""
+ config = ExtractorConfig(
+ encoding='auto',
+ preserve_format=True,
+ read_all_sheets=True, # 启用多工作表读取
+ text_cleanup={
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': False,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ }
+ )
+
+ extractor = ExcelTextExtractor(config)
+
+ try:
+ sample_file = 'example.xlsx'
+ if Path(sample_file).exists():
+ text = extractor.extract_text(
+ sample_file,
+ columns=['title', 'content']
+ )
+ print("提取的文本:")
+ print(text)
+ else:
+ print(f"示例文件 {sample_file} 不存在")
+
+ print("\n支持的格式:", extractor.get_supported_formats())
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py b/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py
new file mode 100644
index 00000000..b88212e2
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py
@@ -0,0 +1,359 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Set, Dict, Union, List
+from dataclasses import dataclass, field
+import logging
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import shutil
+
+@dataclass
+class MarkdownConverterConfig:
+ """PDF 到 Markdown 转换器配置类
+
+ Attributes:
+ extract_images: 是否提取图片
+ extract_tables: 是否尝试保留表格结构
+ extract_code_blocks: 是否识别代码块
+ extract_math: 是否转换数学公式
+ output_dir: 输出目录路径
+ image_dir: 图片保存目录路径
+ paragraph_separator: 段落之间的分隔符
+ text_cleanup: 文本清理选项字典
+ docintel_endpoint: Document Intelligence端点URL (可选)
+ enable_plugins: 是否启用插件
+ llm_client: LLM客户端对象 (例如OpenAI client)
+ llm_model: 要使用的LLM模型名称
+ """
+ extract_images: bool = True
+ extract_tables: bool = True
+ extract_code_blocks: bool = True
+ extract_math: bool = True
+ output_dir: str = ""
+ image_dir: str = "images"
+ paragraph_separator: str = '\n\n'
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+ docintel_endpoint: str = ""
+ enable_plugins: bool = False
+ llm_client: Optional[object] = None
+ llm_model: str = ""
+
+
+class MarkdownConverter:
+ """PDF 到 Markdown 转换器
+
+ 使用 markitdown 库实现 PDF 到 Markdown 的转换,支持多种配置选项。
+ """
+
+ SUPPORTED_EXTENSIONS: Set[str] = {
+ '.pdf',
+ }
+
+ def __init__(self, config: Optional[MarkdownConverterConfig] = None):
+ """初始化转换器
+
+ Args:
+ config: 转换器配置对象,如果为None则使用默认配置
+ """
+ self.config = config or MarkdownConverterConfig()
+ self._setup_logging()
+
+ # 检查是否安装了 markitdown
+ self._check_markitdown_installation()
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+
+ # 添加文件处理器
+ fh = logging.FileHandler('markdown_converter.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _check_markitdown_installation(self) -> None:
+ """检查是否安装了 markitdown"""
+ try:
+ # 尝试导入 markitdown 库
+ from markitdown import MarkItDown
+ self.logger.info("markitdown 库已安装")
+ except ImportError:
+ self.logger.warning("markitdown 库未安装,尝试安装...")
+ try:
+                subprocess.check_call([sys.executable, "-m", "pip", "install", "markitdown"])
+ self.logger.info("markitdown 库安装成功")
+ from markitdown import MarkItDown
+ except (subprocess.SubprocessError, ImportError):
+ self.logger.error("无法安装 markitdown 库,请手动安装")
+ self.markitdown_available = False
+ return
+
+ self.markitdown_available = True
+
+ def _validate_file(self, file_path: Union[str, Path], max_size_mb: int = 100) -> Path:
+ """验证文件
+
+ Args:
+ file_path: 文件路径
+ max_size_mb: 允许的最大文件大小(MB)
+
+ Returns:
+ Path: 验证后的Path对象
+
+ Raises:
+ ValueError: 文件不存在、格式不支持或大小超限
+ PermissionError: 没有读取权限
+ """
+ path = Path(file_path).resolve()
+
+ if not path.exists():
+ raise ValueError(f"文件不存在: {path}")
+
+ if not path.is_file():
+ raise ValueError(f"不是一个文件: {path}")
+
+ if not os.access(path, os.R_OK):
+ raise PermissionError(f"没有读取权限: {path}")
+
+ file_size_mb = path.stat().st_size / (1024 * 1024)
+ if file_size_mb > max_size_mb:
+ raise ValueError(
+ f"文件大小 ({file_size_mb:.1f}MB) 超过限制 {max_size_mb}MB"
+ )
+
+ if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
+ raise ValueError(
+ f"不支持的格式: {path.suffix}. "
+ f"支持的格式: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}"
+ )
+
+ return path
+
+ def _cleanup_text(self, text: str) -> str:
+ """清理文本
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ str: 清理后的文本
+ """
+ if self.config.text_cleanup['remove_extra_spaces']:
+ text = ' '.join(text.split())
+
+ if self.config.text_cleanup['normalize_whitespace']:
+ text = text.replace('\t', ' ').replace('\r', '\n')
+
+ if self.config.text_cleanup['lowercase']:
+ text = text.lower()
+
+ return text.strip()
+
+ @staticmethod
+ def get_supported_formats() -> List[str]:
+ """获取支持的文件格式列表"""
+ return sorted(MarkdownConverter.SUPPORTED_EXTENSIONS)
+
+ def convert_to_markdown(
+ self,
+ file_path: Union[str, Path],
+ output_path: Optional[Union[str, Path]] = None
+ ) -> str:
+ """将 PDF 转换为 Markdown
+
+ Args:
+ file_path: PDF 文件路径
+ output_path: 输出 Markdown 文件路径,如果为 None 则返回内容而不保存
+
+ Returns:
+ str: 转换后的 Markdown 内容
+
+ Raises:
+ Exception: 转换过程中的错误
+ """
+ try:
+ path = self._validate_file(file_path)
+ self.logger.info(f"处理: {path}")
+
+ if not self.markitdown_available:
+ raise ImportError("markitdown 库未安装,无法进行转换")
+
+ # 导入 markitdown 库
+ from markitdown import MarkItDown
+
+ # 准备输出目录
+ if output_path:
+ output_path = Path(output_path)
+ output_dir = output_path.parent
+ output_dir.mkdir(parents=True, exist_ok=True)
+ else:
+ # 创建临时目录作为输出目录
+ temp_dir = tempfile.mkdtemp()
+ output_dir = Path(temp_dir)
+ output_path = output_dir / f"{path.stem}.md"
+
+ # 图片目录
+ image_dir = output_dir / self.config.image_dir
+ image_dir.mkdir(parents=True, exist_ok=True)
+
+ # 创建 MarkItDown 实例并进行转换
+ if self.config.docintel_endpoint:
+ md = MarkItDown(docintel_endpoint=self.config.docintel_endpoint)
+ elif self.config.llm_client and self.config.llm_model:
+ md = MarkItDown(
+ enable_plugins=self.config.enable_plugins,
+ llm_client=self.config.llm_client,
+ llm_model=self.config.llm_model
+ )
+ else:
+ md = MarkItDown(enable_plugins=self.config.enable_plugins)
+
+ # 执行转换
+ result = md.convert(str(path))
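+            # markitdown 的转换结果对象通过 text_content 属性暴露转换后的文本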
+ markdown_content = result.text_content
+
+ # 清理文本
+ markdown_content = self._cleanup_text(markdown_content)
+
+ # 如果需要保存到文件
+ if output_path:
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(markdown_content)
+ self.logger.info(f"转换成功,输出到: {output_path}")
+
+ return markdown_content
+
+ except Exception as e:
+ self.logger.error(f"转换失败: {e}")
+ raise
+        finally:
+            # 清理临时目录:temp_dir 仅在调用方未指定输出路径时创建;
+            # 此时 output_path 已被重新赋值,不能再用它来判断
+            if 'temp_dir' in locals():
+                shutil.rmtree(temp_dir, ignore_errors=True)
+
+ def convert_to_markdown_and_save(
+ self,
+ file_path: Union[str, Path],
+ output_path: Union[str, Path]
+ ) -> Path:
+ """将 PDF 转换为 Markdown 并保存到指定路径
+
+ Args:
+ file_path: PDF 文件路径
+ output_path: 输出 Markdown 文件路径
+
+ Returns:
+ Path: 输出文件的 Path 对象
+
+ Raises:
+ Exception: 转换过程中的错误
+ """
+ self.convert_to_markdown(file_path, output_path)
+ return Path(output_path)
+
+ def batch_convert(
+ self,
+ file_paths: List[Union[str, Path]],
+ output_dir: Union[str, Path]
+ ) -> List[Path]:
+ """批量转换多个 PDF 文件为 Markdown
+
+ Args:
+ file_paths: PDF 文件路径列表
+ output_dir: 输出目录路径
+
+ Returns:
+ List[Path]: 输出文件路径列表
+
+ Raises:
+ Exception: 转换过程中的错误
+ """
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ output_paths = []
+ for file_path in file_paths:
+ path = Path(file_path)
+ output_path = output_dir / f"{path.stem}.md"
+
+ try:
+ self.convert_to_markdown(file_path, output_path)
+ output_paths.append(output_path)
+ self.logger.info(f"成功转换: {path} -> {output_path}")
+ except Exception as e:
+ self.logger.error(f"转换失败 {path}: {e}")
+
+ return output_paths
+
+
+def main():
+ """主函数:演示用法"""
+ # 配置
+ config = MarkdownConverterConfig(
+ extract_images=True,
+ extract_tables=True,
+ extract_code_blocks=True,
+ extract_math=True,
+ enable_plugins=False,
+ text_cleanup={
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ }
+ )
+
+ # 创建转换器
+ converter = MarkdownConverter(config)
+
+ # 使用示例
+ try:
+ # 替换为实际的文件路径
+ sample_file = './crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf'
+ if Path(sample_file).exists():
+ # 转换为 Markdown 并打印内容
+ markdown_content = converter.convert_to_markdown(sample_file)
+ print("转换后的 Markdown 内容:")
+ print(markdown_content[:500] + "...") # 只打印前500个字符
+
+ # 转换并保存到文件
+ output_file = f"./output_{Path(sample_file).stem}.md"
+ output_path = converter.convert_to_markdown_and_save(sample_file, output_file)
+ print(f"\n已保存到: {output_path}")
+
+ # 使用LLM增强的示例 (需要添加相应的导入和配置)
+ # try:
+ # from openai import OpenAI
+ # client = OpenAI()
+ # llm_config = MarkdownConverterConfig(
+ # llm_client=client,
+ # llm_model="gpt-4o"
+ # )
+ # llm_converter = MarkdownConverter(llm_config)
+ # llm_result = llm_converter.convert_to_markdown("example.jpg")
+ # print("LLM增强的结果:")
+ # print(llm_result[:500] + "...")
+ # except ImportError:
+ # print("未安装OpenAI库,跳过LLM示例")
+ else:
+ print(f"示例文件 {sample_file} 不存在")
+
+ print("\n支持的格式:", converter.get_supported_formats())
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/__init__.py b/crazy_functions/doc_fns/read_fns/unstructured_all/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py
new file mode 100644
index 00000000..bfa0180f
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py
@@ -0,0 +1,493 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Set, Dict, Union, List
+from dataclasses import dataclass, field
+import logging
+import os
+import re
+
+from unstructured.partition.auto import partition
+from unstructured.documents.elements import (
+ Text, Title, NarrativeText, ListItem, Table,
+ Footer, Header, PageBreak, Image, Address
+)
+
+
+@dataclass
+class PaperMetadata:
+ """论文元数据类"""
+ title: str = ""
+ authors: List[str] = field(default_factory=list)
+ affiliations: List[str] = field(default_factory=list)
+ journal: str = ""
+ volume: str = ""
+ issue: str = ""
+ year: str = ""
+ doi: str = ""
+ date: str = ""
+ publisher: str = ""
+ conference: str = ""
+ abstract: str = ""
+ keywords: List[str] = field(default_factory=list)
+
+
+@dataclass
+class ExtractorConfig:
+ """元数据提取器配置类"""
+ paragraph_separator: str = '\n\n'
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+
+
+class PaperMetadataExtractor:
+ """论文元数据提取器
+
+ 使用unstructured库从多种文档格式中提取论文的标题、作者、摘要等元数据信息。
+ """
+
+ SUPPORTED_EXTENSIONS: Set[str] = {
+ '.pdf', '.docx', '.doc', '.txt', '.ppt', '.pptx',
+ '.xlsx', '.xls', '.md', '.org', '.odt', '.rst',
+ '.rtf', '.epub', '.html', '.xml', '.json'
+ }
+
+ # 定义论文各部分的关键词模式
+ SECTION_PATTERNS = {
+ 'abstract': r'\b(摘要|abstract|summary|概要|résumé|zusammenfassung|аннотация)\b',
+ 'keywords': r'\b(关键词|keywords|key\s+words|关键字|mots[- ]clés|schlüsselwörter|ключевые слова)\b',
+ }
+
+ def __init__(self, config: Optional[ExtractorConfig] = None):
+ """初始化提取器
+
+ Args:
+ config: 提取器配置对象,如果为None则使用默认配置
+ """
+ self.config = config or ExtractorConfig()
+ self._setup_logging()
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+
+ # 添加文件处理器
+ fh = logging.FileHandler('paper_metadata_extractor.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _validate_file(self, file_path: Union[str, Path], max_size_mb: int = 100) -> Path:
+ """验证文件
+
+ Args:
+ file_path: 文件路径
+ max_size_mb: 允许的最大文件大小(MB)
+
+ Returns:
+ Path: 验证后的Path对象
+
+ Raises:
+ ValueError: 文件不存在、格式不支持或大小超限
+ PermissionError: 没有读取权限
+ """
+ path = Path(file_path).resolve()
+
+ if not path.exists():
+ raise ValueError(f"文件不存在: {path}")
+
+ if not path.is_file():
+ raise ValueError(f"不是文件: {path}")
+
+ if not os.access(path, os.R_OK):
+ raise PermissionError(f"没有读取权限: {path}")
+
+ file_size_mb = path.stat().st_size / (1024 * 1024)
+ if file_size_mb > max_size_mb:
+ raise ValueError(
+ f"文件大小 ({file_size_mb:.1f}MB) 超过限制 {max_size_mb}MB"
+ )
+
+ if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
+ raise ValueError(
+ f"不支持的文件格式: {path.suffix}. "
+ f"支持的格式: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}"
+ )
+
+ return path
+
+ def _cleanup_text(self, text: str) -> str:
+ """清理文本
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ str: 清理后的文本
+ """
+ if self.config.text_cleanup['remove_extra_spaces']:
+ text = ' '.join(text.split())
+
+ if self.config.text_cleanup['normalize_whitespace']:
+ text = text.replace('\t', ' ').replace('\r', '\n')
+
+ if self.config.text_cleanup['lowercase']:
+ text = text.lower()
+
+ return text.strip()
+
+ @staticmethod
+ def get_supported_formats() -> List[str]:
+ """获取支持的文件格式列表"""
+ return sorted(PaperMetadataExtractor.SUPPORTED_EXTENSIONS)
+
+ def extract_metadata(self, file_path: Union[str, Path], strategy: str = "fast") -> PaperMetadata:
+ """提取论文元数据
+
+ Args:
+ file_path: 文件路径
+ strategy: 提取策略 ("fast" 或 "accurate")
+
+ Returns:
+ PaperMetadata: 提取的论文元数据
+
+ Raises:
+ Exception: 提取过程中的错误
+ """
+ try:
+ path = self._validate_file(file_path)
+ self.logger.info(f"正在处理: {path}")
+
+ # 使用unstructured库分解文档
+ elements = partition(
+ str(path),
+ strategy=strategy,
+ include_metadata=True,
+ nlp=False,
+ )
+
+ # 提取元数据
+ metadata = PaperMetadata()
+
+ # 提取标题和作者
+ self._extract_title_and_authors(elements, metadata)
+
+ # 提取摘要和关键词
+ self._extract_abstract_and_keywords(elements, metadata)
+
+ # 提取其他元数据
+ self._extract_additional_metadata(elements, metadata)
+
+ return metadata
+
+ except Exception as e:
+ self.logger.error(f"元数据提取失败: {e}")
+ raise
+
+ def _extract_title_and_authors(self, elements, metadata: PaperMetadata) -> None:
+ """从文档中提取标题和作者信息 - 改进版"""
+ # 收集所有潜在的标题候选
+ title_candidates = []
+ all_text = []
+ raw_text = []
+
+ # 首先收集文档前30个元素的文本,用于辅助判断
+ for i, element in enumerate(elements[:30]):
+ if isinstance(element, (Text, Title, NarrativeText)):
+ text = str(element).strip()
+ if text:
+ all_text.append(text)
+ raw_text.append(text)
+
+        # 输出原始文本片段,便于调试(改用logger,避免污染标准输出)
+        self.logger.debug("原始文本前10行:")
+        for idx, text in enumerate(raw_text[:10]):
+            self.logger.debug(f"{idx}: {text}")
+
+ # 1. 尝试查找连续的标题片段并合并它们
+ i = 0
+ while i < len(all_text) - 1:
+ current = all_text[i]
+ next_text = all_text[i + 1]
+
+ # 检查是否存在标题分割情况:一行以冒号结尾,下一行像是标题的延续
+ if current.endswith(':') and len(current) < 50 and len(next_text) > 5 and next_text[0].isupper():
+ # 合并这两行文本
+ combined_title = f"{current} {next_text}"
+ # 查找合并前的文本并替换
+ all_text[i] = combined_title
+ all_text.pop(i + 1)
+ # 给合并后的标题很高的分数
+ title_candidates.append((combined_title, 15, i))
+ else:
+ i += 1
+
+ # 2. 首先尝试从标题元素中查找
+ for i, element in enumerate(elements[:15]): # 只检查前15个元素
+ if isinstance(element, Title):
+ title_text = str(element).strip()
+ # 排除常见的非标题内容
+ if title_text.lower() not in ['abstract', '摘要', 'introduction', '引言']:
+ # 计算标题分数(越高越可能是真正的标题)
+ score = self._evaluate_title_candidate(title_text, i, element)
+ title_candidates.append((title_text, score, i))
+
+ # 3. 特别处理常见的论文标题格式
+ for i, text in enumerate(all_text[:15]):
+ # 特别检查"KIMI K1.5:"类型的前缀标题
+ if re.match(r'^[A-Z][A-Z0-9\s\.]+(\s+K\d+(\.\d+)?)?:', text):
+ score = 12 # 给予很高的分数
+ title_candidates.append((text, score, i))
+
+ # 如果下一行也是全大写,很可能是标题的延续
+ if i+1 < len(all_text) and all_text[i+1].isupper() and len(all_text[i+1]) > 10:
+ combined_title = f"{text} {all_text[i+1]}"
+ title_candidates.append((combined_title, 15, i)) # 给合并标题更高分数
+
+ # 匹配全大写的标题行
+ elif text.isupper() and len(text) > 10 and len(text) < 100:
+ score = 10 - i * 0.5 # 越靠前越可能是标题
+ title_candidates.append((text, score, i))
+
+ # 对标题候选按分数排序并选取最佳候选
+ if title_candidates:
+ title_candidates.sort(key=lambda x: x[1], reverse=True)
+ metadata.title = title_candidates[0][0]
+ title_position = title_candidates[0][2]
+ print(f"所有标题候选: {title_candidates[:3]}")
+ else:
+ # 如果没有找到合适的标题,使用一个备选策略
+ for text in all_text[:10]:
+ if text.isupper() and len(text) > 10 and len(text) < 200: # 大写且适当长度的文本
+ metadata.title = text
+ break
+ title_position = 0
+
+ # 提取作者信息 - 改进后的作者提取逻辑
+ author_candidates = []
+
+ # 1. 特别处理"TECHNICAL REPORT OF"之后的行,通常是作者或团队
+ for i, text in enumerate(all_text):
+ if "TECHNICAL REPORT" in text.upper() and i+1 < len(all_text):
+ team_text = all_text[i+1].strip()
+ if re.search(r'\b(team|group|lab)\b', team_text, re.IGNORECASE):
+ author_candidates.append((team_text, 15))
+
+ # 2. 查找包含Team的文本
+ for text in all_text[:20]:
+ if "Team" in text and len(text) < 30:
+ # 这很可能是团队名
+ author_candidates.append((text, 12))
+
+ # 添加作者到元数据
+ if author_candidates:
+ # 按分数排序
+ author_candidates.sort(key=lambda x: x[1], reverse=True)
+
+ # 去重
+ seen_authors = set()
+ for author, _ in author_candidates:
+ if author.lower() not in seen_authors and not author.isdigit():
+ seen_authors.add(author.lower())
+ metadata.authors.append(author)
+
+ # 如果没有找到作者,尝试查找隶属机构信息中的团队名称
+ if not metadata.authors:
+ for text in all_text[:20]:
+ if re.search(r'\b(team|group|lab|laboratory|研究组|团队)\b', text, re.IGNORECASE):
+ if len(text) < 50: # 避免太长的文本
+ metadata.authors.append(text.strip())
+ break
+
+ # 提取隶属机构信息
+ for i, element in enumerate(elements[:30]):
+ element_text = str(element).strip()
+ if re.search(r'(university|institute|department|school|laboratory|college|center|centre|\d{5,}|^[a-zA-Z]+@|学院|大学|研究所|研究院)', element_text, re.IGNORECASE):
+ # 可能是隶属机构
+ if element_text not in metadata.affiliations and len(element_text) > 10:
+ metadata.affiliations.append(element_text)
+
+ def _evaluate_title_candidate(self, text, position, element):
+ """评估标题候选项的可能性分数"""
+ score = 0
+
+ # 位置因素:越靠前越可能是标题
+ score += max(0, 10 - position) * 0.5
+
+ # 长度因素:标题通常不会太短也不会太长
+ if 10 <= len(text) <= 150:
+ score += 3
+ elif len(text) < 10:
+ score -= 2
+ elif len(text) > 150:
+ score -= 3
+
+ # 格式因素
+ if text.isupper(): # 全大写可能是标题
+ score += 2
+ if re.match(r'^[A-Z]', text): # 首字母大写
+ score += 1
+ if ':' in text: # 标题常包含冒号
+ score += 1.5
+
+ # 内容因素
+ if re.search(r'\b(scaling|learning|model|approach|method|system|framework|analysis)\b', text.lower()):
+ score += 2 # 包含常见的学术论文关键词
+
+ # 避免误判
+ if re.match(r'^\d+$', text): # 纯数字
+ score -= 10
+ if re.search(r'^(http|www|doi)', text.lower()): # URL或DOI
+ score -= 5
+ if len(text.split()) <= 2 and len(text) < 15: # 太短的短语
+ score -= 3
+
+ # 元数据因素(如果有)
+ if hasattr(element, 'metadata') and element.metadata:
+ # 修复:正确处理ElementMetadata对象
+ try:
+ # 尝试通过getattr安全地获取属性
+ font_size = getattr(element.metadata, 'font_size', None)
+ if font_size is not None and font_size > 14: # 假设标准字体大小是12
+ score += 3
+
+ font_weight = getattr(element.metadata, 'font_weight', None)
+ if font_weight == 'bold':
+ score += 2 # 粗体加分
+ except (AttributeError, TypeError):
+ # 如果metadata的访问方式不正确,尝试其他可能的访问方式
+ try:
+ metadata_dict = element.metadata.__dict__ if hasattr(element.metadata, '__dict__') else {}
+ if 'font_size' in metadata_dict and metadata_dict['font_size'] > 14:
+ score += 3
+ if 'font_weight' in metadata_dict and metadata_dict['font_weight'] == 'bold':
+ score += 2
+ except Exception:
+ # 如果所有尝试都失败,忽略元数据处理
+ pass
+
+ return score
+
+ def _extract_abstract_and_keywords(self, elements, metadata: PaperMetadata) -> None:
+ """从文档中提取摘要和关键词"""
+ abstract_found = False
+ keywords_found = False
+ abstract_text = []
+
+ for i, element in enumerate(elements):
+ element_text = str(element).strip().lower()
+
+ # 寻找摘要部分
+ if not abstract_found and (
+ isinstance(element, Title) and
+ re.search(self.SECTION_PATTERNS['abstract'], element_text, re.IGNORECASE)
+ ):
+ abstract_found = True
+ continue
+
+ # 如果找到摘要部分,收集内容直到遇到关键词部分或新章节
+ if abstract_found and not keywords_found:
+ # 检查是否遇到关键词部分或新章节
+ if (
+ isinstance(element, Title) or
+ re.search(self.SECTION_PATTERNS['keywords'], element_text, re.IGNORECASE) or
+ re.match(r'\b(introduction|引言|method|方法)\b', element_text, re.IGNORECASE)
+ ):
+ keywords_found = re.search(self.SECTION_PATTERNS['keywords'], element_text, re.IGNORECASE)
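+                    # 此处 keywords_found 为 re.Match 或 None,仅作布尔标志使用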
+ abstract_found = False # 停止收集摘要
+ else:
+ # 收集摘要文本
+ if isinstance(element, (Text, NarrativeText)) and element_text:
+ abstract_text.append(element_text)
+
+ # 如果找到关键词部分,提取关键词
+ if keywords_found and not abstract_found and not metadata.keywords:
+ if isinstance(element, (Text, NarrativeText)):
+ # 清除可能的"关键词:"/"Keywords:"前缀
+ cleaned_text = re.sub(r'^\s*(关键词|keywords|key\s+words)\s*[::]\s*', '', element_text, flags=re.IGNORECASE)
+
+ # 尝试按不同分隔符分割
+ for separator in [';', ';', ',', ',']:
+ if separator in cleaned_text:
+ metadata.keywords = [k.strip() for k in cleaned_text.split(separator) if k.strip()]
+ break
+
+ # 如果未能分割,将整个文本作为一个关键词
+ if not metadata.keywords and cleaned_text:
+ metadata.keywords = [cleaned_text]
+
+ keywords_found = False # 已提取关键词,停止处理
+
+ # 设置摘要文本
+ if abstract_text:
+ metadata.abstract = self.config.paragraph_separator.join(abstract_text)
+
+ def _extract_additional_metadata(self, elements, metadata: PaperMetadata) -> None:
+ """提取其他元数据信息"""
+ for element in elements[:30]: # 只检查文档前部分
+ element_text = str(element).strip()
+
+ # 尝试匹配DOI
+ doi_match = re.search(r'(doi|DOI):\s*(10\.\d{4,}\/[a-zA-Z0-9.-]+)', element_text)
+ if doi_match and not metadata.doi:
+ metadata.doi = doi_match.group(2)
+
+ # 尝试匹配日期
+ date_match = re.search(r'(published|received|accepted|submitted):\s*(\d{1,2}\s+[a-zA-Z]+\s+\d{4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})', element_text, re.IGNORECASE)
+ if date_match and not metadata.date:
+ metadata.date = date_match.group(2)
+
+ # 尝试匹配年份
+ year_match = re.search(r'\b(19|20)\d{2}\b', element_text)
+ if year_match and not metadata.year:
+ metadata.year = year_match.group(0)
+
+ # 尝试匹配期刊/会议名称
+ journal_match = re.search(r'(journal|conference):\s*([^,;.]+)', element_text, re.IGNORECASE)
+ if journal_match:
+ if "journal" in journal_match.group(1).lower() and not metadata.journal:
+ metadata.journal = journal_match.group(2).strip()
+ elif not metadata.conference:
+ metadata.conference = journal_match.group(2).strip()
+
+
+def main():
+ """主函数:演示用法"""
+ # 创建提取器
+ extractor = PaperMetadataExtractor()
+
+ # 使用示例
+ try:
+ # 替换为实际的文件路径
+ sample_file = '/Users/boyin.liu/Documents/示例文档/论文/3.pdf'
+ if Path(sample_file).exists():
+ metadata = extractor.extract_metadata(sample_file)
+ print("提取的元数据:")
+ print(f"标题: {metadata.title}")
+ print(f"作者: {', '.join(metadata.authors)}")
+ print(f"机构: {', '.join(metadata.affiliations)}")
+ print(f"摘要: {metadata.abstract[:200]}...")
+ print(f"关键词: {', '.join(metadata.keywords)}")
+ print(f"DOI: {metadata.doi}")
+ print(f"日期: {metadata.date}")
+ print(f"年份: {metadata.year}")
+ print(f"期刊: {metadata.journal}")
+ print(f"会议: {metadata.conference}")
+ else:
+ print(f"示例文件 {sample_file} 不存在")
+
+ print("\n支持的格式:", extractor.get_supported_formats())
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py
new file mode 100644
index 00000000..e5ee7cb2
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py
@@ -0,0 +1,1220 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Set, Dict, Union, List, Tuple, Any
+from dataclasses import dataclass, field
+import logging
+import os
+import re
+
+from unstructured.partition.auto import partition
+from unstructured.documents.elements import (
+ Text, Title, NarrativeText, ListItem, Table,
+ Footer, Header, PageBreak, Image, Address
+)
+
+# 引入元数据提取器
+from crazy_functions.doc_fns.read_fns.unstructured_all.paper_metadata_extractor import PaperMetadata, PaperMetadataExtractor
+
+
+@dataclass
+class PaperSection:
+ """论文章节数据类"""
+ section_type: str # 章节类型,如"abstract", "introduction", "method", "result", "discussion", "conclusion", "references"等
+ title: str # 章节标题
+ content: str # 章节内容
+ level: int = 0 # 标题级别,0为主标题,1为一级标题,以此类推
+ subsections: List['PaperSection'] = field(default_factory=list) # 子章节列表
+
+
+@dataclass
+class Figure:
+ """论文图表数据类"""
+ id: str # 图表ID,如"Figure 1"
+ caption: str # 图表标题
+ content: str # 图表描述内容
+ position: int # 在文档中的位置索引
+
+
+@dataclass
+class Formula:
+ """论文公式数据类"""
+ id: str # 公式ID,如"(1)"
+ content: str # 公式内容
+ position: int # 在文档中的位置索引
+
+
+@dataclass
+class Reference:
+ """参考文献数据类"""
+ id: str = "" # 引用编号,如"[1]"
+ text: str = "" # 完整引用文本
+ title: str = "" # 文献标题
+ authors: List[str] = field(default_factory=list) # 作者列表
+ year: str = "" # 出版年份
+ source: str = "" # 来源(期刊、会议等)
+
+
+@dataclass
+class StructuredPaper:
+ """结构化论文数据类"""
+ metadata: PaperMetadata = field(default_factory=PaperMetadata)
+ sections: List[PaperSection] = field(default_factory=list)
+ figures: List[Figure] = field(default_factory=list)
+ tables: List[Figure] = field(default_factory=list)
+ formulas: List[Formula] = field(default_factory=list)
+ references: List[Reference] = field(default_factory=list)
+ full_text: str = ""
+ keywords: List[str] = field(default_factory=list)
+
+
+@dataclass
+class ExtractorConfig:
+ """提取器配置类"""
+ extract_figures: bool = True
+ extract_tables: bool = True
+ extract_formulas: bool = True
+ extract_references: bool = True
+ paragraph_separator: str = '\n\n'
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+
+
+class PaperStructureExtractor:
+ """论文结构提取器
+
+ 从各种文档格式中提取论文的完整结构化信息,包括元数据、章节结构、图表、公式、参考文献等。
+ """
+
+ # 定义论文各部分的关键词模式
+ PAPER_SECTION_PATTERNS = {
+ 'abstract': r'\b(摘要|abstract|summary|概要|résumé|zusammenfassung|аннотация)\b',
+ 'keywords': r'\b(关键词|keywords|key\s+words|关键字|mots[- ]clés|schlüsselwörter|ключевые слова)\b',
+ 'introduction': r'\b(引言|介绍|绪论|introduction|background|引言:|概述|einleitung|введение)\b',
+ 'related_work': r'\b(相关工作|related\s+work|literature\s+review|研究现状|prior\s+work|verwandte arbeiten|предыдущие работы)\b',
+ 'method': r'\b(方法|材料与方法|methodology|materials\s+and\s+methods|methods|approach|experimental|实验|算法|algorithm|方法:|研究方法|methoden|методы)\b',
+ 'result': r'\b(结果|results|findings|observations|实验结果|结果与分析|ergebnisse|результаты)\b',
+ 'discussion': r'\b(讨论|discussion|analysis|interpretation|分析|讨论与分析|diskussion|обсуждение)\b',
+ 'conclusion': r'\b(结论|总结|conclusion|summary|concluding\s+remarks|结语|总结与展望|schlussfolgerung|заключение)\b',
+ 'references': r'\b(参考文献|references|bibliography|引用|citation|文献|literatur|литература)\b',
+ 'acknowledgement': r'\b(致谢|acknowledgement|acknowledgment|鸣谢|acknowledgements|danksagung|благодарности)\b',
+ 'appendix': r'\b(附录|appendix|supplementary|补充材料|appendices|anhang|приложение)\b',
+ 'table': r'\b(表\s*\d+|table\s*\d+|tabelle\s*\d+|таблица\s*\d+)\b',
+ 'figure': r'\b(图\s*\d+|figure\s*\d+|fig.\s*\d+|abbildung\s*\d+|рисунок\s*\d+)\b'
+ }
+
+ SUPPORTED_EXTENSIONS = PaperMetadataExtractor.SUPPORTED_EXTENSIONS
+
+ def __init__(self, config: Optional[ExtractorConfig] = None):
+ """初始化提取器
+
+ Args:
+ config: 提取器配置对象,如果为None则使用默认配置
+ """
+ self.config = config or ExtractorConfig()
+ self.metadata_extractor = PaperMetadataExtractor()
+ self._setup_logging()
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+
+ # 添加文件处理器
+ fh = logging.FileHandler('paper_structure_extractor.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _cleanup_text(self, text: str) -> str:
+ """清理文本
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ str: 清理后的文本
+ """
+ if self.config.text_cleanup['remove_extra_spaces']:
+ text = ' '.join(text.split())
+
+ if self.config.text_cleanup['normalize_whitespace']:
+ text = text.replace('\t', ' ').replace('\r', '\n')
+
+ if self.config.text_cleanup['remove_special_chars']:
+ # 只保留字母、数字、基本标点和中文字符
+ text = re.sub(r'[^\w\s.,;:!?,。;:!?、\u4e00-\u9fff]', '', text)
+
+ if self.config.text_cleanup['lowercase']:
+ text = text.lower()
+
+ return text.strip()
+
+ @staticmethod
+ def get_supported_formats() -> List[str]:
+ """获取支持的文件格式列表"""
+ return sorted(PaperStructureExtractor.SUPPORTED_EXTENSIONS)
+
+ def extract_paper_structure(self, file_path: Union[str, Path], strategy: str = "fast") -> StructuredPaper:
+ """提取论文的完整结构化信息
+
+ Args:
+ file_path: 文件路径
+ strategy: 提取策略 ("fast" 或 "accurate")
+
+ Returns:
+ StructuredPaper: 结构化的论文数据
+
+ Raises:
+ Exception: 提取过程中的错误
+ """
+ try:
+ path = Path(file_path).resolve()
+ self.logger.info(f"正在处理论文结构: {path}")
+
+ # 创建结构化论文对象
+ paper = StructuredPaper()
+
+ # 提取元数据
+ paper.metadata = self.metadata_extractor.extract_metadata(path, strategy)
+
+ # 使用unstructured库分解文档
+ elements = partition(
+ str(path),
+ strategy=strategy,
+ include_metadata=True,
+ nlp=False,
+ )
+
+ # 提取关键词
+ paper.keywords = paper.metadata.keywords
+
+ # 提取章节结构
+ paper.sections = self._extract_sections(elements)
+
+ # 提取图表
+ if self.config.extract_figures:
+ paper.figures, paper.tables = self._extract_figures_and_tables(elements)
+
+ # 提取公式
+ if self.config.extract_formulas:
+ paper.formulas = self._extract_formulas(elements)
+
+ # 提取参考文献
+ if self.config.extract_references:
+ paper.references = self._extract_references(elements)
+
+ # 提取完整文本
+ paper.full_text = self._extract_full_text(elements)
+
+ return paper
+
+ except Exception as e:
+ self.logger.error(f"结构提取失败: {e}")
+ raise
+
+ def _extract_sections(self, elements) -> List[PaperSection]:
+ """提取论文的章节结构
+
+ Args:
+ elements: 文档元素列表
+
+ Returns:
+ List[PaperSection]: 章节列表
+ """
+ # 第一遍:识别所有标题元素
+ title_elements = []
+ for i, element in enumerate(elements):
+ if isinstance(element, Title):
+ title_text = str(element).strip()
+
+ # 添加过滤条件,排除非章节标题
+ if self._is_likely_section_title(title_text, element, i, elements):
+ section_type = self._identify_section_type(title_text)
+ level = self._estimate_title_level(element, elements)
+ title_elements.append((i, title_text, section_type, level, element))
+
+        # 按文档中出现的位置排序(位置相同时再按层级)
+        title_elements.sort(key=lambda x: (x[0], x[3]))
+
+ # 第二遍:创建章节内容
+ sections = []
+ for i, (index, title_text, section_type, level, element) in enumerate(title_elements):
+ # 提取章节内容
+ content = ""
+ if i < len(title_elements) - 1:
+ # 提取到下一章节开始
+ next_index = title_elements[i+1][0]
+ content = self._extract_content_between_indices(elements, index+1, next_index)
+ else:
+ # 提取到文档结束
+ content = self._extract_content_after_index(elements, index+1)
+
+ # 创建章节对象
+ section = PaperSection(
+ section_type=section_type,
+ title=title_text,
+ content=content,
+ level=level,
+ subsections=[]
+ )
+ sections.append(section)
+
+ # 构建章节层次结构
+ hierarchical_sections = self._build_section_hierarchy(sections)
+ return hierarchical_sections
+
+ def _is_likely_section_title(self, title_text: str, element, index: int, elements) -> bool:
+ """判断标题是否可能是章节标题"""
+ title_lower = title_text.lower()
+
+ # 首先检查是否在参考文献部分
+ if self._is_in_references_section(index, elements):
+ # 参考文献部分的标题处理策略:
+ # 1. 只有特定格式的标题才被接受
+ # 2. 通常参考文献中的内容不应被识别为标题
+
+ # 检查是否是有效的参考文献标题格式
+ valid_ref_title_patterns = [
+ r'^references$',
+ r'^bibliography$',
+ r'^参考文献$',
+ r'^\d+\.\s*references$',
+ r'^文献$',
+ r'^引用文献$'
+ ]
+
+ is_valid_ref_title = any(re.match(pattern, title_lower) for pattern in valid_ref_title_patterns)
+
+ # 在参考文献部分,除非是明确的子分类标题,否则都不认为是标题
+ if not is_valid_ref_title:
+ # 检查特定格式:常见的参考文献子类别
+ ref_subcategory_patterns = [
+ r'^primary\s+sources$',
+ r'^secondary\s+sources$',
+ r'^books$',
+ r'^journals$',
+ r'^conference\s+papers$',
+ r'^web\s+sources$',
+ r'^further\s+reading$',
+ r'^monographs$'
+ ]
+
+ is_ref_subcategory = any(re.match(pattern, title_lower) for pattern in ref_subcategory_patterns)
+
+ # 如果不是子类别标题,在参考文献部分很可能不是标题
+ if not is_ref_subcategory:
+ # 检查是否包含出版物特征(会议、期刊、年份等)
+ pub_features = [
+ r'conference', r'proceedings', r'journal', r'transactions',
+ r'symposium', r'workshop', r'international', r'annual',
+ r'\d{4}', r'pp\.', r'vol\.', r'pages', r'ieee', r'acm'
+ ]
+
+ has_pub_features = any(re.search(pattern, title_lower) for pattern in pub_features)
+
+ if has_pub_features:
+ return False
+
+ # 检查文本长度和格式特征
+ if len(title_text) > 50 or title_text.count(' ') > 10:
+ return False
+
+ # 检查是否包含DOI、arXiv等标识
+ if re.search(r'doi|arxiv|http|url|issn|isbn', title_lower):
+ return False
+
+ # 检查是否为数学表达式(例如"max θ")- 保留现有的模式检测
+ math_expr_patterns = [
+ r'^(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+[a-zA-Z\u0370-\u03FF\u0400-\u04FF θΘ]+$',
+ r'^E\s*\(', # 期望值表达式开头
+ r'^∑|∏|∫|∂|∇|∆', # 以数学符号开头
+ r'^\s*\([a-zA-Z0-9]\)\s*$', # 如 (a), (1) 等单个字母/数字的标识
+ ]
+
+ # 如果匹配任何数学表达式模式,不太可能是章节标题
+ for pattern in math_expr_patterns:
+ if re.search(pattern, title_text):
+ return False
+
+ # 检查标题文本本身是否过短(短标题通常不是章节标题,除非是明确的关键词)
+ if len(title_text) < 4 and not re.match(r'^(abstract|introduction|methods?|results?|discussion|conclusion|references)$', title_lower, re.IGNORECASE):
+ return False
+
+ # 标题中包含括号、大量符号等可能是公式
+ if re.search(r'[)}]n$|[([{)\]}].*[([{)\]}]|\d+[=><≥≤]|[a-z]\s*=', title_text):
+ return False
+
+ # =================== 增强后续内容长度检查 ===================
+ # 查找下一个非空元素
+ next_elements = []
+ total_followup_content = ""
+ next_title_index = -1
+
+ # 收集标题后的内容,直到遇到另一个标题或超过限制
+ for i in range(index+1, min(index+10, len(elements))):
+ if str(elements[i]).strip():
+ next_elements.append(elements[i])
+ if not isinstance(elements[i], Title):
+ total_followup_content += str(elements[i])
+ else:
+ next_title_index = i
+ break
+
+ # 核心检查:标题后内容长度判断
+ # 1. 如果后面没有内容,这不太可能是标题
+ if not next_elements:
+ return False
+
+ # 2. 如果后面第一个元素不是标题但内容很短(少于100字符)
+ if next_elements and not isinstance(next_elements[0], Title):
+ first_element_length = len(str(next_elements[0]))
+ # 检查是否存在第二个非标题元素,如果没有或内容同样很短
+ if (len(next_elements) == 1 or
+ (len(next_elements) > 1 and not isinstance(next_elements[1], Title) and
+ len(str(next_elements[1])) < 50)):
+ # 如果后续内容总长度小于阈值,可能不是真正的标题
+ if first_element_length < 100 and len(total_followup_content) < 150:
+ # 只有常见章节标题可以例外
+ section_type = self._identify_section_type(title_text)
+ main_sections = ['abstract', 'introduction', 'method', 'result',
+ 'discussion', 'conclusion', 'references', 'acknowledgement']
+ if section_type not in main_sections:
+ # 额外检查:如果紧接着的内容包含数学符号,更可能是公式的一部分
+ if re.search(r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡∀∃∄⊂⊃∈∉]|i\s*=|x\s*[ij]|y\s*[ij*]|\(\d+\)', str(next_elements[0])):
+ return False
+ # 检查标题文本是否包含可疑的数学符号或编号
+ if re.search(r'[(){}\[\]∑∏∫i]|^\w{1,2}$', title_text):
+ return False
+
+ # 最后根据总体内容长度判断
+ if len(total_followup_content) < 150:
+ return False
+
+ # 3. 如果后面第一个元素是标题,检查级别关系
+ elif next_elements and isinstance(next_elements[0], Title):
+ # 获取当前和下一个标题的级别
+ current_level = self._estimate_title_level(element, elements)
+ next_level = self._estimate_title_level(next_elements[0], elements)
+
+ # 如果下一个标题级别不是子标题(级别更大),当前标题可能是有问题的
+ if next_level <= current_level:
+ # 检查前后是否有更多数学内容
+ if self._surrounding_has_math_symbols(index, elements):
+ return False
+
+ # 对于非主要章节标题特别严格
+ section_type = self._identify_section_type(title_text)
+ if section_type not in ['abstract', 'introduction', 'method', 'result', 'discussion', 'conclusion', 'references']:
+ # 检查标题文本是否匹配常见章节编号模式
+ if not re.match(r'^\d+(\.\d+)*\.\s+', title_text):
+ return False
+
+ # 定义明确的非章节标题模式
+ non_section_patterns = [
+ r'received|accepted|submitted|revised|published',
+ r'key\s*words|keywords',
+ r'^(table|表)\s*\d+',
+ r'^(figure|fig\.|图)\s*\d+',
+ r'^p[- ]value', # P值通常不是章节
+ r'^(age|sex|gender|stage)(\s+|:)', # 表格中的变量名
+ r'male\s+female', # 表格内容
+ r'≤|≥', # 表格中的比较符号
+ r'^not applicable\.?$', # "Not applicable" 文本
+ r'^[t](\d+)', # T1, T2等肿瘤分期不是章节
+ r'^[nm](\d+)', # N0, M1等肿瘤分期不是章节
+ ]
+
+ # 如果匹配任何非章节模式,返回False
+ for pattern in non_section_patterns:
+ if re.search(pattern, title_lower, re.IGNORECASE):
+ return False
+
+ # 检查是否为表格内容的更强化逻辑
+
+ # 1. 检查前后文本模式 - 表格行通常有一定的模式
+
+ # 检查前面的元素 - 如果前面几个元素都是Title且长度相似,可能是表格
+ similar_title_count = 0
+ if index > 1:
+ for i in range(max(0, index-5), index):
+ if isinstance(elements[i], Title):
+ prev_title_text = str(elements[i]).strip()
+ # 检查长度是否相似
+ if 0.7 <= len(prev_title_text) / len(title_text) <= 1.3:
+ similar_title_count += 1
+ # 检查格式是否相似(例如都是由空格分隔的几个词)
+ if len(prev_title_text.split()) == len(title_text.split()):
+ similar_title_count += 1
+
+ # 检查后面的元素 - 如果后面几个元素都是Title且长度相似,可能是表格
+ if index < len(elements) - 1:
+ for i in range(index+1, min(index+5, len(elements))):
+ if isinstance(elements[i], Title):
+ next_title_text = str(elements[i]).strip()
+ # 检查长度是否相似
+ if 0.7 <= len(next_title_text) / len(title_text) <= 1.3:
+ similar_title_count += 1
+ # 检查格式是否相似
+ if len(next_title_text.split()) == len(title_text.split()):
+ similar_title_count += 1
+
+ # 如果周围有多个相似的Title元素,可能是表格内容
+ if similar_title_count >= 4:
+ return False
+
+ # 2. 检查内容特征 - 表格行通常有特定的特征
+
+ # 检查是否像表格数据行
+ if len(title_text) < 40: # 表格行通常不会太长
+ words = title_text.split()
+
+ # 表格可能格式: "项目 数值 数值" 或 "组别 n 百分比" 等
+ if 2 <= len(words) <= 6:
+ # 检查是否包含数字或百分比 - 表格行特征
+ has_numbers = any(re.search(r'\d', word) for word in words)
+ has_percentages = '%' in title_text
+
+ # 检查短词占比 - 表格行通常是短词
+ short_words_ratio = sum(1 for word in words if len(word) <= 5) / len(words)
+
+ # 综合判断
+ if (has_numbers or has_percentages) and short_words_ratio > 0.6:
+ # 再检查内容长度 - 表格行后通常没有长内容
+ followup_content_length = self._calculate_followup_content_length(index, elements, max_elements=3)
+ if followup_content_length < 100:
+ return False
+
+ # 3. 检查前后内容长度
+
+ # 计算前面内容长度
+ preceding_content_length = 0
+ for i in range(max(0, index-3), index):
+ if isinstance(elements[i], (Text, NarrativeText)):
+ preceding_content_length += len(str(elements[i]))
+
+ # 计算后面内容长度
+ followup_content_length = self._calculate_followup_content_length(index, elements)
+
+ # 真正的章节标题前面通常是另一章节的结尾(有少量文本)或文档开始,后面有大量文本
+ if preceding_content_length > 200 and followup_content_length < 150:
+ # 如果前面有大量文本,后面文本很少,可能不是章节标题
+ return False
+
+ # 标题应该有足够长的后续内容(除非是参考文献等特殊章节)
+ section_type = self._identify_section_type(title_text)
+ main_sections = ['abstract', 'introduction', 'method', 'result',
+ 'discussion', 'conclusion', 'references', 'acknowledgement']
+
+ if section_type in ['references', 'acknowledgement']:
+ return True # 特殊章节不需要内容长度检查
+
+ # 其他章节,根据章节类型和编号情况进行判断
+ if section_type in main_sections:
+ return followup_content_length >= 200 # 主要章节要求200字符以上
+ elif re.match(r'^\d+(\.\d+)*\.?\s+', title_text): # 带编号的章节
+ return followup_content_length >= 150 # 编号章节要求150字符以上
+ else:
+ return followup_content_length >= 300 # 其他可能章节要求300字符以上
+
+ def _calculate_followup_content_length(self, index: int, elements, max_elements: int = 10) -> int:
+ """计算标题后面的内容长度
+
+ Args:
+ index: 标题在元素列表中的索引
+ elements: 所有元素列表
+ max_elements: 最多检查后续多少个元素
+
+ Returns:
+ int: 内容长度
+ """
+ content_length = 0
+ for i in range(index + 1, min(index + max_elements + 1, len(elements))):
+ if isinstance(elements[i], Title):
+ # 如果遇到另一个标题,停止计算
+ break
+ if isinstance(elements[i], (Text, NarrativeText)):
+ content_length += len(str(elements[i]))
+ return content_length
+
+ def _identify_section_type(self, title_text: str) -> str:
+ """根据标题文本识别章节类型"""
+ title_lower = title_text.lower()
+
+ for section_type, pattern in self.PAPER_SECTION_PATTERNS.items():
+ if re.search(pattern, title_lower):
+ return section_type
+
+ # 尝试识别编号章节
+ if re.match(r'^(\d+\.|\d+\s+)', title_lower):
+ # A leading number suggests a body section
+ return "content"
+
+ return "other"
+
+ def _estimate_title_level(self, title_element, all_elements) -> int:
+ """估计标题的层级"""
+ title_text = str(title_element).strip()
+
+ # Infer the level from the numbering format in the heading text
+ # Look for patterns such as "1.", "1.1", "1.1.1", etc.
+ level_patterns = [
+ (r'^(\d+\.?\s+)', 1), # "1." or "1 " marks a level-1 heading
+ (r'^(\d+\.\d+\.?\s+)', 2), # "1.1." or "1.1 " marks a level-2 heading
+ (r'^(\d+\.\d+\.\d+\.?\s+)', 3), # "1.1.1." or "1.1.1 " marks a level-3 heading
+ (r'^(\d+\.\d+\.\d+\.\d+\.?\s+)', 4), # "1.1.1.1." or "1.1.1.1 " marks a level-4 heading
+ ]
+
+ for pattern, level in level_patterns:
+ if re.match(pattern, title_text):
+ return level
+
+ # Check whether the heading is one of the common main section titles
+ main_sections = {'abstract', 'introduction', 'method', 'result', 'discussion', 'conclusion', 'references'}
+ if self._identify_section_type(title_text) in main_sections:
+ return 1
+
+ # Check the font size (when available in the metadata)
+ if hasattr(title_element, 'metadata') and title_element.metadata:
+ try:
+ # Try to read the font size
+ font_size = getattr(title_element.metadata, 'font_size', None)
+ if font_size is not None:
+ # Map the font size to a level (larger fonts indicate higher-level headings)
+ if font_size > 16:
+ return 1
+ elif font_size > 14:
+ return 2
+ else:
+ return 3
+ except (AttributeError, TypeError):
+ pass
+
+ # Default to a level-1 heading
+ return 1
+
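+ # Illustrative level estimates under the numbering patterns above (hypothetical titles):
+ #
+ # "2 Related Work" -> 1
+ # "2.3 Loss Functions" -> 2
+ # "2.3.1 Margin Terms" -> 3
+ # "Abstract" -> 1 (recognized main section)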
+ def _extract_content_between_indices(self, elements, start_index: int, end_index: int) -> str:
+ """提取指定索引范围内的内容"""
+ content_parts = []
+
+ for i in range(start_index, end_index):
+ element = elements[i]
+ if isinstance(element, (Text, NarrativeText, ListItem, Table)):
+ content_parts.append(self._cleanup_text(str(element)))
+
+ return self.config.paragraph_separator.join(content_parts)
+
+ def _extract_content_after_index(self, elements, start_index: int) -> str:
+ """提取从指定索引到文档结束的内容"""
+ content_parts = []
+
+ for i in range(start_index, len(elements)):
+ element = elements[i]
+ if isinstance(element, (Text, NarrativeText, ListItem, Table)):
+ content_parts.append(self._cleanup_text(str(element)))
+
+ return self.config.paragraph_separator.join(content_parts)
+
+ def _build_section_hierarchy(self, sections: List[PaperSection]) -> List[PaperSection]:
+ """构建章节的层次结构"""
+ if not sections:
+ return []
+
+ # Walk the sections, tracking the current parent at each level
+ root_sections = []
+ current_parents = {0: None} # current parent node for each level
+
+ for section in sections:
+ # Find the parent node of the current section
+ parent_level = None
+ for level in sorted([k for k in current_parents.keys() if k < section.level], reverse=True):
+ parent_level = level
+ break
+
+ if parent_level is None:
+ # Top-level node
+ root_sections.append(section)
+ else:
+ # Attach as a child node
+ parent = current_parents[parent_level]
+ if parent:
+ parent.subsections.append(section)
+ else:
+ root_sections.append(section)
+
+ # Update the current parent for this level
+ current_parents[section.level] = section
+
+ # Clear cached parents for all deeper levels
+ deeper_levels = [k for k in current_parents.keys() if k > section.level]
+ for level in deeper_levels:
+ current_parents.pop(level, None)
+
+ return root_sections
+
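+ # Sketch of the parent-tracking walk above for section levels [1, 2, 3, 2, 1]
+ # (hypothetical section titles):
+ #
+ # A(1) -> root; A.1(2) -> child of A; A.1.a(3) -> child of A.1;
+ # A.2(2) -> child of A (the cached level-3 parent is cleared); B(1) -> root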
+ def _extract_figures_and_tables(self, elements) -> Tuple[List[Figure], List[Figure]]:
+ """提取文档中的图表信息"""
+ figures = []
+ tables = []
+
+ for i, element in enumerate(elements):
+ element_text = str(element).strip()
+
+ # Look for figure/table markers
+ fig_match = re.match(r'^(figure|fig\.|图)\s*(\d+)[.:](.*)', element_text, re.IGNORECASE)
+ table_match = re.match(r'^(table|表)\s*(\d+)[.:](.*)', element_text, re.IGNORECASE)
+
+ if fig_match:
+ fig_id = f"{fig_match.group(1)} {fig_match.group(2)}"
+ caption = fig_match.group(3).strip()
+
+ # Look for the figure description (usually right below the marker)
+ description = ""
+ for j in range(i+1, min(i+5, len(elements))):
+ next_text = str(elements[j]).strip()
+ if isinstance(elements[j], (Title, Table)) or re.match(r'^(figure|fig\.|table|图|表)\s*\d+', next_text, re.IGNORECASE):
+ break
+ description += next_text + " "
+
+ figures.append(Figure(
+ id=fig_id,
+ caption=caption,
+ content=description.strip(),
+ position=i
+ ))
+
+ elif table_match:
+ table_id = f"{table_match.group(1)} {table_match.group(2)}"
+ caption = table_match.group(3).strip()
+
+ # For tables, try to capture the table content
+ table_content = ""
+ if i+1 < len(elements) and isinstance(elements[i+1], Table):
+ table_content = str(elements[i+1])
+
+ tables.append(Figure(
+ id=table_id,
+ caption=caption,
+ content=table_content,
+ position=i
+ ))
+
+ # Check whether the element itself is a table
+ elif isinstance(element, Table):
+ # Look for the table caption (usually right before the table)
+ caption = ""
+ if i > 0:
+ prev_text = str(elements[i-1]).strip()
+ if re.match(r'^(table|表)\s*\d+', prev_text, re.IGNORECASE):
+ caption = prev_text
+
+ if not caption:
+ caption = f"Table {len(tables) + 1}"
+
+ tables.append(Figure(
+ id=f"Table {len(tables) + 1}",
+ caption=caption,
+ content=element_text,
+ position=i
+ ))
+
+ # Check whether the element itself is an image
+ elif isinstance(element, Image):
+ # Look for the figure caption (usually right before or after the image)
+ caption = ""
+ for j in range(max(0, i-2), min(i+3, len(elements))):
+ if j != i:
+ j_text = str(elements[j]).strip()
+ if re.match(r'^(figure|fig\.|图)\s*\d+', j_text, re.IGNORECASE):
+ caption = j_text
+ break
+
+ if not caption:
+ caption = f"Figure {len(figures) + 1}"
+
+ figures.append(Figure(
+ id=f"Figure {len(figures) + 1}",
+ caption=caption,
+ content="[Image]",
+ position=i
+ ))
+
+ return figures, tables
+
+ def _surrounding_has_math_symbols(self, index: int, elements, window: int = 3) -> bool:
+ """检查周围元素是否包含较多数学符号
+
+ Args:
+ index: 当前元素索引
+ elements: 所有元素
+ window: 检查的窗口大小
+
+ Returns:
+ bool: 是否包含较多数学符号
+ """
+ math_symbols = r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡∀∃∄⊂⊃∈∉θΘαβγδ\[\]\{\}]'
+
+ # Inspect up to window elements on each side
+ start = max(0, index - window)
+ end = min(len(elements), index + window + 1)
+
+ math_symbol_count = 0
+ total_text = ""
+
+ for i in range(start, end):
+ if i == index:
+ continue # skip the current element
+
+ if isinstance(elements[i], (Text, NarrativeText, Title)):
+ text = str(elements[i])
+ total_text += text
+ math_symbol_count += len(re.findall(math_symbols, text))
+
+ # Compute the math-symbol density
+ if total_text:
+ density = math_symbol_count / len(total_text)
+ return density > 0.05 # densities above 5% indicate mathematical content
+
+ return False
+
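+ # Worked example of the density test above (hypothetical window contents):
+ # 3 math symbols in 40 characters of surrounding text -> 3/40 = 0.075 > 0.05 -> True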
+ def _extract_formulas(self, elements) -> List[Formula]:
+ """提取文档中的公式"""
+ formulas = []
+ formula_pattern = r'^\s*\((\d+)\)\s*'
+
+ # Indices of elements that look like titles but are actually formulas
+ formula_title_indices = set()
+
+ # First pass: find formulas that may have been misclassified as titles
+ for i, element in enumerate(elements):
+ if isinstance(element, Title):
+ title_text = str(element).strip()
+
+ # Check whether the text matches a mathematical expression pattern
+ math_expr_patterns = [
+ r'^(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+[a-zA-Z\u0370-\u03FF\u0400-\u04FF θΘ]+$',
+ r'^E\s*\(', # expectation expressions
+ r'^∑|∏|∫|∂|∇|∆', # starts with a math symbol
+ ]
+
+ is_math_expr = any(re.search(pattern, title_text) for pattern in math_expr_patterns)
+
+ if is_math_expr:
+ # Decide whether this is a genuine heading
+ # 1. Check the length of the following elements
+ next_is_short = False
+ for j in range(i+1, min(i+3, len(elements))):
+ if isinstance(elements[j], (Text, NarrativeText)) and len(str(elements[j])) < 50:
+ next_is_short = True
+ break
+
+ # 2. Check whether the surroundings contain math symbols
+ surrounding_has_math = self._surrounding_has_math_symbols(i, elements)
+
+ if next_is_short or surrounding_has_math:
+ formula_title_indices.add(i)
+
+ # Second pass: extract all formulas, including those misclassified as titles
+ for i, element in enumerate(elements):
+ element_text = str(element).strip()
+ is_formula = False
+ formula_id = ""
+
+ # Handle formulas misclassified as titles
+ if i in formula_title_indices:
+ is_formula = True
+ formula_id = f"Formula-{len(formulas)+1}"
+ else:
+ # Standard numbered-formula detection
+ formula_match = re.match(formula_pattern, element_text)
+
+ if formula_match:
+ formula_id = f"({formula_match.group(1)})"
+ # Strip the formula number
+ element_text = re.sub(formula_pattern, '', element_text)
+ is_formula = True
+
+ if is_formula:
+ # Check whether following elements should be merged (e.g. when the title is "max θ", the next element is usually the rest of the formula)
+ merged_content = element_text
+ j = i + 1
+ while j < min(i+3, len(elements)):
+ next_elem = elements[j]
+ next_text = str(next_elem).strip()
+
+ # A short following element containing math symbols is probably part of the formula
+ if len(next_text) < 50 and re.search(r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡]', next_text):
+ merged_content += " " + next_text
+ j += 1
+ else:
+ break
+
+ formulas.append(Formula(
+ id=formula_id,
+ content=merged_content,
+ position=i
+ ))
+
+ return formulas
+
+ def _extract_references(self, elements) -> List[Reference]:
+ """提取文档中的参考文献"""
+ references = []
+
+ # First locate the references section
+ ref_section_start = -1
+ for i, element in enumerate(elements):
+ if isinstance(element, Title) and re.search(self.PAPER_SECTION_PATTERNS['references'], str(element), re.IGNORECASE):
+ ref_section_start = i
+ break
+
+ if ref_section_start == -1:
+ # No explicit references heading found; look near the end of the document
+ # References usually sit in the last 20% of a paper
+ start_pos = int(len(elements) * 0.8)
+ for i in range(start_pos, len(elements)):
+ element_text = str(elements[i]).strip()
+ # Common formatting features of reference entries
+ if re.match(r'^\[\d+\]|\(\d+\)|^\d+\.\s+[A-Z]', element_text):
+ ref_section_start = i
+ break
+
+ if ref_section_start != -1:
+ # Extract the list of references
+ current_ref = None
+ inside_ref = False # whether we are inside a reference entry
+
+ for i in range(ref_section_start + 1, len(elements)):
+ element = elements[i]
+
+ # Skip Title elements - these may be misidentified parts of references
+ if isinstance(element, Title):
+ # Check whether this title truly ends the references section
+ title_text = str(element).lower().strip()
+ if re.search(r'^(appendix|appendices|supplementary|acknowledgements?|附录|致谢)$', title_text):
+ # The next major section begins; stop extracting references
+ break
+
+ # Otherwise merge the title text into the current reference entry
+ if current_ref and inside_ref:
+ current_ref.text += " " + str(element)
+ continue
+
+ element_text = str(element).strip()
+ if not element_text:
+ continue
+
+ # Check whether this starts a new reference entry
+ ref_start_match = re.match(r'^\[(\d+)\]|\((\d+)\)|^(\d+)\.\s+', element_text)
+
+ if ref_start_match:
+ # Save the previous reference, if any
+ if current_ref and current_ref.text:
+ references.append(current_ref)
+ inside_ref = False
+
+ # Extract the reference ID
+ ref_id = ""
+ if ref_start_match.group(1): # "[1]" format
+ ref_id = f"[{ref_start_match.group(1)}]"
+ # Strip the ID prefix
+ element_text = re.sub(r'^\[\d+\]\s*', '', element_text)
+ elif ref_start_match.group(2): # "(1)" format
+ ref_id = f"({ref_start_match.group(2)})"
+ # Strip the ID prefix
+ element_text = re.sub(r'^\(\d+\)\s*', '', element_text)
+ elif ref_start_match.group(3): # "1." format
+ ref_id = f"{ref_start_match.group(3)}."
+ # Strip the ID prefix
+ element_text = re.sub(r'^\d+\.\s+', '', element_text)
+
+ # Create a new reference entry
+ current_ref = Reference(id=ref_id, text=element_text)
+ inside_ref = True
+
+ # Try to extract the authors and year
+ author_year_match = re.match(r'^([^,]+),\s*(?:\()?(\d{4})(?:\))?', element_text)
+ if author_year_match:
+ authors_text = author_year_match.group(1).strip()
+ # Try to split multiple authors
+ authors = [a.strip() for a in re.split(r',|and|&|;|、|等', authors_text) if a.strip()]
+ current_ref.authors = authors
+ current_ref.year = author_year_match.group(2)
+
+ elif current_ref and inside_ref:
+ # Continue the current reference entry
+ current_ref.text += " " + element_text
+
+ # Append the last reference
+ if current_ref and current_ref.text:
+ references.append(current_ref)
+
+ return references
+
+ def _extract_full_text(self, elements) -> str:
+ """提取文档的完整文本"""
+ text_parts = []
+
+ for element in elements:
+ if isinstance(element, (Text, NarrativeText, Title, ListItem, Table)):
+ text = str(element).strip()
+ if text:
+ text_parts.append(self._cleanup_text(text))
+
+ return self.config.paragraph_separator.join(text_parts)
+
+ def generate_markdown(self, paper: StructuredPaper) -> str:
+ """将论文结构化数据转换为Markdown格式
+
+ Args:
+ paper: 结构化论文数据对象
+
+ Returns:
+ str: 完整的Markdown格式论文文本
+ """
+ md_parts = []
+
+ # Title and author information
+ md_parts.append(f"# {paper.metadata.title}\n")
+
+ if paper.metadata.authors:
+ authors_str = ", ".join(paper.metadata.authors)
+ md_parts.append(f"**作者:** {authors_str}\n")
+
+ # Publication info
+ pub_info = []
+ if hasattr(paper.metadata, 'journal') and paper.metadata.journal:
+ pub_info.append(paper.metadata.journal)
+ if hasattr(paper.metadata, 'publication_date') and paper.metadata.publication_date:
+ pub_info.append(paper.metadata.publication_date)
+ elif hasattr(paper.metadata, 'date') and paper.metadata.date:
+ pub_info.append(paper.metadata.date)
+ elif hasattr(paper.metadata, 'year') and paper.metadata.year:
+ pub_info.append(paper.metadata.year)
+
+ if pub_info:
+ md_parts.append(f"**发表信息:** {', '.join(pub_info)}\n")
+
+ # DOI and URL
+ if hasattr(paper.metadata, 'doi') and paper.metadata.doi:
+ md_parts.append(f"**DOI:** {paper.metadata.doi}\n")
+ if hasattr(paper.metadata, 'url') and paper.metadata.url:
+ md_parts.append(f"**URL:** {paper.metadata.url}\n")
+
+ # Abstract
+ abstract_section = next((s for s in paper.sections if s.section_type == 'abstract'), None)
+ if abstract_section:
+ md_parts.append(f"## 摘要\n\n{abstract_section.content}\n")
+ elif hasattr(paper.metadata, 'abstract') and paper.metadata.abstract:
+ md_parts.append(f"## 摘要\n\n{paper.metadata.abstract}\n")
+
+ # Keywords
+ if paper.keywords:
+ md_parts.append(f"**关键词:** {', '.join(paper.keywords)}\n")
+
+ # Section content
+ md_parts.append(self._format_sections_markdown(paper.sections))
+
+ # Figures and tables
+ if paper.figures:
+ md_parts.append("## 图\n")
+ for fig in paper.figures:
+ md_parts.append(f"### {fig.id}: {fig.caption}\n\n{fig.content}\n")
+
+ if paper.tables:
+ md_parts.append("## 表\n")
+ for table in paper.tables:
+ md_parts.append(f"### {table.id}: {table.caption}\n\n{table.content}\n")
+
+ # Formulas
+ if paper.formulas:
+ md_parts.append("## 公式\n")
+ for formula in paper.formulas:
+ # Wrap the formula content in a code block instead of a heading
+ formatted_content = self._format_formula_content(formula.content)
+ md_parts.append(f"**{formula.id}**\n\n```math\n{formatted_content}\n```\n")
+
+ # References
+ if paper.references:
+ md_parts.append("## 参考文献\n")
+ for ref in paper.references:
+ md_parts.append(f"{ref.id} {ref.text}\n")
+
+ return "\n".join(md_parts)
+
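+ # Usage sketch (hypothetical paths; extract_paper_structure is the entry point
+ # used in main() below):
+ #
+ # extractor = PaperStructureExtractor()
+ # paper = extractor.extract_paper_structure("paper.pdf")
+ # Path("paper.md").write_text(extractor.generate_markdown(paper), encoding="utf-8")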
+ def _format_sections_markdown(self, sections: List[PaperSection], level: int = 0) -> str:
+ """递归格式化章节内容为Markdown
+
+ Args:
+ sections: 章节列表
+ level: 当前章节级别
+
+ Returns:
+ str: 格式化后的Markdown文本
+ """
+ if not sections:
+ return ""
+
+ md_parts = []
+ for section in sections:
+ # Compute the heading level (Markdown supports at most 6 levels)
+ header_level = min(section.level + 2, 6) # +2 because the paper title is h1 and the abstract is h2
+ header_marks = '#' * header_level
+
+ # Skip sections already rendered as the abstract
+ if level == 0 and section.section_type == 'abstract':
+ continue
+
+ # Add the section heading and content
+ md_parts.append(f"{header_marks} {section.title}\n")
+ if section.content:
+ md_parts.append(f"{section.content}\n")
+
+ # Recurse into subsections
+ if section.subsections:
+ md_parts.append(self._format_sections_markdown(
+ section.subsections, level + 1))
+
+ return "\n".join(md_parts)
+
+ def _format_formula_content(self, content: str) -> str:
+ """
+ Format formula content so it is not misinterpreted as Markdown syntax
+
+ Args:
+ content: raw formula content
+
+ Returns:
+ str: formatted formula content
+ """
+ # Strip prefixes that could break Markdown formatting
+ content = re.sub(r'^#+\s*', '', content)
+
+ # Remove "(cid:X)" sequences, which are usually PDF parsing artifacts
+ content = re.sub(r'\(cid:\d+\)', '', content)
+
+ # Merge multi-line formulas into a single line (if needed)
+ content = re.sub(r'\s*\n\s*', ' ', content)
+
+ # Keep keywords such as "max" and "min" attached to their operands;
+ # this specifically handles cases like "max θ"
+ content = re.sub(r'(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+([a-zA-Z\u0370-\u03FF\u0400-\u04FF]+)', r'\1_{\2}', content)
+
+ return content.strip()
+
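+ # Approximate effect of the rewrites above on a garbled PDF extraction
+ # (hypothetical input):
+ #
+ # "## max θ (cid:88) E(R|θ)" -> "max_{θ} E(R|θ)"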
+ def _is_in_references_section(self, index: int, elements) -> bool:
+ """判断元素是否位于参考文献部分
+
+ Args:
+ index: 当前元素索引
+ elements: 所有元素列表
+
+ Returns:
+ bool: 是否在参考文献部分
+ """
+ # Method 1: look backwards for an explicit references heading
+ for i in range(index-1, max(0, index-100), -1):
+ if isinstance(elements[i], Title):
+ title_text = str(elements[i]).lower().strip()
+ if re.search(r'^(references|bibliography|参考文献|引用|文献)(\s|$)', title_text):
+ return True
+ # Also check numbered forms
+ if re.match(r'^\d+\.\s*(references|bibliography|参考文献)', title_text):
+ return True
+
+ # Method 2: positional heuristic (references usually sit at the end of a paper)
+ if index > len(elements) * 0.75: # in the last quarter of the document
+ # Check the surrounding text for reference-like features
+ ref_features = 0
+ window = 20 # number of elements to inspect on each side
+
+ start = max(0, index - window)
+ end = min(len(elements), index + window)
+
+ for i in range(start, end):
+ if i == index:
+ continue
+
+ text = str(elements[i]).lower()
+
+ # Check for reference features
+ if re.search(r'\[\d+\]|\(\d{4}\)|et\s+al\.', text):
+ ref_features += 1
+ if re.search(r'proceedings|journal|conference|transactions|vol\.|pp\.', text):
+ ref_features += 1
+ if re.search(r'doi:|arxiv:|https?://|ieee|acm|springer', text):
+ ref_features += 1
+
+ # Enough reference-like features in the surrounding text
+ if ref_features >= 5:
+ return True
+
+ return False
+
+
+def main():
+ """主函数:演示用法"""
+ # 创建提取器
+ extractor = PaperStructureExtractor()
+
+ # Usage example
+ try:
+ # Replace with an actual file path
+ sample_file = '/Users/boyin.liu/Documents/示例文档/论文/3.pdf'
+ if Path(sample_file).exists():
+ paper = extractor.extract_paper_structure(sample_file)
+
+ print("\n===== 论文结构化信息 =====")
+ print(f"标题: {paper.metadata.title}")
+ print(f"作者: {', '.join(paper.metadata.authors)}")
+
+ print("\n--- 章节结构 ---")
+ for i, section in enumerate(paper.sections):
+ print(f"{i+1}. {section.title} ({section.section_type})")
+ if section.subsections:
+ for j, subsection in enumerate(section.subsections):
+ print(f" {i+1}.{j+1} {subsection.title}")
+
+ print("\n--- 图表 ---")
+ print(f"图: {len(paper.figures)}")
+ for i, fig in enumerate(paper.figures[:3]):
+ print(f"图 {i+1}: {fig.caption[:50]}...")
+
+ print(f"\n表: {len(paper.tables)}")
+ for i, table in enumerate(paper.tables[:3]):
+ print(f"表 {i+1}: {table.caption[:50]}...")
+
+ print(f"\n--- 公式: {len(paper.formulas)} ---")
+ for i, formula in enumerate(paper.formulas[:3]):
+ print(f"公式 {formula.id}: {formula.content[:30]}...")
+
+ print(f"\n--- 参考文献: {len(paper.references)} ---")
+ for i, ref in enumerate(paper.references[:5]):
+ print(f"{ref.id} {ref.text[:50]}...")
+
+ print("\n--- 摘要 ---")
+ abstract_section = next((s for s in paper.sections if s.section_type == 'abstract'), None)
+ if abstract_section:
+ print(abstract_section.content[:200] + "...")
+ else:
+ print(paper.metadata.abstract[:200] + "...")
+
+ else:
+ print(f"示例文件 {sample_file} 不存在")
+
+ print("\n支持的格式:", extractor.get_supported_formats())
+
+ except Exception as e:
+ print(f"错误: {e}")
+ import traceback
+ traceback.print_exc()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py
new file mode 100644
index 00000000..78c48eec
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py
@@ -0,0 +1,86 @@
+from pathlib import Path
+from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import PaperStructureExtractor
+
+def extract_and_save_as_markdown(paper_path, output_path=None):
+ """
+ Extract the paper structure and save it as Markdown
+
+ Args:
+ paper_path: path to the paper file
+ output_path: path of the output Markdown file; if omitted, the input file name is reused with a .md extension
+
+ Returns:
+ path of the saved Markdown file
+ """
+ # Create the extractor
+ extractor = PaperStructureExtractor()
+
+ # Resolve the file path
+ paper_path = Path(paper_path)
+
+ # If no output path was given, reuse the input file name with a .md extension
+ if output_path is None:
+ output_path = paper_path.with_suffix('.md')
+ else:
+ output_path = Path(output_path)
+
+ # Make sure the output directory exists
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ print(f"正在处理论文: {paper_path}")
+
+ try:
+ # Extract the paper structure
+ paper = extractor.extract_paper_structure(paper_path)
+
+ # Generate the Markdown content
+ markdown_content = extractor.generate_markdown(paper)
+
+ # Save to file
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(markdown_content)
+
+ print(f"已成功保存Markdown文件: {output_path}")
+
+ # Print summary information
+ print("\n论文摘要信息:")
+ print(f"标题: {paper.metadata.title}")
+ print(f"作者: {', '.join(paper.metadata.authors)}")
+ print(f"关键词: {', '.join(paper.keywords)}")
+ print(f"章节数: {len(paper.sections)}")
+ print(f"图表数: {len(paper.figures)}")
+ print(f"表格数: {len(paper.tables)}")
+ print(f"公式数: {len(paper.formulas)}")
+ print(f"参考文献数: {len(paper.references)}")
+
+ return output_path
+
+ except Exception as e:
+ print(f"处理论文时出错: {e}")
+ import traceback
+ traceback.print_exc()
+ return None
+
+# Usage example
+if __name__ == "__main__":
+ # Replace with an actual paper file path
+ sample_paper = "crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf"
+
+ # An output path can be given explicitly, or the default can be used
+ # output_file = "/path/to/output/paper_structure.md"
+ # extract_and_save_as_markdown(sample_paper, output_file)
+
+ # Use the default output path (input file name with a .md extension)
+ extract_and_save_as_markdown(sample_paper)
+
+ # # Example of batch-processing multiple papers
+ # paper_dir = Path("/path/to/papers/folder")
+ # output_dir = Path("/path/to/output/folder")
+ #
+ # # Make sure the output directory exists
+ # output_dir.mkdir(parents=True, exist_ok=True)
+ #
+ # # Process every PDF file in the directory
+ # for paper_file in paper_dir.glob("*.pdf"):
+ # output_file = output_dir / f"{paper_file.stem}.md"
+ # extract_and_save_as_markdown(paper_file, output_file)
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py
new file mode 100644
index 00000000..7b39696b
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py
@@ -0,0 +1,275 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Set, Dict, Union, List
+from dataclasses import dataclass, field
+import logging
+import os
+
+from unstructured.partition.auto import partition
+from unstructured.documents.elements import (
+ Text, Title, NarrativeText, ListItem, Table,
+ Footer, Header, PageBreak, Image, Address
+)
+
+
+@dataclass
+class TextExtractorConfig:
+ """通用文档提取器配置类
+
+ Attributes:
+ extract_headers_footers: 是否提取页眉页脚
+ extract_tables: 是否提取表格内容
+ extract_lists: 是否提取列表内容
+ extract_titles: 是否提取标题
+ paragraph_separator: 段落之间的分隔符
+ text_cleanup: 文本清理选项字典
+ """
+ extract_headers_footers: bool = False
+ extract_tables: bool = True
+ extract_lists: bool = True
+ extract_titles: bool = True
+ paragraph_separator: str = '\n\n'
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+
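+# A minimal configuration sketch (fields as defined above; values are examples):
+#
+# config = TextExtractorConfig(extract_headers_footers=True,
+# paragraph_separator='\n')
+# extractor = UnstructuredTextExtractor(config)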
+
+class UnstructuredTextExtractor:
+ """通用文档文本内容提取器
+
+ 使用 unstructured 库支持多种文档格式的文本提取,提供统一的接口和配置选项。
+ """
+
+ SUPPORTED_EXTENSIONS: Set[str] = {
+ # Documents
+ '.pdf', '.docx', '.doc', '.txt',
+ # Presentations
+ '.ppt', '.pptx',
+ # Spreadsheets
+ '.xlsx', '.xls', '.csv',
+ # Images
+ '.png', '.jpg', '.jpeg', '.tiff',
+ # Email
+ '.eml', '.msg', '.p7s',
+ # Markdown
+ ".md",
+ # Org Mode
+ ".org",
+ # Open Office
+ ".odt",
+ # reStructured Text
+ ".rst",
+ # Rich Text
+ ".rtf",
+ # TSV
+ ".tsv",
+ # EPUB
+ '.epub',
+ # Other formats
+ '.html', '.xml', '.json',
+ }
+
+ def __init__(self, config: Optional[TextExtractorConfig] = None):
+ """初始化提取器
+
+ Args:
+ config: 提取器配置对象,如果为None则使用默认配置
+ """
+ self.config = config or TextExtractorConfig()
+ self._setup_logging()
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+
+ # Add a file handler
+ fh = logging.FileHandler('text_extractor.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _validate_file(self, file_path: Union[str, Path], max_size_mb: int = 100) -> Path:
+ """验证文件
+
+ Args:
+ file_path: 文件路径
+ max_size_mb: 允许的最大文件大小(MB)
+
+ Returns:
+ Path: 验证后的Path对象
+
+ Raises:
+ ValueError: 文件不存在、格式不支持或大小超限
+ PermissionError: 没有读取权限
+ """
+ path = Path(file_path).resolve()
+
+ if not path.exists():
+ raise ValueError(f"File not found: {path}")
+
+ if not path.is_file():
+ raise ValueError(f"Not a file: {path}")
+
+ if not os.access(path, os.R_OK):
+ raise PermissionError(f"No read permission: {path}")
+
+ file_size_mb = path.stat().st_size / (1024 * 1024)
+ if file_size_mb > max_size_mb:
+ raise ValueError(
+ f"File size ({file_size_mb:.1f}MB) exceeds limit of {max_size_mb}MB"
+ )
+
+ if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
+ raise ValueError(
+ f"Unsupported format: {path.suffix}. "
+ f"Supported: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}"
+ )
+
+ return path
+
+ def _cleanup_text(self, text: str) -> str:
+ """清理文本
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ str: 清理后的文本
+ """
+ if self.config.text_cleanup['remove_extra_spaces']:
+ text = ' '.join(text.split())
+
+ if self.config.text_cleanup['normalize_whitespace']:
+ text = text.replace('\t', ' ').replace('\r', '\n')
+
+ if self.config.text_cleanup['lowercase']:
+ text = text.lower()
+
+ return text.strip()
+
+ def _should_extract_element(self, element) -> bool:
+ """判断是否应该提取某个元素
+
+ Args:
+ element: 文档元素
+
+ Returns:
+ bool: 是否应该提取
+ """
+ if isinstance(element, (Text, NarrativeText)):
+ return True
+
+ if isinstance(element, Title) and self.config.extract_titles:
+ return True
+
+ if isinstance(element, ListItem) and self.config.extract_lists:
+ return True
+
+ if isinstance(element, Table) and self.config.extract_tables:
+ return True
+
+ if isinstance(element, (Header, Footer)) and self.config.extract_headers_footers:
+ return True
+
+ return False
+
+ @staticmethod
+ def get_supported_formats() -> List[str]:
+ """获取支持的文件格式列表"""
+ return sorted(UnstructuredTextExtractor.SUPPORTED_EXTENSIONS)
+
+ def extract_text(
+ self,
+ file_path: Union[str, Path],
+ strategy: str = "fast"
+ ) -> str:
+ """提取文本
+
+ Args:
+ file_path: 文件路径
+ strategy: 提取策略 ("fast" 或 "accurate")
+
+ Returns:
+ str: 提取的文本内容
+
+ Raises:
+ Exception: 提取过程中的错误
+ """
+ try:
+ path = self._validate_file(file_path)
+ self.logger.info(f"Processing: {path}")
+
+ # Pass nlp=False to skip NLTK-based processing
+ elements = partition(
+ str(path),
+ strategy=strategy,
+ include_metadata=True,
+ nlp=False,
+ )
+
+ text_parts = []
+ for element in elements:
+ if self._should_extract_element(element):
+ text = str(element)
+ cleaned_text = self._cleanup_text(text)
+ if cleaned_text:
+ if isinstance(element, (Header, Footer)):
+ prefix = "[Header] " if isinstance(element, Header) else "[Footer] "
+ text_parts.append(f"{prefix}{cleaned_text}")
+ else:
+ text_parts.append(cleaned_text)
+
+ return self.config.paragraph_separator.join(text_parts)
+
+ except Exception as e:
+ self.logger.error(f"Extraction failed: {e}")
+ raise
+
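+# Usage sketch (hypothetical path; "fast" and "hi_res" are strategy values
+# accepted by unstructured's partition()):
+#
+# extractor = UnstructuredTextExtractor()
+# text = extractor.extract_text("report.docx", strategy="fast")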
+
+
+def main():
+ """主函数:演示用法"""
+ # 配置
+ config = TextExtractorConfig(
+ extract_headers_footers=True,
+ extract_tables=True,
+ extract_lists=True,
+ extract_titles=True,
+ text_cleanup={
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ }
+ )
+
+ # Create the extractor
+ extractor = UnstructuredTextExtractor(config)
+
+ # Usage example
+ try:
+ # Replace with an actual file path
+ sample_file = './crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf'
+ if Path(sample_file).exists():
+ text = extractor.extract_text(sample_file)
+ print("提取的文本:")
+ print(text)
+ else:
+ print(f"示例文件 {sample_file} 不存在")
+
+ print("\n支持的格式:", extractor.get_supported_formats())
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/web_reader.py b/crazy_functions/doc_fns/read_fns/web_reader.py
new file mode 100644
index 00000000..33c78286
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/web_reader.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Dict, Optional, Union
+from urllib.parse import urlparse
+import logging
+import trafilatura
+import requests
+from pathlib import Path
+
+
+@dataclass
+class WebExtractorConfig:
+ """网页内容提取器配置类
+
+ Attributes:
+ extract_comments: 是否提取评论
+ extract_tables: 是否提取表格
+ extract_links: 是否保留链接信息
+ paragraph_separator: 段落分隔符
+ timeout: 网络请求超时时间(秒)
+ max_retries: 最大重试次数
+ user_agent: 自定义User-Agent
+ text_cleanup: 文本清理选项
+ """
+ extract_comments: bool = False
+ extract_tables: bool = True
+ extract_links: bool = False
+ paragraph_separator: str = '\n\n'
+ timeout: int = 10
+ max_retries: int = 3
+ user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+ text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ })
+
+
+class WebTextExtractor:
+ """网页文本内容提取器
+
+ 使用trafilatura库提取网页中的主要文本内容,去除广告、导航等无关内容。
+ """
+
+ def __init__(self, config: Optional[WebExtractorConfig] = None):
+ """初始化提取器
+
+ Args:
+ config: 提取器配置对象,如果为None则使用默认配置
+ """
+ self.config = config or WebExtractorConfig()
+ self._setup_logging()
+
+ def _setup_logging(self) -> None:
+ """配置日志记录器"""
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ self.logger = logging.getLogger(__name__)
+
+ # Add a file handler
+ fh = logging.FileHandler('web_extractor.log')
+ fh.setLevel(logging.ERROR)
+ self.logger.addHandler(fh)
+
+ def _validate_url(self, url: str) -> bool:
+ """验证URL格式是否有效
+
+ Args:
+ url: 网页URL
+
+ Returns:
+ bool: URL是否有效
+ """
+ try:
+ result = urlparse(url)
+ return all([result.scheme, result.netloc])
+ except Exception:
+ return False
+
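+ # Expected behaviour of the check above (hypothetical inputs):
+ #
+ # "https://arxiv.org/abs/2412.00036" -> True (scheme and netloc present)
+ # "arxiv.org/abs/2412.00036" -> False (no scheme)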
+ def _download_webpage(self, url: str) -> Optional[str]:
+ """下载网页内容
+
+ Args:
+ url: 网页URL
+
+ Returns:
+ Optional[str]: 网页HTML内容,失败返回None
+
+ Raises:
+ Exception: 下载失败时抛出异常
+ """
+ headers = {'User-Agent': self.config.user_agent}
+
+ for attempt in range(self.config.max_retries):
+ try:
+ response = requests.get(
+ url,
+ headers=headers,
+ timeout=self.config.timeout
+ )
+ response.raise_for_status()
+ return response.text
+ except requests.RequestException as e:
+ self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
+ if attempt == self.config.max_retries - 1:
+ raise Exception(f"Failed to download webpage after {self.config.max_retries} attempts: {e}")
+ return None
+
+ def _cleanup_text(self, text: str) -> str:
+ """清理文本
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ str: 清理后的文本
+ """
+ if not text:
+ return ""
+
+ if self.config.text_cleanup['remove_extra_spaces']:
+ text = ' '.join(text.split())
+
+ if self.config.text_cleanup['normalize_whitespace']:
+ text = text.replace('\t', ' ').replace('\r', '\n')
+
+ if self.config.text_cleanup['lowercase']:
+ text = text.lower()
+
+ return text.strip()
+
+ def extract_text(self, url: str) -> str:
+ """提取网页文本内容
+
+ Args:
+ url: 网页URL
+
+ Returns:
+ str: 提取的文本内容
+
+ Raises:
+ ValueError: URL无效时抛出
+ Exception: 提取失败时抛出
+ """
+ try:
+ if not self._validate_url(url):
+ raise ValueError(f"Invalid URL: {url}")
+
+ self.logger.info(f"Processing URL: {url}")
+
+ # Download the page
+ html_content = self._download_webpage(url)
+ if not html_content:
+ raise Exception("Failed to download webpage")
+
+ # Configure trafilatura extraction options
+ extract_config = {
+ 'include_comments': self.config.extract_comments,
+ 'include_tables': self.config.extract_tables,
+ 'include_links': self.config.extract_links,
+ 'no_fallback': False, # allow fallback extractors
+ }
+
+ # Extract the text
+ extracted_text = trafilatura.extract(
+ html_content,
+ **extract_config
+ )
+
+ if not extracted_text:
+ raise Exception("No content could be extracted")
+
+ # Clean up the text
+ cleaned_text = self._cleanup_text(extracted_text)
+
+ return cleaned_text
+
+ except Exception as e:
+ self.logger.error(f"Extraction failed: {e}")
+ raise
+
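+# Usage sketch (the URL matches the demo in main() below):
+#
+# extractor = WebTextExtractor()
+# text = extractor.extract_text("https://arxiv.org/abs/2412.00036")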
+
+def main():
+ """主函数:演示用法"""
+ # 配置
+ config = WebExtractorConfig(
+ extract_comments=False,
+ extract_tables=True,
+ extract_links=False,
+ timeout=10,
+ text_cleanup={
+ 'remove_extra_spaces': True,
+ 'normalize_whitespace': True,
+ 'remove_special_chars': False,
+ 'lowercase': False
+ }
+ )
+
+ # Create the extractor
+ extractor = WebTextExtractor(config)
+
+ # Usage example
+ try:
+ # Replace with an actual URL
+ sample_url = 'https://arxiv.org/abs/2412.00036'
+ text = extractor.extract_text(sample_url)
+ print("提取的文本:")
+ print(text)
+
+ except Exception as e:
+ print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+ main()