From f42aad5093aef4510ab395ee00d5aeaca769347e Mon Sep 17 00:00:00 2001 From: binary-husky Date: Wed, 4 Jun 2025 00:20:09 +0800 Subject: [PATCH] implement doc_fns --- crazy_functions/doc_fns/AI_review_doc.py | 812 +++++++++++ crazy_functions/doc_fns/__init__.py | 0 .../doc_fns/batch_file_query_doc.py | 812 +++++++++++ crazy_functions/doc_fns/content_folder.py | 237 ++++ .../doc_fns/conversation_doc/excel_doc.py | 211 +++ .../doc_fns/conversation_doc/html_doc.py | 190 +++ .../doc_fns/conversation_doc/markdown_doc.py | 39 + .../doc_fns/conversation_doc/pdf_doc.py | 172 +++ .../doc_fns/conversation_doc/txt_doc.py | 79 ++ .../doc_fns/conversation_doc/word2pdf.py | 155 +++ .../doc_fns/conversation_doc/word_doc.py | 177 +++ crazy_functions/doc_fns/read_fns/__init__.py | 0 .../doc_fns/read_fns/docx_reader.py | 6 + .../doc_fns/read_fns/excel_reader.py | 286 ++++ .../read_fns/markitdown/markdown_reader.py | 359 +++++ .../read_fns/unstructured_all/__init__.py | 0 .../paper_metadata_extractor.py | 493 +++++++ .../paper_structure_extractor.py | 1220 +++++++++++++++++ .../unstructured_all/unstructured_md.py | 86 ++ .../unstructured_all/unstructured_reader.py | 275 ++++ .../doc_fns/read_fns/web_reader.py | 219 +++ 21 files changed, 5828 insertions(+) create mode 100644 crazy_functions/doc_fns/AI_review_doc.py create mode 100644 crazy_functions/doc_fns/__init__.py create mode 100644 crazy_functions/doc_fns/batch_file_query_doc.py create mode 100644 crazy_functions/doc_fns/content_folder.py create mode 100644 crazy_functions/doc_fns/conversation_doc/excel_doc.py create mode 100644 crazy_functions/doc_fns/conversation_doc/html_doc.py create mode 100644 crazy_functions/doc_fns/conversation_doc/markdown_doc.py create mode 100644 crazy_functions/doc_fns/conversation_doc/pdf_doc.py create mode 100644 crazy_functions/doc_fns/conversation_doc/txt_doc.py create mode 100644 crazy_functions/doc_fns/conversation_doc/word2pdf.py create mode 100644 crazy_functions/doc_fns/conversation_doc/word_doc.py create mode 100644 crazy_functions/doc_fns/read_fns/__init__.py create mode 100644 crazy_functions/doc_fns/read_fns/docx_reader.py create mode 100644 crazy_functions/doc_fns/read_fns/excel_reader.py create mode 100644 crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py create mode 100644 crazy_functions/doc_fns/read_fns/unstructured_all/__init__.py create mode 100644 crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py create mode 100644 crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py create mode 100644 crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py create mode 100644 crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py create mode 100644 crazy_functions/doc_fns/read_fns/web_reader.py diff --git a/crazy_functions/doc_fns/AI_review_doc.py b/crazy_functions/doc_fns/AI_review_doc.py new file mode 100644 index 00000000..483ffc63 --- /dev/null +++ b/crazy_functions/doc_fns/AI_review_doc.py @@ -0,0 +1,812 @@ +import os +import time +from abc import ABC, abstractmethod +from datetime import datetime +from docx import Document +from docx.enum.style import WD_STYLE_TYPE +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING +from docx.oxml.ns import qn +from docx.shared import Inches, Cm +from docx.shared import Pt, RGBColor, Inches +from typing import Dict, List, Tuple +import markdown +from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word + + + +class DocumentFormatter(ABC): + 
"""文档格式化基类,定义文档格式化的基本接口""" + + def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]): + self.final_summary = final_summary + self.file_summaries_map = file_summaries_map + self.failed_files = failed_files + + @abstractmethod + def format_failed_files(self) -> str: + """格式化失败文件列表""" + pass + + @abstractmethod + def format_file_summaries(self) -> str: + """格式化文件总结内容""" + pass + + @abstractmethod + def create_document(self) -> str: + """创建完整文档""" + pass + + +class WordFormatter(DocumentFormatter): + """Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.doc = Document() + self._setup_document() + self._create_styles() + # 初始化三级标题编号系统 + self.numbers = { + 1: 0, # 一级标题编号 + 2: 0, # 二级标题编号 + 3: 0 # 三级标题编号 + } + + def _setup_document(self): + """设置文档基本格式,包括页面设置和页眉""" + sections = self.doc.sections + for section in sections: + # 设置页面大小为A4 + section.page_width = Cm(21) + section.page_height = Cm(29.7) + # 设置页边距 + section.top_margin = Cm(3.7) # 上边距37mm + section.bottom_margin = Cm(3.5) # 下边距35mm + section.left_margin = Cm(2.8) # 左边距28mm + section.right_margin = Cm(2.6) # 右边距26mm + # 设置页眉页脚距离 + section.header_distance = Cm(2.0) + section.footer_distance = Cm(2.0) + + # 添加页眉 + header = section.header + header_para = header.paragraphs[0] + header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT + header_run = header_para.add_run("该文档由GPT-academic生成") + header_run.font.name = '仿宋' + header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + header_run.font.size = Pt(9) + + def _create_styles(self): + """创建文档样式""" + # 创建正文样式 + style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH) + style.font.name = '仿宋' + style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + style.font.size = Pt(14) + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + style.paragraph_format.space_after = Pt(0) + style.paragraph_format.first_line_indent = Pt(28) + + # 创建各级标题样式 + self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER) + self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT) + self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT) + self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT) + + def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment): + """创建标题样式""" + style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH) + style.font.name = font_name + style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + style.font.size = Pt(font_size) + style.font.bold = True + style.paragraph_format.alignment = alignment + style.paragraph_format.space_before = Pt(12) + style.paragraph_format.space_after = Pt(12) + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + return style + + def _get_heading_number(self, level: int) -> str: + """ + 生成标题编号 + + Args: + level: 标题级别 (0-3) + + Returns: + str: 格式化的标题编号 + """ + if level == 0: # 主标题不需要编号 + return "" + + self.numbers[level] += 1 # 增加当前级别的编号 + + # 重置下级标题编号 + for i in range(level + 1, 4): + self.numbers[i] = 0 + + # 根据级别返回不同格式的编号 + if level == 1: + return f"{self.numbers[1]}. 
" + elif level == 2: + return f"{self.numbers[1]}.{self.numbers[2]} " + elif level == 3: + return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} " + return "" + + def _add_heading(self, text: str, level: int): + """ + 添加带编号的标题 + + Args: + text: 标题文本 + level: 标题级别 (0-3) + """ + style_map = { + 0: 'Title_Custom', + 1: 'Heading1_Custom', + 2: 'Heading2_Custom', + 3: 'Heading3_Custom' + } + + number = self._get_heading_number(level) + paragraph = self.doc.add_paragraph(style=style_map[level]) + + if number: + number_run = paragraph.add_run(number) + font_size = 22 if level == 1 else (18 if level == 2 else 16) + self._get_run_style(number_run, '黑体', font_size, True) + + text_run = paragraph.add_run(text) + font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16)) + self._get_run_style(text_run, '黑体', font_size, True) + + # 主标题添加日期 + if level == 0: + date_paragraph = self.doc.add_paragraph() + date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d日')) + self._get_run_style(date_run, '仿宋', 16, False) + + return paragraph + + def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False): + """设置文本运行对象的样式""" + run.font.name = font_name + run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + run.font.size = Pt(font_size) + run.font.bold = bold + + def format_failed_files(self) -> str: + """格式化失败文件列表""" + result = [] + if not self.failed_files: + return "\n".join(result) + + result.append("处理失败文件:") + for fp, reason in self.failed_files: + result.append(f"• {os.path.basename(fp)}: {reason}") + + self._add_heading("处理失败文件", 1) + for fp, reason in self.failed_files: + self._add_content(f"• {os.path.basename(fp)}: {reason}", indent=False) + self.doc.add_paragraph() + + return "\n".join(result) + + def _add_content(self, text: str, indent: bool = True): + """添加正文内容,使用convert_markdown_to_word处理文本""" + # 使用convert_markdown_to_word处理markdown文本 + processed_text = convert_markdown_to_word(text) + paragraph = self.doc.add_paragraph(processed_text, style='Normal_Custom') + if not indent: + paragraph.paragraph_format.first_line_indent = Pt(0) + return paragraph + + def format_file_summaries(self) -> str: + """ + 格式化文件总结内容,确保正确的标题层级并处理markdown文本 + """ + result = [] + # 首先对文件路径进行分组整理 + file_groups = {} + for path in sorted(self.file_summaries_map.keys()): + dir_path = os.path.dirname(path) + if dir_path not in file_groups: + file_groups[dir_path] = [] + file_groups[dir_path].append(path) + + # 处理没有目录的文件 + root_files = file_groups.get("", []) + if root_files: + for path in sorted(root_files): + file_name = os.path.basename(path) + result.append(f"\n📄 {file_name}") + result.append(self.file_summaries_map[path]) + # 无目录的文件作为二级标题 + self._add_heading(f"📄 {file_name}", 2) + # 使用convert_markdown_to_word处理文件内容 + self._add_content(convert_markdown_to_word(self.file_summaries_map[path])) + self.doc.add_paragraph() + + # 处理有目录的文件 + for dir_path in sorted(file_groups.keys()): + if dir_path == "": # 跳过已处理的根目录文件 + continue + + # 添加目录作为二级标题 + result.append(f"\n📁 {dir_path}") + self._add_heading(f"📁 {dir_path}", 2) + + # 该目录下的所有文件作为三级标题 + for path in sorted(file_groups[dir_path]): + file_name = os.path.basename(path) + result.append(f"\n📄 {file_name}") + result.append(self.file_summaries_map[path]) + + # 添加文件名作为三级标题 + self._add_heading(f"📄 {file_name}", 3) + # 使用convert_markdown_to_word处理文件内容 + self._add_content(convert_markdown_to_word(self.file_summaries_map[path])) + self.doc.add_paragraph() + + 
return "\n".join(result) + + + def create_document(self): + """创建完整Word文档并返回文档对象""" + # 重置所有编号 + for level in self.numbers: + self.numbers[level] = 0 + + # 添加主标题 + self._add_heading("文档总结报告", 0) + self.doc.add_paragraph() + + # 添加总体摘要,使用convert_markdown_to_word处理 + self._add_heading("总体摘要", 1) + self._add_content(convert_markdown_to_word(self.final_summary)) + self.doc.add_paragraph() + + # 添加失败文件列表(如果有) + if self.failed_files: + self.format_failed_files() + + # 添加文件详细总结 + self._add_heading("各文件详细总结", 1) + self.format_file_summaries() + + return self.doc + + def save_as_pdf(self, word_path, pdf_path=None): + """将生成的Word文档转换为PDF + + 参数: + word_path: Word文档的路径 + pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置 + + 返回: + 生成的PDF文件路径,如果转换失败则返回None + """ + from crazy_functions.doc_fns.conversation_doc.word2pdf import WordToPdfConverter + try: + pdf_path = WordToPdfConverter.convert_to_pdf(word_path, pdf_path) + return pdf_path + except Exception as e: + print(f"PDF转换失败: {str(e)}") + return None + + +class MarkdownFormatter(DocumentFormatter): + """Markdown格式文档生成器""" + + def format_failed_files(self) -> str: + if not self.failed_files: + return "" + + formatted_text = ["\n## ⚠️ 处理失败的文件"] + for fp, reason in self.failed_files: + formatted_text.append(f"- {os.path.basename(fp)}: {reason}") + formatted_text.append("\n---") + return "\n".join(formatted_text) + + def format_file_summaries(self) -> str: + formatted_text = [] + sorted_paths = sorted(self.file_summaries_map.keys()) + current_dir = "" + + for path in sorted_paths: + dir_path = os.path.dirname(path) + if dir_path != current_dir: + if dir_path: + formatted_text.append(f"\n## 📁 {dir_path}") + current_dir = dir_path + + file_name = os.path.basename(path) + formatted_text.append(f"\n### 📄 {file_name}") + formatted_text.append(self.file_summaries_map[path]) + formatted_text.append("\n---") + + return "\n".join(formatted_text) + + def create_document(self) -> str: + document = [ + "# 📑 文档总结报告", + "\n## 总体摘要", + self.final_summary + ] + + if self.failed_files: + document.append(self.format_failed_files()) + + document.extend([ + "\n# 📚 各文件详细总结", + self.format_file_summaries() + ]) + + return "\n".join(document) + + + +class HtmlFormatter(DocumentFormatter): + """HTML格式文档生成器 - 优化版""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.md = markdown.Markdown(extensions=['extra','codehilite', 'tables','nl2br']) + self.css_styles = """ + @keyframes fadeIn { + from { opacity: 0; transform: translateY(20px); } + to { opacity: 1; transform: translateY(0); } + } + + @keyframes slideIn { + from { transform: translateX(-20px); opacity: 0; } + to { transform: translateX(0); opacity: 1; } + } + + @keyframes pulse { + 0% { transform: scale(1); } + 50% { transform: scale(1.05); } + 100% { transform: scale(1); } + } + + :root { + /* Enhanced color palette */ + --primary-color: #2563eb; + --primary-light: #eff6ff; + --secondary-color: #1e293b; + --background-color: #f8fafc; + --text-color: #334155; + --text-light: #64748b; + --border-color: #e2e8f0; + --error-color: #ef4444; + --error-light: #fef2f2; + --success-color: #22c55e; + --warning-color: #f59e0b; + --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1); + --hover-shadow: 0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1); + + /* Typography */ + --heading-font: "Plus Jakarta Sans", system-ui, sans-serif; + --body-font: "Inter", system-ui, sans-serif; + } + + body { + font-family: var(--body-font); + line-height: 1.8; + max-width: 
1200px; + margin: 0 auto; + padding: 2rem; + color: var(--text-color); + background-color: var(--background-color); + font-size: 16px; + -webkit-font-smoothing: antialiased; + } + + .container { + background: white; + padding: 3rem; + border-radius: 24px; + box-shadow: var(--card-shadow); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + animation: fadeIn 0.6s ease-out; + border: 1px solid var(--border-color); + } + + .container:hover { + box-shadow: var(--hover-shadow); + transform: translateY(-2px); + } + + h1, h2, h3 { + font-family: var(--heading-font); + font-weight: 600; + } + + h1 { + color: var(--primary-color); + font-size: 2.8em; + text-align: center; + margin: 2rem 0 3rem; + padding-bottom: 1.5rem; + border-bottom: 3px solid var(--primary-color); + letter-spacing: -0.03em; + position: relative; + display: flex; + align-items: center; + justify-content: center; + gap: 1rem; + } + + h1::after { + content: ''; + position: absolute; + bottom: -3px; + left: 50%; + transform: translateX(-50%); + width: 120px; + height: 3px; + background: linear-gradient(90deg, var(--primary-color), var(--primary-light)); + border-radius: 3px; + transition: width 0.3s ease; + } + + h1:hover::after { + width: 180px; + } + + h2 { + color: var(--secondary-color); + font-size: 1.9em; + margin: 2.5rem 0 1.5rem; + padding-left: 1.2rem; + border-left: 4px solid var(--primary-color); + letter-spacing: -0.02em; + display: flex; + align-items: center; + gap: 1rem; + transition: all 0.3s ease; + } + + h2:hover { + color: var(--primary-color); + transform: translateX(5px); + } + + h3 { + color: var(--text-color); + font-size: 1.5em; + margin: 2rem 0 1rem; + padding-bottom: 0.8rem; + border-bottom: 2px solid var(--border-color); + transition: all 0.3s ease; + display: flex; + align-items: center; + gap: 0.8rem; + } + + h3:hover { + color: var(--primary-color); + border-bottom-color: var(--primary-color); + } + + .summary { + background: var(--primary-light); + padding: 2.5rem; + border-radius: 16px; + margin: 2.5rem 0; + box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.1); + position: relative; + overflow: hidden; + transition: transform 0.3s ease, box-shadow 0.3s ease; + animation: slideIn 0.5s ease-out; + } + + .summary:hover { + transform: translateY(-3px); + box-shadow: 0 8px 12px -2px rgba(37, 99, 235, 0.15); + } + + .summary::before { + content: ''; + position: absolute; + top: 0; + left: 0; + width: 4px; + height: 100%; + background: linear-gradient(to bottom, var(--primary-color), rgba(37, 99, 235, 0.6)); + } + + .summary p { + margin: 1.2rem 0; + line-height: 1.9; + color: var(--text-color); + transition: color 0.3s ease; + } + + .summary:hover p { + color: var(--secondary-color); + } + + .details { + margin-top: 3.5rem; + padding-top: 2.5rem; + border-top: 2px dashed var(--border-color); + animation: fadeIn 0.8s ease-out; + } + + .failed-files { + background: var(--error-light); + padding: 2rem; + border-radius: 16px; + margin: 3rem 0; + border-left: 4px solid var(--error-color); + position: relative; + transition: all 0.3s ease; + animation: slideIn 0.5s ease-out; + } + + .failed-files:hover { + transform: translateX(5px); + box-shadow: 0 8px 15px -3px rgba(239, 68, 68, 0.1); + } + + .failed-files h2 { + color: var(--error-color); + border-left: none; + padding-left: 0; + } + + .failed-files ul { + margin: 1.8rem 0; + padding-left: 1.2rem; + list-style-type: none; + } + + .failed-files li { + margin: 1.2rem 0; + padding: 1.2rem 1.8rem; + background: rgba(239, 68, 68, 0.08); + border-radius: 12px; + 
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); + } + + .failed-files li:hover { + transform: translateX(8px); + background: rgba(239, 68, 68, 0.12); + } + + .directory-section { + margin: 3.5rem 0; + padding: 2rem; + background: var(--background-color); + border-radius: 16px; + position: relative; + transition: all 0.3s ease; + animation: fadeIn 0.6s ease-out; + } + + .directory-section:hover { + background: white; + box-shadow: var(--card-shadow); + } + + .file-summary { + background: white; + padding: 2rem; + margin: 1.8rem 0; + border-radius: 16px; + box-shadow: var(--card-shadow); + border-left: 4px solid var(--border-color); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + position: relative; + overflow: hidden; + } + + .file-summary:hover { + border-left-color: var(--primary-color); + transform: translateX(8px) translateY(-2px); + box-shadow: var(--hover-shadow); + } + + .file-summary { + background: white; + padding: 2rem; + margin: 1.8rem 0; + border-radius: 16px; + box-shadow: var(--card-shadow); + border-left: 4px solid var(--border-color); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + position: relative; + } + + .file-summary:hover { + border-left-color: var(--primary-color); + transform: translateX(8px) translateY(-2px); + box-shadow: var(--hover-shadow); + } + + .icon { + display: inline-flex; + align-items: center; + justify-content: center; + width: 32px; + height: 32px; + border-radius: 8px; + background: var(--primary-light); + color: var(--primary-color); + font-size: 1.2em; + transition: all 0.3s ease; + } + + .file-summary:hover .icon, + .directory-section:hover .icon { + transform: scale(1.1); + background: var(--primary-color); + color: white; + } + + /* Smooth scrolling */ + html { + scroll-behavior: smooth; + } + + /* Selection style */ + ::selection { + background: var(--primary-light); + color: var(--primary-color); + } + + /* Print styles */ + @media print { + body { + background: white; + } + .container { + box-shadow: none; + padding: 0; + } + .file-summary, .failed-files { + break-inside: avoid; + box-shadow: none; + } + .icon { + display: none; + } + } + + /* Responsive design */ + @media (max-width: 768px) { + body { + padding: 1rem; + font-size: 15px; + } + + .container { + padding: 1.5rem; + } + + h1 { + font-size: 2.2em; + margin: 1.5rem 0 2rem; + } + + h2 { + font-size: 1.7em; + } + + h3 { + font-size: 1.4em; + } + + .summary, .failed-files, .directory-section { + padding: 1.5rem; + } + + .file-summary { + padding: 1.2rem; + } + + .icon { + width: 28px; + height: 28px; + } + } + + /* Dark mode support */ + @media (prefers-color-scheme: dark) { + :root { + --primary-light: rgba(37, 99, 235, 0.15); + --background-color: #0f172a; + --text-color: #e2e8f0; + --text-light: #94a3b8; + --border-color: #1e293b; + --error-light: rgba(239, 68, 68, 0.15); + } + + .container, .file-summary { + background: #1e293b; + } + + .directory-section { + background: #0f172a; + } + + .directory-section:hover { + background: #1e293b; + } + } + """ + + def format_failed_files(self) -> str: + if not self.failed_files: + return "" + + failed_files_html = ['
<div class="failed-files">']
+        failed_files_html.append('<h2><span class="icon">⚠️</span> 处理失败的文件</h2>')
+        failed_files_html.append("<ul>")
+        for fp, reason in self.failed_files:
+            failed_files_html.append(f"<li>{os.path.basename(fp)}: {reason}</li>")
+        failed_files_html.append("</ul>")
+        failed_files_html.append("</div>")
+        return "\n".join(failed_files_html)
+
+    def format_file_summaries(self) -> str:
+        formatted_html = []
+        sorted_paths = sorted(self.file_summaries_map.keys())
+        current_dir = ""
+
+        for path in sorted_paths:
+            dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_html.append('<div class="directory-section">')
+                    formatted_html.append(f'<h2><span class="icon">📁</span> {dir_path}</h2>')
+                    formatted_html.append('</div>')
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_html.append('<div class="file-summary">')
+            formatted_html.append(f'<h3><span class="icon">📄</span> {file_name}</h3>')
+            formatted_html.append(self.md.convert(self.file_summaries_map[path]))
+            formatted_html.append('</div>')
+
+        return "\n".join(formatted_html)
+
+    def create_document(self) -> str:
+        """生成HTML文档
+        Returns:
+            str: 完整的HTML文档字符串
+        """
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>文档总结报告</title>
+    <style>
+    {self.css_styles}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1><span class="icon">📑</span> 文档总结报告</h1>
+
+        <div class="summary">
+            <h2><span class="icon">📋</span> 总体摘要</h2>
+            <div>
+                {self.md.convert(self.final_summary)}
+            </div>
+        </div>
+
+        {self.format_failed_files()}
+
+        <div class="details">
+            <h2><span class="icon">📚</span> 各文件详细总结</h2>
+            {self.format_file_summaries()}
+        </div>
+    </div>
+</body>
+</html>
+ + + + """ \ No newline at end of file diff --git a/crazy_functions/doc_fns/__init__.py b/crazy_functions/doc_fns/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crazy_functions/doc_fns/batch_file_query_doc.py b/crazy_functions/doc_fns/batch_file_query_doc.py new file mode 100644 index 00000000..a2a11e37 --- /dev/null +++ b/crazy_functions/doc_fns/batch_file_query_doc.py @@ -0,0 +1,812 @@ +import os +import time +from abc import ABC, abstractmethod +from datetime import datetime +from docx import Document +from docx.enum.style import WD_STYLE_TYPE +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING +from docx.oxml.ns import qn +from docx.shared import Inches, Cm +from docx.shared import Pt, RGBColor, Inches +from typing import Dict, List, Tuple +import markdown +from crazy_functions.doc_fns.conversation_doc.word_doc import convert_markdown_to_word + + + +class DocumentFormatter(ABC): + """文档格式化基类,定义文档格式化的基本接口""" + + def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]): + self.final_summary = final_summary + self.file_summaries_map = file_summaries_map + self.failed_files = failed_files + + @abstractmethod + def format_failed_files(self) -> str: + """格式化失败文件列表""" + pass + + @abstractmethod + def format_file_summaries(self) -> str: + """格式化文件总结内容""" + pass + + @abstractmethod + def create_document(self) -> str: + """创建完整文档""" + pass + + +class WordFormatter(DocumentFormatter): + """Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.doc = Document() + self._setup_document() + self._create_styles() + # 初始化三级标题编号系统 + self.numbers = { + 1: 0, # 一级标题编号 + 2: 0, # 二级标题编号 + 3: 0 # 三级标题编号 + } + + def _setup_document(self): + """设置文档基本格式,包括页面设置和页眉""" + sections = self.doc.sections + for section in sections: + # 设置页面大小为A4 + section.page_width = Cm(21) + section.page_height = Cm(29.7) + # 设置页边距 + section.top_margin = Cm(3.7) # 上边距37mm + section.bottom_margin = Cm(3.5) # 下边距35mm + section.left_margin = Cm(2.8) # 左边距28mm + section.right_margin = Cm(2.6) # 右边距26mm + # 设置页眉页脚距离 + section.header_distance = Cm(2.0) + section.footer_distance = Cm(2.0) + + # 添加页眉 + header = section.header + header_para = header.paragraphs[0] + header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT + header_run = header_para.add_run("该文档由GPT-academic生成") + header_run.font.name = '仿宋' + header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + header_run.font.size = Pt(9) + + def _create_styles(self): + """创建文档样式""" + # 创建正文样式 + style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH) + style.font.name = '仿宋' + style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + style.font.size = Pt(14) + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + style.paragraph_format.space_after = Pt(0) + style.paragraph_format.first_line_indent = Pt(28) + + # 创建各级标题样式 + self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER) + self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT) + self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT) + self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT) + + def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment): + """创建标题样式""" + style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH) + style.font.name = font_name + 
style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + style.font.size = Pt(font_size) + style.font.bold = True + style.paragraph_format.alignment = alignment + style.paragraph_format.space_before = Pt(12) + style.paragraph_format.space_after = Pt(12) + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + return style + + def _get_heading_number(self, level: int) -> str: + """ + 生成标题编号 + + Args: + level: 标题级别 (0-3) + + Returns: + str: 格式化的标题编号 + """ + if level == 0: # 主标题不需要编号 + return "" + + self.numbers[level] += 1 # 增加当前级别的编号 + + # 重置下级标题编号 + for i in range(level + 1, 4): + self.numbers[i] = 0 + + # 根据级别返回不同格式的编号 + if level == 1: + return f"{self.numbers[1]}. " + elif level == 2: + return f"{self.numbers[1]}.{self.numbers[2]} " + elif level == 3: + return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} " + return "" + + def _add_heading(self, text: str, level: int): + """ + 添加带编号的标题 + + Args: + text: 标题文本 + level: 标题级别 (0-3) + """ + style_map = { + 0: 'Title_Custom', + 1: 'Heading1_Custom', + 2: 'Heading2_Custom', + 3: 'Heading3_Custom' + } + + number = self._get_heading_number(level) + paragraph = self.doc.add_paragraph(style=style_map[level]) + + if number: + number_run = paragraph.add_run(number) + font_size = 22 if level == 1 else (18 if level == 2 else 16) + self._get_run_style(number_run, '黑体', font_size, True) + + text_run = paragraph.add_run(text) + font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16)) + self._get_run_style(text_run, '黑体', font_size, True) + + # 主标题添加日期 + if level == 0: + date_paragraph = self.doc.add_paragraph() + date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d日')) + self._get_run_style(date_run, '仿宋', 16, False) + + return paragraph + + def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False): + """设置文本运行对象的样式""" + run.font.name = font_name + run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) + run.font.size = Pt(font_size) + run.font.bold = bold + + def format_failed_files(self) -> str: + """格式化失败文件列表""" + result = [] + if not self.failed_files: + return "\n".join(result) + + result.append("处理失败文件:") + for fp, reason in self.failed_files: + result.append(f"• {os.path.basename(fp)}: {reason}") + + self._add_heading("处理失败文件", 1) + for fp, reason in self.failed_files: + self._add_content(f"• {os.path.basename(fp)}: {reason}", indent=False) + self.doc.add_paragraph() + + return "\n".join(result) + + def _add_content(self, text: str, indent: bool = True): + """添加正文内容,使用convert_markdown_to_word处理文本""" + # 使用convert_markdown_to_word处理markdown文本 + processed_text = convert_markdown_to_word(text) + paragraph = self.doc.add_paragraph(processed_text, style='Normal_Custom') + if not indent: + paragraph.paragraph_format.first_line_indent = Pt(0) + return paragraph + + def format_file_summaries(self) -> str: + """ + 格式化文件总结内容,确保正确的标题层级并处理markdown文本 + """ + result = [] + # 首先对文件路径进行分组整理 + file_groups = {} + for path in sorted(self.file_summaries_map.keys()): + dir_path = os.path.dirname(path) + if dir_path not in file_groups: + file_groups[dir_path] = [] + file_groups[dir_path].append(path) + + # 处理没有目录的文件 + root_files = file_groups.get("", []) + if root_files: + for path in sorted(root_files): + file_name = os.path.basename(path) + result.append(f"\n📄 {file_name}") + result.append(self.file_summaries_map[path]) + # 无目录的文件作为二级标题 + self._add_heading(f"📄 {file_name}", 2) + # 
使用convert_markdown_to_word处理文件内容 + self._add_content(convert_markdown_to_word(self.file_summaries_map[path])) + self.doc.add_paragraph() + + # 处理有目录的文件 + for dir_path in sorted(file_groups.keys()): + if dir_path == "": # 跳过已处理的根目录文件 + continue + + # 添加目录作为二级标题 + result.append(f"\n📁 {dir_path}") + self._add_heading(f"📁 {dir_path}", 2) + + # 该目录下的所有文件作为三级标题 + for path in sorted(file_groups[dir_path]): + file_name = os.path.basename(path) + result.append(f"\n📄 {file_name}") + result.append(self.file_summaries_map[path]) + + # 添加文件名作为三级标题 + self._add_heading(f"📄 {file_name}", 3) + # 使用convert_markdown_to_word处理文件内容 + self._add_content(convert_markdown_to_word(self.file_summaries_map[path])) + self.doc.add_paragraph() + + return "\n".join(result) + + + def create_document(self): + """创建完整Word文档并返回文档对象""" + # 重置所有编号 + for level in self.numbers: + self.numbers[level] = 0 + + # 添加主标题 + self._add_heading("文档总结报告", 0) + self.doc.add_paragraph() + + # 添加总体摘要,使用convert_markdown_to_word处理 + self._add_heading("总体摘要", 1) + self._add_content(convert_markdown_to_word(self.final_summary)) + self.doc.add_paragraph() + + # 添加失败文件列表(如果有) + if self.failed_files: + self.format_failed_files() + + # 添加文件详细总结 + self._add_heading("各文件详细总结", 1) + self.format_file_summaries() + + return self.doc + + def save_as_pdf(self, word_path, pdf_path=None): + """将生成的Word文档转换为PDF + + 参数: + word_path: Word文档的路径 + pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置 + + 返回: + 生成的PDF文件路径,如果转换失败则返回None + """ + from crazy_functions.doc_fns.conversation_doc.word2pdf import WordToPdfConverter + try: + pdf_path = WordToPdfConverter.convert_to_pdf(word_path, pdf_path) + return pdf_path + except Exception as e: + print(f"PDF转换失败: {str(e)}") + return None + + +class MarkdownFormatter(DocumentFormatter): + """Markdown格式文档生成器""" + + def format_failed_files(self) -> str: + if not self.failed_files: + return "" + + formatted_text = ["\n## ⚠️ 处理失败的文件"] + for fp, reason in self.failed_files: + formatted_text.append(f"- {os.path.basename(fp)}: {reason}") + formatted_text.append("\n---") + return "\n".join(formatted_text) + + def format_file_summaries(self) -> str: + formatted_text = [] + sorted_paths = sorted(self.file_summaries_map.keys()) + current_dir = "" + + for path in sorted_paths: + dir_path = os.path.dirname(path) + if dir_path != current_dir: + if dir_path: + formatted_text.append(f"\n## 📁 {dir_path}") + current_dir = dir_path + + file_name = os.path.basename(path) + formatted_text.append(f"\n### 📄 {file_name}") + formatted_text.append(self.file_summaries_map[path]) + formatted_text.append("\n---") + + return "\n".join(formatted_text) + + def create_document(self) -> str: + document = [ + "# 📑 文档总结报告", + "\n## 总体摘要", + self.final_summary + ] + + if self.failed_files: + document.append(self.format_failed_files()) + + document.extend([ + "\n# 📚 各文件详细总结", + self.format_file_summaries() + ]) + + return "\n".join(document) + + + +class HtmlFormatter(DocumentFormatter): + """HTML格式文档生成器 - 优化版""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.md = markdown.Markdown(extensions=['extra','codehilite', 'tables','nl2br']) + self.css_styles = """ + @keyframes fadeIn { + from { opacity: 0; transform: translateY(20px); } + to { opacity: 1; transform: translateY(0); } + } + + @keyframes slideIn { + from { transform: translateX(-20px); opacity: 0; } + to { transform: translateX(0); opacity: 1; } + } + + @keyframes pulse { + 0% { transform: scale(1); } + 50% { transform: scale(1.05); } + 100% { transform: scale(1); } + } 
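+
+        /* Design tokens consumed by the rules below; the dark-mode media query overrides them */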
+ + :root { + /* Enhanced color palette */ + --primary-color: #2563eb; + --primary-light: #eff6ff; + --secondary-color: #1e293b; + --background-color: #f8fafc; + --text-color: #334155; + --text-light: #64748b; + --border-color: #e2e8f0; + --error-color: #ef4444; + --error-light: #fef2f2; + --success-color: #22c55e; + --warning-color: #f59e0b; + --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1); + --hover-shadow: 0 20px 25px -5px rgb(0 0 0 / 0.1), 0 8px 10px -6px rgb(0 0 0 / 0.1); + + /* Typography */ + --heading-font: "Plus Jakarta Sans", system-ui, sans-serif; + --body-font: "Inter", system-ui, sans-serif; + } + + body { + font-family: var(--body-font); + line-height: 1.8; + max-width: 1200px; + margin: 0 auto; + padding: 2rem; + color: var(--text-color); + background-color: var(--background-color); + font-size: 16px; + -webkit-font-smoothing: antialiased; + } + + .container { + background: white; + padding: 3rem; + border-radius: 24px; + box-shadow: var(--card-shadow); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + animation: fadeIn 0.6s ease-out; + border: 1px solid var(--border-color); + } + + .container:hover { + box-shadow: var(--hover-shadow); + transform: translateY(-2px); + } + + h1, h2, h3 { + font-family: var(--heading-font); + font-weight: 600; + } + + h1 { + color: var(--primary-color); + font-size: 2.8em; + text-align: center; + margin: 2rem 0 3rem; + padding-bottom: 1.5rem; + border-bottom: 3px solid var(--primary-color); + letter-spacing: -0.03em; + position: relative; + display: flex; + align-items: center; + justify-content: center; + gap: 1rem; + } + + h1::after { + content: ''; + position: absolute; + bottom: -3px; + left: 50%; + transform: translateX(-50%); + width: 120px; + height: 3px; + background: linear-gradient(90deg, var(--primary-color), var(--primary-light)); + border-radius: 3px; + transition: width 0.3s ease; + } + + h1:hover::after { + width: 180px; + } + + h2 { + color: var(--secondary-color); + font-size: 1.9em; + margin: 2.5rem 0 1.5rem; + padding-left: 1.2rem; + border-left: 4px solid var(--primary-color); + letter-spacing: -0.02em; + display: flex; + align-items: center; + gap: 1rem; + transition: all 0.3s ease; + } + + h2:hover { + color: var(--primary-color); + transform: translateX(5px); + } + + h3 { + color: var(--text-color); + font-size: 1.5em; + margin: 2rem 0 1rem; + padding-bottom: 0.8rem; + border-bottom: 2px solid var(--border-color); + transition: all 0.3s ease; + display: flex; + align-items: center; + gap: 0.8rem; + } + + h3:hover { + color: var(--primary-color); + border-bottom-color: var(--primary-color); + } + + .summary { + background: var(--primary-light); + padding: 2.5rem; + border-radius: 16px; + margin: 2.5rem 0; + box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.1); + position: relative; + overflow: hidden; + transition: transform 0.3s ease, box-shadow 0.3s ease; + animation: slideIn 0.5s ease-out; + } + + .summary:hover { + transform: translateY(-3px); + box-shadow: 0 8px 12px -2px rgba(37, 99, 235, 0.15); + } + + .summary::before { + content: ''; + position: absolute; + top: 0; + left: 0; + width: 4px; + height: 100%; + background: linear-gradient(to bottom, var(--primary-color), rgba(37, 99, 235, 0.6)); + } + + .summary p { + margin: 1.2rem 0; + line-height: 1.9; + color: var(--text-color); + transition: color 0.3s ease; + } + + .summary:hover p { + color: var(--secondary-color); + } + + .details { + margin-top: 3.5rem; + padding-top: 2.5rem; + border-top: 2px dashed var(--border-color); + 
animation: fadeIn 0.8s ease-out; + } + + .failed-files { + background: var(--error-light); + padding: 2rem; + border-radius: 16px; + margin: 3rem 0; + border-left: 4px solid var(--error-color); + position: relative; + transition: all 0.3s ease; + animation: slideIn 0.5s ease-out; + } + + .failed-files:hover { + transform: translateX(5px); + box-shadow: 0 8px 15px -3px rgba(239, 68, 68, 0.1); + } + + .failed-files h2 { + color: var(--error-color); + border-left: none; + padding-left: 0; + } + + .failed-files ul { + margin: 1.8rem 0; + padding-left: 1.2rem; + list-style-type: none; + } + + .failed-files li { + margin: 1.2rem 0; + padding: 1.2rem 1.8rem; + background: rgba(239, 68, 68, 0.08); + border-radius: 12px; + transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); + } + + .failed-files li:hover { + transform: translateX(8px); + background: rgba(239, 68, 68, 0.12); + } + + .directory-section { + margin: 3.5rem 0; + padding: 2rem; + background: var(--background-color); + border-radius: 16px; + position: relative; + transition: all 0.3s ease; + animation: fadeIn 0.6s ease-out; + } + + .directory-section:hover { + background: white; + box-shadow: var(--card-shadow); + } + + .file-summary { + background: white; + padding: 2rem; + margin: 1.8rem 0; + border-radius: 16px; + box-shadow: var(--card-shadow); + border-left: 4px solid var(--border-color); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + position: relative; + overflow: hidden; + } + + .file-summary:hover { + border-left-color: var(--primary-color); + transform: translateX(8px) translateY(-2px); + box-shadow: var(--hover-shadow); + } + + .file-summary { + background: white; + padding: 2rem; + margin: 1.8rem 0; + border-radius: 16px; + box-shadow: var(--card-shadow); + border-left: 4px solid var(--border-color); + transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); + position: relative; + } + + .file-summary:hover { + border-left-color: var(--primary-color); + transform: translateX(8px) translateY(-2px); + box-shadow: var(--hover-shadow); + } + + .icon { + display: inline-flex; + align-items: center; + justify-content: center; + width: 32px; + height: 32px; + border-radius: 8px; + background: var(--primary-light); + color: var(--primary-color); + font-size: 1.2em; + transition: all 0.3s ease; + } + + .file-summary:hover .icon, + .directory-section:hover .icon { + transform: scale(1.1); + background: var(--primary-color); + color: white; + } + + /* Smooth scrolling */ + html { + scroll-behavior: smooth; + } + + /* Selection style */ + ::selection { + background: var(--primary-light); + color: var(--primary-color); + } + + /* Print styles */ + @media print { + body { + background: white; + } + .container { + box-shadow: none; + padding: 0; + } + .file-summary, .failed-files { + break-inside: avoid; + box-shadow: none; + } + .icon { + display: none; + } + } + + /* Responsive design */ + @media (max-width: 768px) { + body { + padding: 1rem; + font-size: 15px; + } + + .container { + padding: 1.5rem; + } + + h1 { + font-size: 2.2em; + margin: 1.5rem 0 2rem; + } + + h2 { + font-size: 1.7em; + } + + h3 { + font-size: 1.4em; + } + + .summary, .failed-files, .directory-section { + padding: 1.5rem; + } + + .file-summary { + padding: 1.2rem; + } + + .icon { + width: 28px; + height: 28px; + } + } + + /* Dark mode support */ + @media (prefers-color-scheme: dark) { + :root { + --primary-light: rgba(37, 99, 235, 0.15); + --background-color: #0f172a; + --text-color: #e2e8f0; + --text-light: #94a3b8; + --border-color: #1e293b; + --error-light: 
rgba(239, 68, 68, 0.15); + } + + .container, .file-summary { + background: #1e293b; + } + + .directory-section { + background: #0f172a; + } + + .directory-section:hover { + background: #1e293b; + } + } + """ + + def format_failed_files(self) -> str: + if not self.failed_files: + return "" + + failed_files_html = ['
<div class="failed-files">']
+        failed_files_html.append('<h2><span class="icon">⚠️</span> 处理失败的文件</h2>')
+        failed_files_html.append("<ul>")
+        for fp, reason in self.failed_files:
+            failed_files_html.append(f"<li>{os.path.basename(fp)}: {reason}</li>")
+        failed_files_html.append("</ul>")
+        failed_files_html.append("</div>")
+        return "\n".join(failed_files_html)
+
+    def format_file_summaries(self) -> str:
+        formatted_html = []
+        sorted_paths = sorted(self.file_summaries_map.keys())
+        current_dir = ""
+
+        for path in sorted_paths:
+            dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_html.append('<div class="directory-section">')
+                    formatted_html.append(f'<h2><span class="icon">📁</span> {dir_path}</h2>')
+                    formatted_html.append('</div>')
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_html.append('<div class="file-summary">')
+            formatted_html.append(f'<h3><span class="icon">📄</span> {file_name}</h3>')
+            formatted_html.append(self.md.convert(self.file_summaries_map[path]))
+            formatted_html.append('</div>')
+
+        return "\n".join(formatted_html)
+
+    def create_document(self) -> str:
+        """生成HTML文档
+        Returns:
+            str: 完整的HTML文档字符串
+        """
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>文档总结报告</title>
+    <style>
+    {self.css_styles}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1><span class="icon">📑</span> 文档总结报告</h1>
+
+        <div class="summary">
+            <h2><span class="icon">📋</span> 总体摘要</h2>
+            <div>
+                {self.md.convert(self.final_summary)}
+            </div>
+        </div>
+
+        {self.format_failed_files()}
+
+        <div class="details">
+            <h2><span class="icon">📚</span> 各文件详细总结</h2>
+            {self.format_file_summaries()}
+        </div>
+    </div>
+</body>
+</html>
+ + + + """ \ No newline at end of file diff --git a/crazy_functions/doc_fns/content_folder.py b/crazy_functions/doc_fns/content_folder.py new file mode 100644 index 00000000..26f991c7 --- /dev/null +++ b/crazy_functions/doc_fns/content_folder.py @@ -0,0 +1,237 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Type, TypeVar, Generic, Union + +from dataclasses import dataclass +from enum import Enum, auto +import logging +from datetime import datetime + +# 设置日志 +logger = logging.getLogger(__name__) + + +# 自定义异常类定义 +class FoldingError(Exception): + """折叠相关的自定义异常基类""" + pass + + +class FormattingError(FoldingError): + """格式化过程中的错误""" + pass + + +class MetadataError(FoldingError): + """元数据相关的错误""" + pass + + +class ValidationError(FoldingError): + """验证错误""" + pass + + +class FoldingStyle(Enum): + """折叠样式枚举""" + SIMPLE = auto() # 简单折叠 + DETAILED = auto() # 详细折叠(带有额外信息) + NESTED = auto() # 嵌套折叠 + + +@dataclass +class FoldingOptions: + """折叠选项配置""" + style: FoldingStyle = FoldingStyle.DETAILED + code_language: Optional[str] = None # 代码块的语言 + show_timestamp: bool = False # 是否显示时间戳 + indent_level: int = 0 # 缩进级别 + custom_css: Optional[str] = None # 自定义CSS类 + + +T = TypeVar('T') # 用于泛型类型 + + +class BaseMetadata(ABC): + """元数据基类""" + + @abstractmethod + def validate(self) -> bool: + """验证元数据的有效性""" + pass + + def _validate_non_empty_str(self, value: Optional[str]) -> bool: + """验证字符串非空""" + return bool(value and value.strip()) + + +@dataclass +class FileMetadata(BaseMetadata): + """文件元数据""" + rel_path: str + size: float + last_modified: Optional[datetime] = None + mime_type: Optional[str] = None + encoding: str = 'utf-8' + + def validate(self) -> bool: + """验证文件元数据的有效性""" + try: + if not self._validate_non_empty_str(self.rel_path): + return False + if self.size < 0: + return False + return True + except Exception as e: + logger.error(f"File metadata validation error: {str(e)}") + return False + + + + +class ContentFormatter(ABC, Generic[T]): + """内容格式化抽象基类 + + 支持泛型类型参数,可以指定具体的元数据类型。 + """ + + @abstractmethod + def format(self, + content: str, + metadata: T, + options: Optional[FoldingOptions] = None) -> str: + """格式化内容 + + Args: + content: 需要格式化的内容 + metadata: 类型化的元数据 + options: 折叠选项 + + Returns: + str: 格式化后的内容 + + Raises: + FormattingError: 格式化过程中的错误 + """ + pass + + def _create_summary(self, metadata: T) -> str: + """创建折叠摘要,可被子类重写""" + return str(metadata) + + def _format_content_block(self, + content: str, + options: Optional[FoldingOptions]) -> str: + """格式化内容块,处理代码块等特殊格式""" + if not options: + return content + + if options.code_language: + return f"```{options.code_language}\n{content}\n```" + return content + + def _add_indent(self, text: str, level: int) -> str: + """添加缩进""" + if level <= 0: + return text + indent = " " * level + return "\n".join(indent + line for line in text.splitlines()) + + +class FileContentFormatter(ContentFormatter[FileMetadata]): + """文件内容格式化器""" + + def format(self, + content: str, + metadata: FileMetadata, + options: Optional[FoldingOptions] = None) -> str: + """格式化文件内容""" + if not metadata.validate(): + raise MetadataError("Invalid file metadata") + + try: + options = options or FoldingOptions() + + # 构建摘要信息 + summary_parts = [ + f"{metadata.rel_path} ({metadata.size:.2f}MB)", + f"Type: {metadata.mime_type}" if metadata.mime_type else None, + (f"Modified: {metadata.last_modified.strftime('%Y-%m-%d %H:%M:%S')}" + if metadata.last_modified and options.show_timestamp else None) + ] + summary = " | ".join(filter(None, summary_parts)) + + 
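+            # The folded block renders as an HTML <details>/<summary> element, so
+            # front-ends that display HTML show the file content as a collapsible section.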
# 构建HTML类
+            css_class = f' class="{options.custom_css}"' if options.custom_css else ''
+
+            # 格式化内容
+            formatted_content = self._format_content_block(content, options)
+
+            # 组装最终结果
+            result = (
+                f'<details{css_class}><summary>{summary}</summary>\n\n'
+                f'{formatted_content}\n\n'
+                f'</details>\n\n'
+            )
+
+            return self._add_indent(result, options.indent_level)
+
+        except Exception as e:
+            logger.error(f"Error formatting file content: {str(e)}")
+            raise FormattingError(f"Failed to format file content: {str(e)}")
+
+
+class ContentFoldingManager:
+    """内容折叠管理器"""
+
+    def __init__(self):
+        """初始化折叠管理器"""
+        self._formatters: Dict[str, ContentFormatter] = {}
+        self._register_default_formatters()
+
+    def _register_default_formatters(self) -> None:
+        """注册默认的格式化器"""
+        self.register_formatter('file', FileContentFormatter())
+
+    def register_formatter(self, name: str, formatter: ContentFormatter) -> None:
+        """注册新的格式化器"""
+        if not isinstance(formatter, ContentFormatter):
+            raise TypeError("Formatter must implement ContentFormatter interface")
+        self._formatters[name] = formatter
+
+    def _guess_language(self, extension: str) -> Optional[str]:
+        """根据文件扩展名猜测编程语言"""
+        extension = extension.lower().lstrip('.')
+        language_map = {
+            'py': 'python',
+            'js': 'javascript',
+            'java': 'java',
+            'cpp': 'cpp',
+            'cs': 'csharp',
+            'html': 'html',
+            'css': 'css',
+            'md': 'markdown',
+            'json': 'json',
+            'xml': 'xml',
+            'sql': 'sql',
+            'sh': 'bash',
+            'yaml': 'yaml',
+            'yml': 'yaml',
+            'txt': None  # 纯文本不需要语言标识
+        }
+        return language_map.get(extension)
+
+    def format_content(self,
+                       content: str,
+                       formatter_type: str,
+                       metadata: Union[FileMetadata],
+                       options: Optional[FoldingOptions] = None) -> str:
+        """格式化内容"""
+        formatter = self._formatters.get(formatter_type)
+        if not formatter:
+            raise KeyError(f"No formatter registered for type: {formatter_type}")
+
+        if not isinstance(metadata, FileMetadata):
+            raise TypeError("Invalid metadata type")
+
+        return formatter.format(content, metadata, options)
+
diff --git a/crazy_functions/doc_fns/conversation_doc/excel_doc.py b/crazy_functions/doc_fns/conversation_doc/excel_doc.py
new file mode 100644
index 00000000..ee19e162
--- /dev/null
+++ b/crazy_functions/doc_fns/conversation_doc/excel_doc.py
@@ -0,0 +1,211 @@
+import re
+import os
+import pandas as pd
+from datetime import datetime
+from openpyxl import Workbook
+
+
+class ExcelTableFormatter:
+    """聊天记录中Markdown表格转Excel生成器"""
+
+    def __init__(self):
+        """初始化Excel文档对象"""
+        self.workbook = Workbook()
+        self._table_count = 0
+        self._current_sheet = None
+
+    def _normalize_table_row(self, row):
+        """标准化表格行,处理不同的分隔符情况"""
+        row = row.strip()
+        if row.startswith('|'):
+            row = row[1:]
+        if row.endswith('|'):
+            row = row[:-1]
+        return [cell.strip() for cell in row.split('|')]
+
+    def _is_separator_row(self, row):
+        """检查是否是分隔行(由 - 或 : 组成)"""
+        clean_row = re.sub(r'[\s|]', '', row)
+        return bool(re.match(r'^[-:]+$', clean_row))
+
+    def _extract_tables_from_text(self, text):
+        """从文本中提取所有表格内容"""
+        if not isinstance(text, str):
+            return []
+
+        tables = []
+        current_table = []
+        is_in_table = False
+
+        for line in text.split('\n'):
+            line = line.strip()
+            if not line:
+                if is_in_table and current_table:
+                    if len(current_table) >= 2:
+                        tables.append(current_table)
+                    current_table = []
+                    is_in_table = False
+                continue
+
+            if '|' in line:
+                if not is_in_table:
+                    is_in_table = True
+                current_table.append(line)
+            else:
+                if is_in_table and current_table:
+                    if len(current_table) >= 2:
+                        tables.append(current_table)
+                    current_table = []
+                    is_in_table = False
+
+        if is_in_table and 
current_table and len(current_table) >= 2: + tables.append(current_table) + + return tables + + def _parse_table(self, table_lines): + """解析表格内容为结构化数据""" + try: + headers = self._normalize_table_row(table_lines[0]) + + separator_index = next( + (i for i, line in enumerate(table_lines) if self._is_separator_row(line)), + 1 + ) + + data_rows = [] + for line in table_lines[separator_index + 1:]: + cells = self._normalize_table_row(line) + # 确保单元格数量与表头一致 + while len(cells) < len(headers): + cells.append('') + cells = cells[:len(headers)] + data_rows.append(cells) + + if headers and data_rows: + return { + 'headers': headers, + 'data': data_rows + } + except Exception as e: + print(f"解析表格时发生错误: {str(e)}") + + return None + + def _create_sheet(self, question_num, table_num): + """创建新的工作表""" + sheet_name = f'Q{question_num}_T{table_num}' + if len(sheet_name) > 31: + sheet_name = f'Table{self._table_count}' + + if sheet_name in self.workbook.sheetnames: + sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}' + + return self.workbook.create_sheet(title=sheet_name) + + def create_document(self, history): + """ + 处理聊天历史中的所有表格并创建Excel文档 + + Args: + history: 聊天历史列表 + + Returns: + Workbook: 处理完成的Excel工作簿对象,如果没有表格则返回None + """ + has_tables = False + + # 删除默认创建的工作表 + default_sheet = self.workbook['Sheet'] + self.workbook.remove(default_sheet) + + # 遍历所有回答 + for i in range(1, len(history), 2): + answer = history[i] + tables = self._extract_tables_from_text(answer) + + for table_lines in tables: + parsed_table = self._parse_table(table_lines) + if parsed_table: + self._table_count += 1 + sheet = self._create_sheet(i // 2 + 1, self._table_count) + + # 写入表头 + for col, header in enumerate(parsed_table['headers'], 1): + sheet.cell(row=1, column=col, value=header) + + # 写入数据 + for row_idx, row_data in enumerate(parsed_table['data'], 2): + for col_idx, value in enumerate(row_data, 1): + sheet.cell(row=row_idx, column=col_idx, value=value) + + has_tables = True + + return self.workbook if has_tables else None + + +def save_chat_tables(history, save_dir, base_name): + """ + 保存聊天历史中的表格到Excel文件 + + Args: + history: 聊天历史列表 + save_dir: 保存目录 + base_name: 基础文件名 + + Returns: + list: 保存的文件路径列表 + """ + result_files = [] + + try: + # 创建Excel格式 + excel_formatter = ExcelTableFormatter() + workbook = excel_formatter.create_document(history) + + if workbook is not None: + # 确保保存目录存在 + os.makedirs(save_dir, exist_ok=True) + + # 生成Excel文件路径 + excel_file = os.path.join(save_dir, base_name + '.xlsx') + + # 保存Excel文件 + workbook.save(excel_file) + result_files.append(excel_file) + print(f"已保存表格到Excel文件: {excel_file}") + except Exception as e: + print(f"保存Excel格式失败: {str(e)}") + + return result_files + + +# 使用示例 +if __name__ == "__main__": + # 示例聊天历史 + history = [ + "问题1", + """这是第一个表格: + | A | B | C | + |---|---|---| + | 1 | 2 | 3 |""", + + "问题2", + "这是没有表格的回答", + + "问题3", + """回答包含多个表格: + | Name | Age | + |------|-----| + | Tom | 20 | + + 第二个表格: + | X | Y | + |---|---| + | 1 | 2 |""" + ] + + # 保存表格 + save_dir = "output" + base_name = "chat_tables" + saved_files = save_chat_tables(history, save_dir, base_name) \ No newline at end of file diff --git a/crazy_functions/doc_fns/conversation_doc/html_doc.py b/crazy_functions/doc_fns/conversation_doc/html_doc.py new file mode 100644 index 00000000..49e8becb --- /dev/null +++ b/crazy_functions/doc_fns/conversation_doc/html_doc.py @@ -0,0 +1,190 @@ + + +class HtmlFormatter: + """聊天记录HTML格式生成器""" + + def __init__(self, chatbot, history): + self.chatbot = chatbot + self.history = 
history + self.css_styles = """ + :root { + --primary-color: #2563eb; + --primary-light: #eff6ff; + --secondary-color: #1e293b; + --background-color: #f8fafc; + --text-color: #334155; + --border-color: #e2e8f0; + --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1); + } + + body { + font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + line-height: 1.8; + margin: 0; + padding: 2rem; + color: var(--text-color); + background-color: var(--background-color); + } + + .container { + max-width: 1200px; + margin: 0 auto; + background: white; + padding: 2rem; + border-radius: 16px; + box-shadow: var(--card-shadow); + } + ::selection { + background: var(--primary-light); + color: var(--primary-color); + } + @keyframes fadeIn { + from { opacity: 0; transform: translateY(20px); } + to { opacity: 1; transform: translateY(0); } + } + + @keyframes slideIn { + from { transform: translateX(-20px); opacity: 0; } + to { transform: translateX(0); opacity: 1; } + } + + .container { + animation: fadeIn 0.6s ease-out; + } + + .QaBox { + animation: slideIn 0.5s ease-out; + transition: all 0.3s ease; + } + + .QaBox:hover { + transform: translateX(5px); + } + .Question, .Answer, .historyBox { + transition: all 0.3s ease; + } + .chat-title { + color: var(--primary-color); + font-size: 2em; + text-align: center; + margin: 1rem 0 2rem; + padding-bottom: 1rem; + border-bottom: 2px solid var(--primary-color); + } + + .chat-body { + display: flex; + flex-direction: column; + gap: 1.5rem; + margin: 2rem 0; + } + + .QaBox { + background: white; + padding: 1.5rem; + border-radius: 8px; + border-left: 4px solid var(--primary-color); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + margin-bottom: 1.5rem; + } + + .Question { + color: var(--secondary-color); + font-weight: 500; + margin-bottom: 1rem; + } + + .Answer { + color: var(--text-color); + background: var(--primary-light); + padding: 1rem; + border-radius: 6px; + } + + .history-section { + margin-top: 3rem; + padding-top: 2rem; + border-top: 2px solid var(--border-color); + } + + .history-title { + color: var(--secondary-color); + font-size: 1.5em; + margin-bottom: 1.5rem; + text-align: center; + } + + .historyBox { + background: white; + padding: 1rem; + margin: 0.5rem 0; + border-radius: 6px; + border: 1px solid var(--border-color); + } + + @media (prefers-color-scheme: dark) { + :root { + --background-color: #0f172a; + --text-color: #e2e8f0; + --border-color: #1e293b; + } + + .container, .QaBox { + background: #1e293b; + } + } + """ + + def format_chat_content(self) -> str: + """格式化聊天内容""" + chat_content = [] + for q, a in self.chatbot: + question = str(q) if q is not None else "" + answer = str(a) if a is not None else "" + chat_content.append(f''' +
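+                <!-- one QaBox per question/answer pair; styled by .QaBox/.Question/.Answer in css_styles -->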
+                <div class="QaBox">
+                    <div class="Question">{question}</div>
+                    <div class="Answer">{answer}</div>
+                </div>
+            ''')
+        return "\n".join(chat_content)
+
+    def format_history_content(self) -> str:
+        """格式化历史记录内容"""
+        if not self.history:
+            return ""
+
+        history_content = []
+        for entry in self.history:
+            history_content.append(f'''
+                <div class="historyBox">
+                    {entry}
+                </div>
+            ''')
+        return "\n".join(history_content)
+
+    def create_document(self) -> str:
+        """生成完整的HTML文档
+
+        Returns:
+            str: 完整的HTML文档字符串
+        """
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>对话存档</title>
+    <style>
+    {self.css_styles}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1 class="chat-title">对话存档</h1>
+        <div class="chat-body">
+            {self.format_chat_content()}
+        </div>
+    </div>
+</body>
+</html>
+ + + """ \ No newline at end of file diff --git a/crazy_functions/doc_fns/conversation_doc/markdown_doc.py b/crazy_functions/doc_fns/conversation_doc/markdown_doc.py new file mode 100644 index 00000000..15441073 --- /dev/null +++ b/crazy_functions/doc_fns/conversation_doc/markdown_doc.py @@ -0,0 +1,39 @@ + +class MarkdownFormatter: + """Markdown格式文档生成器 - 用于生成对话记录的markdown文档""" + + def __init__(self): + self.content = [] + + def _add_content(self, text: str): + """添加正文内容""" + if text: + self.content.append(f"\n{text}\n") + + def create_document(self, history: list) -> str: + """ + 创建完整的Markdown文档 + Args: + history: 历史记录列表,偶数位置为问题,奇数位置为答案 + Returns: + str: 生成的Markdown文本 + """ + self.content = [] + + # 处理问答对 + for i in range(0, len(history), 2): + question = history[i] + answer = history[i + 1] + + # 添加问题 + self.content.append(f"\n### 问题 {i//2 + 1}") + self._add_content(question) + + # 添加回答 + self.content.append(f"\n### 回答 {i//2 + 1}") + self._add_content(answer) + + # 添加分隔线 + self.content.append("\n---\n") + + return "\n".join(self.content) diff --git a/crazy_functions/doc_fns/conversation_doc/pdf_doc.py b/crazy_functions/doc_fns/conversation_doc/pdf_doc.py new file mode 100644 index 00000000..2b7d15c6 --- /dev/null +++ b/crazy_functions/doc_fns/conversation_doc/pdf_doc.py @@ -0,0 +1,172 @@ +from datetime import datetime +import os +import re +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont + +def convert_markdown_to_pdf(markdown_text): + """将Markdown文本转换为PDF格式的纯文本""" + if not markdown_text: + return "" + + # 标准化换行符 + markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n') + + # 处理标题、粗体、斜体 + markdown_text = re.sub(r'^#\s+(.+)$', r'\1', markdown_text, flags=re.MULTILINE) + markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text) + markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text) + + # 处理列表 + markdown_text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', markdown_text, flags=re.MULTILINE) + markdown_text = re.sub(r'^\s*\d+\.\s+(.+?)(?=\n|$)', r'\1', markdown_text, flags=re.MULTILINE) + + # 处理链接 + markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', markdown_text) + + # 处理段落 + markdown_text = re.sub(r'\n{2,}', '\n', markdown_text) + markdown_text = re.sub(r'(?([^<]+)', r'\2 (\1)', + markdown_text) + + # 6. Preserve paragraph breaks + markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text) # normalize multiple newlines to double newlines + + # 7. 
Clean up extra spaces but maintain indentation + markdown_text = re.sub(r' +', ' ', markdown_text) + + return markdown_text.strip() + + +class TxtFormatter: + """Chat history TXT document generator""" + + def __init__(self): + self.content = [] + self._setup_document() + + def _setup_document(self): + """Initialize document with header""" + self.content.append("=" * 50) + self.content.append("GPT-Academic对话记录".center(48)) + self.content.append("=" * 50) + + def _format_header(self): + """Create document header with current date""" + from datetime import datetime + date_str = datetime.now().strftime('%Y年%m月%d日') + return [ + date_str.center(48), + "\n" # Add blank line after date + ] + + def create_document(self, history): + """Generate document from chat history""" + # Add header with date + self.content.extend(self._format_header()) + + # Add conversation content + for i in range(0, len(history), 2): + question = history[i] + answer = convert_markdown_to_txt(history[i + 1]) if i + 1 < len(history) else "" + + if question: + self.content.append(f"问题 {i // 2 + 1}:{str(question)}") + self.content.append("") # Add blank line + + if answer: + self.content.append(f"回答 {i // 2 + 1}:{str(answer)}") + self.content.append("") # Add blank line + + # Join all content with newlines + return "\n".join(self.content) diff --git a/crazy_functions/doc_fns/conversation_doc/word2pdf.py b/crazy_functions/doc_fns/conversation_doc/word2pdf.py new file mode 100644 index 00000000..253ecd25 --- /dev/null +++ b/crazy_functions/doc_fns/conversation_doc/word2pdf.py @@ -0,0 +1,155 @@ +from docx2pdf import convert +import os +import platform +import subprocess +from typing import Union +from pathlib import Path +from datetime import datetime + +class WordToPdfConverter: + """Word文档转PDF转换器""" + + @staticmethod + def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str: + """ + 将Word文档转换为PDF + + 参数: + word_path: Word文档的路径 + pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置 + + 返回: + 生成的PDF文件路径 + + 异常: + 如果转换失败,将抛出相应异常 + """ + try: + # 确保输入路径是Path对象 + word_path = Path(word_path) + + # 如果未指定pdf_path,则使用与word文档相同的名称 + if pdf_path is None: + pdf_path = word_path.with_suffix('.pdf') + else: + pdf_path = Path(pdf_path) + + # 检查操作系统 + if platform.system() == 'Linux': + # Linux系统需要安装libreoffice + which_result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True) + if which_result.returncode != 0: + raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice") + + print(f"开始转换Word文档: {word_path} 到 PDF") + + # 使用subprocess代替os.system + result = subprocess.run( + ['libreoffice', '--headless', '--convert-to', 'pdf:writer_pdf_Export', + str(word_path), '--outdir', str(pdf_path.parent)], + capture_output=True, text=True + ) + + if result.returncode != 0: + error_msg = result.stderr or "未知错误" + print(f"LibreOffice转换失败,错误信息: {error_msg}") + raise RuntimeError(f"LibreOffice转换失败: {error_msg}") + + print(f"LibreOffice转换输出: {result.stdout}") + + # 如果输出路径与默认生成的不同,则重命名 + default_pdf = word_path.with_suffix('.pdf') + if default_pdf != pdf_path and default_pdf.exists(): + os.rename(default_pdf, pdf_path) + print(f"已将PDF从 {default_pdf} 重命名为 {pdf_path}") + + # 验证PDF是否成功生成 + if not pdf_path.exists() or pdf_path.stat().st_size == 0: + raise RuntimeError("PDF生成失败或文件为空") + + print(f"PDF转换成功,文件大小: {pdf_path.stat().st_size} 字节") + else: + # Windows和MacOS使用docx2pdf + print(f"使用docx2pdf转换 {word_path} 到 {pdf_path}") + convert(word_path, pdf_path) + + # 验证PDF是否成功生成 + if not pdf_path.exists() or 
pdf_path.stat().st_size == 0: + raise RuntimeError("PDF生成失败或文件为空") + + print(f"PDF转换成功,文件大小: {pdf_path.stat().st_size} 字节") + + return str(pdf_path) + + except Exception as e: + print(f"PDF转换异常: {str(e)}") + raise Exception(f"转换PDF失败: {str(e)}") + + @staticmethod + def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list: + """ + 批量转换目录下的所有Word文档 + + 参数: + word_dir: 包含Word文档的目录路径 + pdf_dir: 可选,PDF文件的输出目录。如果未指定,将使用与Word文档相同的目录 + + 返回: + 生成的PDF文件路径列表 + """ + word_dir = Path(word_dir) + if pdf_dir: + pdf_dir = Path(pdf_dir) + pdf_dir.mkdir(parents=True, exist_ok=True) + + converted_files = [] + + for word_file in word_dir.glob("*.docx"): + try: + if pdf_dir: + pdf_path = pdf_dir / word_file.with_suffix('.pdf').name + else: + pdf_path = word_file.with_suffix('.pdf') + + pdf_file = WordToPdfConverter.convert_to_pdf(word_file, pdf_path) + converted_files.append(pdf_file) + + except Exception as e: + print(f"转换 {word_file} 失败: {str(e)}") + + return converted_files + + @staticmethod + def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str: + """ + 将docx对象直接转换为PDF + + 参数: + doc: python-docx的Document对象 + output_dir: 可选,输出目录。如果未指定,将使用当前目录 + + 返回: + 生成的PDF文件路径 + """ + try: + # 设置临时文件路径和输出路径 + output_dir = Path(output_dir) if output_dir else Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + + # 生成临时word文件 + temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx" + doc.save(temp_docx) + + # 转换为PDF + pdf_path = temp_docx.with_suffix('.pdf') + WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path) + + # 删除临时word文件 + temp_docx.unlink() + + return str(pdf_path) + + except Exception as e: + if temp_docx.exists(): + temp_docx.unlink() + raise Exception(f"转换PDF失败: {str(e)}") \ No newline at end of file diff --git a/crazy_functions/doc_fns/conversation_doc/word_doc.py b/crazy_functions/doc_fns/conversation_doc/word_doc.py new file mode 100644 index 00000000..73556888 --- /dev/null +++ b/crazy_functions/doc_fns/conversation_doc/word_doc.py @@ -0,0 +1,177 @@ +import re +from docx import Document +from docx.shared import Cm, Pt +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING +from docx.enum.style import WD_STYLE_TYPE +from docx.oxml.ns import qn +from datetime import datetime + + +def convert_markdown_to_word(markdown_text): + # 0. 首先标准化所有换行符为\n + markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n') + + # 1. 处理标题 - 支持更多级别的标题,使用更精确的正则 + # 保留标题标记,以便后续处理时还能识别出标题级别 + markdown_text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', markdown_text, flags=re.MULTILINE) + + # 2. 处理粗体、斜体和加粗斜体 + markdown_text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', markdown_text) # 加粗斜体 + markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text) # 加粗 + markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text) # 斜体 + markdown_text = re.sub(r'_(.+?)_', r'\1', markdown_text) # 下划线斜体 + markdown_text = re.sub(r'__(.+?)__', r'\1', markdown_text) # 下划线加粗 + + # 3. 处理代码块 - 不移除,而是简化格式 + # 多行代码块 + markdown_text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```', r'[代码块]\n\1[/代码块]', markdown_text) + # 单行代码 + markdown_text = re.sub(r'`([^`]+)`', r'[代码]\1[/代码]', markdown_text) + + # 4. 处理列表 - 保留列表结构 + # 匹配无序列表 + markdown_text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', markdown_text, flags=re.MULTILINE) + + # 5. 处理Markdown链接 + markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', markdown_text) + + # 6. 处理HTML链接 + markdown_text = re.sub(r'([^<]+)', r'\2 (\1)', + markdown_text) + + # 7. 
处理图片 + markdown_text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', markdown_text) + + return markdown_text + + +class WordFormatter: + """聊天记录Word文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012)""" + + def __init__(self): + self.doc = Document() + self._setup_document() + self._create_styles() + + def _setup_document(self): + """设置文档基本格式,包括页面设置和页眉""" + sections = self.doc.sections + for section in sections: + # 设置页面大小为A4 + section.page_width = Cm(21) + section.page_height = Cm(29.7) + # 设置页边距 + section.top_margin = Cm(3.7) # 上边距37mm + section.bottom_margin = Cm(3.5) # 下边距35mm + section.left_margin = Cm(2.8) # 左边距28mm + section.right_margin = Cm(2.6) # 右边距26mm + # 设置页眉页脚距离 + section.header_distance = Cm(2.0) + section.footer_distance = Cm(2.0) + + # 添加页眉 + header = section.header + header_para = header.paragraphs[0] + header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT + header_run = header_para.add_run("GPT-Academic对话记录") + header_run.font.name = '仿宋' + header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + header_run.font.size = Pt(9) + + def _create_styles(self): + """创建文档样式""" + # 创建正文样式 + style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH) + style.font.name = '仿宋' + style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + style.font.size = Pt(12) # 调整为12磅 + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + style.paragraph_format.space_after = Pt(0) + + # 创建问题样式 + question_style = self.doc.styles.add_style('Question_Style', WD_STYLE_TYPE.PARAGRAPH) + question_style.font.name = '黑体' + question_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + question_style.font.size = Pt(14) # 调整为14磅 + question_style.font.bold = True + question_style.paragraph_format.space_before = Pt(12) # 减小段前距 + question_style.paragraph_format.space_after = Pt(6) + question_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + question_style.paragraph_format.left_indent = Pt(0) # 移除左缩进 + + # 创建回答样式 + answer_style = self.doc.styles.add_style('Answer_Style', WD_STYLE_TYPE.PARAGRAPH) + answer_style.font.name = '仿宋' + answer_style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + answer_style.font.size = Pt(12) # 调整为12磅 + answer_style.paragraph_format.space_before = Pt(6) + answer_style.paragraph_format.space_after = Pt(12) + answer_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + answer_style.paragraph_format.left_indent = Pt(0) # 移除左缩进 + + # 创建标题样式 + title_style = self.doc.styles.add_style('Title_Custom', WD_STYLE_TYPE.PARAGRAPH) + title_style.font.name = '黑体' # 改用黑体 + title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + title_style.font.size = Pt(22) # 调整为22磅 + title_style.font.bold = True + title_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + title_style.paragraph_format.space_before = Pt(0) + title_style.paragraph_format.space_after = Pt(24) + title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + + # 添加参考文献样式 + ref_style = self.doc.styles.add_style('Reference_Style', WD_STYLE_TYPE.PARAGRAPH) + ref_style.font.name = '宋体' + ref_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体') + ref_style.font.size = Pt(10.5) # 参考文献使用小号字体 + ref_style.paragraph_format.space_before = Pt(3) + ref_style.paragraph_format.space_after = Pt(3) + ref_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE + ref_style.paragraph_format.left_indent = Pt(21) + ref_style.paragraph_format.first_line_indent = Pt(-21) + + # 添加参考文献标题样式 + ref_title_style = 
self.doc.styles.add_style('Reference_Title_Style', WD_STYLE_TYPE.PARAGRAPH) + ref_title_style.font.name = '黑体' + ref_title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + ref_title_style.font.size = Pt(16) + ref_title_style.font.bold = True + ref_title_style.paragraph_format.space_before = Pt(24) + ref_title_style.paragraph_format.space_after = Pt(12) + ref_title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + + def create_document(self, history): + """写入聊天历史""" + # 添加标题 + title_para = self.doc.add_paragraph(style='Title_Custom') + title_run = title_para.add_run('GPT-Academic 对话记录') + + # 添加日期 + date_para = self.doc.add_paragraph() + date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + date_run = date_para.add_run(datetime.now().strftime('%Y年%m月%d日')) + date_run.font.name = '仿宋' + date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + date_run.font.size = Pt(16) + + self.doc.add_paragraph() # 添加空行 + + # 添加对话内容 + for i in range(0, len(history), 2): + question = history[i] + answer = convert_markdown_to_word(history[i + 1]) + + if question: + q_para = self.doc.add_paragraph(style='Question_Style') + q_para.add_run(f'问题 {i//2 + 1}:').bold = True + q_para.add_run(str(question)) + + if answer: + a_para = self.doc.add_paragraph(style='Answer_Style') + a_para.add_run(f'回答 {i//2 + 1}:').bold = True + a_para.add_run(str(answer)) + + + return self.doc + diff --git a/crazy_functions/doc_fns/read_fns/__init__.py b/crazy_functions/doc_fns/read_fns/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crazy_functions/doc_fns/read_fns/docx_reader.py b/crazy_functions/doc_fns/read_fns/docx_reader.py new file mode 100644 index 00000000..9308940b --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/docx_reader.py @@ -0,0 +1,6 @@ +import nltk +nltk.data.path.append('~/nltk_data') +nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data', + ) +nltk.download('punkt', download_dir='~/nltk_data', + ) \ No newline at end of file diff --git a/crazy_functions/doc_fns/read_fns/excel_reader.py b/crazy_functions/doc_fns/read_fns/excel_reader.py new file mode 100644 index 00000000..d70e9d53 --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/excel_reader.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import pandas as pd +import numpy as np +from pathlib import Path +from typing import Optional, List, Set, Dict, Union, Iterator, Tuple +from dataclasses import dataclass, field +import logging +from concurrent.futures import ThreadPoolExecutor, as_completed +import chardet +from functools import lru_cache +import os + + +@dataclass +class ExtractorConfig: + """提取器配置类""" + encoding: str = 'auto' + na_filter: bool = True + skip_blank_lines: bool = True + chunk_size: int = 10000 + max_workers: int = 4 + preserve_format: bool = True + read_all_sheets: bool = True # 新增:是否读取所有工作表 + text_cleanup: Dict[str, bool] = field(default_factory=lambda: { + 'remove_extra_spaces': True, + 'normalize_whitespace': False, + 'remove_special_chars': False, + 'lowercase': False + }) + + +class ExcelTextExtractor: + """增强的Excel格式文件文本内容提取器""" + + SUPPORTED_EXTENSIONS: Set[str] = { + '.xlsx', '.xls', '.csv', '.tsv', '.xlsm', '.xltx', '.xltm', '.ods' + } + + def __init__(self, config: Optional[ExtractorConfig] = None): + self.config = config or ExtractorConfig() + self._setup_logging() + self._detect_encoding = lru_cache(maxsize=128)(self._detect_encoding) + + def _setup_logging(self) -> None: + """配置日志记录器""" + logging.basicConfig( + level=logging.INFO, + 
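+            # INFO and above go to the console via the root logger; the
+            # FileHandler added below additionally writes ERROR records to
+            # excel_extractor.log. Note that basicConfig is a no-op if the
+            # root logger was already configured elsewhere in the process.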
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + fh = logging.FileHandler('excel_extractor.log') + fh.setLevel(logging.ERROR) + self.logger.addHandler(fh) + + def _detect_encoding(self, file_path: Path) -> str: + if self.config.encoding != 'auto': + return self.config.encoding + + try: + with open(file_path, 'rb') as f: + raw_data = f.read(10000) + result = chardet.detect(raw_data) + return result['encoding'] or 'utf-8' + except Exception as e: + self.logger.warning(f"Encoding detection failed: {e}. Using utf-8") + return 'utf-8' + + def _validate_file(self, file_path: Union[str, Path]) -> Path: + path = Path(file_path).resolve() + + if not path.exists(): + raise ValueError(f"File not found: {path}") + + if not path.is_file(): + raise ValueError(f"Not a file: {path}") + + if not os.access(path, os.R_OK): + raise PermissionError(f"No read permission: {path}") + + if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS: + raise ValueError( + f"Unsupported format: {path.suffix}. " + f"Supported: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}" + ) + + return path + + def _format_value(self, value: Any) -> str: + if pd.isna(value) or value is None: + return '' + if isinstance(value, (int, float)): + return str(value) + return str(value).strip() + + def _process_chunk(self, chunk: pd.DataFrame, columns: Optional[List[str]] = None, sheet_name: str = '') -> str: + """处理数据块,新增sheet_name参数""" + try: + if columns: + chunk = chunk[columns] + + if self.config.preserve_format: + formatted_chunk = chunk.applymap(self._format_value) + rows = [] + + # 添加工作表名称作为标题 + if sheet_name: + rows.append(f"[Sheet: {sheet_name}]") + + # 添加表头 + headers = [str(col) for col in formatted_chunk.columns] + rows.append('\t'.join(headers)) + + # 添加数据行 + for _, row in formatted_chunk.iterrows(): + rows.append('\t'.join(row.values)) + + return '\n'.join(rows) + else: + flat_values = ( + chunk.astype(str) + .replace({'nan': '', 'None': '', 'NaN': ''}) + .values.flatten() + ) + return ' '.join(v for v in flat_values if v) + + except Exception as e: + self.logger.error(f"Error processing chunk: {e}") + raise + + def _read_file(self, file_path: Path) -> Union[pd.DataFrame, Iterator[pd.DataFrame], Dict[str, pd.DataFrame]]: + """读取文件,支持多工作表""" + try: + encoding = self._detect_encoding(file_path) + + if file_path.suffix.lower() in {'.csv', '.tsv'}: + sep = '\t' if file_path.suffix.lower() == '.tsv' else ',' + + # 对大文件使用分块读取 + if file_path.stat().st_size > self.config.chunk_size * 1024: + return pd.read_csv( + file_path, + encoding=encoding, + na_filter=self.config.na_filter, + skip_blank_lines=self.config.skip_blank_lines, + sep=sep, + chunksize=self.config.chunk_size, + on_bad_lines='warn' + ) + else: + return pd.read_csv( + file_path, + encoding=encoding, + na_filter=self.config.na_filter, + skip_blank_lines=self.config.skip_blank_lines, + sep=sep + ) + else: + # Excel文件处理,支持多工作表 + if self.config.read_all_sheets: + # 读取所有工作表 + return pd.read_excel( + file_path, + na_filter=self.config.na_filter, + keep_default_na=self.config.na_filter, + engine='openpyxl', + sheet_name=None # None表示读取所有工作表 + ) + else: + # 只读取第一个工作表 + return pd.read_excel( + file_path, + na_filter=self.config.na_filter, + keep_default_na=self.config.na_filter, + engine='openpyxl', + sheet_name=0 # 读取第一个工作表 + ) + + except Exception as e: + self.logger.error(f"Error reading file {file_path}: {e}") + raise + + def extract_text( + self, + file_path: Union[str, Path], + columns: Optional[List[str]] = None, + 
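+            # `separator` joins the extracted text of multiple sheets (or
+            # CSV chunks) in the returned string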
separator: str = '\n' + ) -> str: + """提取文本,支持多工作表""" + try: + path = self._validate_file(file_path) + self.logger.info(f"Processing: {path}") + + reader = self._read_file(path) + texts = [] + + # 处理Excel多工作表 + if isinstance(reader, dict): + for sheet_name, df in reader.items(): + sheet_text = self._process_chunk(df, columns, sheet_name) + if sheet_text: + texts.append(sheet_text) + return separator.join(texts) + + # 处理单个DataFrame + elif isinstance(reader, pd.DataFrame): + return self._process_chunk(reader, columns) + + # 处理DataFrame迭代器 + else: + with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor: + futures = { + executor.submit(self._process_chunk, chunk, columns): i + for i, chunk in enumerate(reader) + } + + chunk_texts = [] + for future in as_completed(futures): + try: + text = future.result() + if text: + chunk_texts.append((futures[future], text)) + except Exception as e: + self.logger.error(f"Error in chunk {futures[future]}: {e}") + + # 按块的顺序排序 + chunk_texts.sort(key=lambda x: x[0]) + texts = [text for _, text in chunk_texts] + + # 合并文本,保留格式 + if texts and self.config.preserve_format: + result = texts[0] # 第一块包含表头 + if len(texts) > 1: + # 跳过后续块的表头行 + for text in texts[1:]: + result += '\n' + '\n'.join(text.split('\n')[1:]) + return result + else: + return separator.join(texts) + + except Exception as e: + self.logger.error(f"Extraction failed: {e}") + raise + + @staticmethod + def get_supported_formats() -> List[str]: + """获取支持的文件格式列表""" + return sorted(ExcelTextExtractor.SUPPORTED_EXTENSIONS) + + +def main(): + """主函数:演示用法""" + config = ExtractorConfig( + encoding='auto', + preserve_format=True, + read_all_sheets=True, # 启用多工作表读取 + text_cleanup={ + 'remove_extra_spaces': True, + 'normalize_whitespace': False, + 'remove_special_chars': False, + 'lowercase': False + } + ) + + extractor = ExcelTextExtractor(config) + + try: + sample_file = 'example.xlsx' + if Path(sample_file).exists(): + text = extractor.extract_text( + sample_file, + columns=['title', 'content'] + ) + print("提取的文本:") + print(text) + else: + print(f"示例文件 {sample_file} 不存在") + + print("\n支持的格式:", extractor.get_supported_formats()) + + except Exception as e: + print(f"错误: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py b/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py new file mode 100644 index 00000000..b88212e2 --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/markitdown/markdown_reader.py @@ -0,0 +1,359 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Set, Dict, Union, List +from dataclasses import dataclass, field +import logging +import os +import re +import subprocess +import tempfile +import shutil + +@dataclass +class MarkdownConverterConfig: + """PDF 到 Markdown 转换器配置类 + + Attributes: + extract_images: 是否提取图片 + extract_tables: 是否尝试保留表格结构 + extract_code_blocks: 是否识别代码块 + extract_math: 是否转换数学公式 + output_dir: 输出目录路径 + image_dir: 图片保存目录路径 + paragraph_separator: 段落之间的分隔符 + text_cleanup: 文本清理选项字典 + docintel_endpoint: Document Intelligence端点URL (可选) + enable_plugins: 是否启用插件 + llm_client: LLM客户端对象 (例如OpenAI client) + llm_model: 要使用的LLM模型名称 + """ + extract_images: bool = True + extract_tables: bool = True + extract_code_blocks: bool = True + extract_math: bool = True + output_dir: str = "" + image_dir: str = "images" + paragraph_separator: str = '\n\n' + text_cleanup: Dict[str, bool] = field(default_factory=lambda: { + 
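+        # Flags read by _cleanup_text(); in this converter only
+        # remove_extra_spaces, normalize_whitespace and lowercase are
+        # currently applied, and remove_special_chars is reserved.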
'remove_extra_spaces': True, + 'normalize_whitespace': True, + 'remove_special_chars': False, + 'lowercase': False + }) + docintel_endpoint: str = "" + enable_plugins: bool = False + llm_client: Optional[object] = None + llm_model: str = "" + + +class MarkdownConverter: + """PDF 到 Markdown 转换器 + + 使用 markitdown 库实现 PDF 到 Markdown 的转换,支持多种配置选项。 + """ + + SUPPORTED_EXTENSIONS: Set[str] = { + '.pdf', + } + + def __init__(self, config: Optional[MarkdownConverterConfig] = None): + """初始化转换器 + + Args: + config: 转换器配置对象,如果为None则使用默认配置 + """ + self.config = config or MarkdownConverterConfig() + self._setup_logging() + + # 检查是否安装了 markitdown + self._check_markitdown_installation() + + def _setup_logging(self) -> None: + """配置日志记录器""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + # 添加文件处理器 + fh = logging.FileHandler('markdown_converter.log') + fh.setLevel(logging.ERROR) + self.logger.addHandler(fh) + + def _check_markitdown_installation(self) -> None: + """检查是否安装了 markitdown""" + try: + # 尝试导入 markitdown 库 + from markitdown import MarkItDown + self.logger.info("markitdown 库已安装") + except ImportError: + self.logger.warning("markitdown 库未安装,尝试安装...") + try: + subprocess.check_call(["pip", "install", "markitdown"]) + self.logger.info("markitdown 库安装成功") + from markitdown import MarkItDown + except (subprocess.SubprocessError, ImportError): + self.logger.error("无法安装 markitdown 库,请手动安装") + self.markitdown_available = False + return + + self.markitdown_available = True + + def _validate_file(self, file_path: Union[str, Path], max_size_mb: int = 100) -> Path: + """验证文件 + + Args: + file_path: 文件路径 + max_size_mb: 允许的最大文件大小(MB) + + Returns: + Path: 验证后的Path对象 + + Raises: + ValueError: 文件不存在、格式不支持或大小超限 + PermissionError: 没有读取权限 + """ + path = Path(file_path).resolve() + + if not path.exists(): + raise ValueError(f"文件不存在: {path}") + + if not path.is_file(): + raise ValueError(f"不是一个文件: {path}") + + if not os.access(path, os.R_OK): + raise PermissionError(f"没有读取权限: {path}") + + file_size_mb = path.stat().st_size / (1024 * 1024) + if file_size_mb > max_size_mb: + raise ValueError( + f"文件大小 ({file_size_mb:.1f}MB) 超过限制 {max_size_mb}MB" + ) + + if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS: + raise ValueError( + f"不支持的格式: {path.suffix}. 
" + f"支持的格式: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}" + ) + + return path + + def _cleanup_text(self, text: str) -> str: + """清理文本 + + Args: + text: 原始文本 + + Returns: + str: 清理后的文本 + """ + if self.config.text_cleanup['remove_extra_spaces']: + text = ' '.join(text.split()) + + if self.config.text_cleanup['normalize_whitespace']: + text = text.replace('\t', ' ').replace('\r', '\n') + + if self.config.text_cleanup['lowercase']: + text = text.lower() + + return text.strip() + + @staticmethod + def get_supported_formats() -> List[str]: + """获取支持的文件格式列表""" + return sorted(MarkdownConverter.SUPPORTED_EXTENSIONS) + + def convert_to_markdown( + self, + file_path: Union[str, Path], + output_path: Optional[Union[str, Path]] = None + ) -> str: + """将 PDF 转换为 Markdown + + Args: + file_path: PDF 文件路径 + output_path: 输出 Markdown 文件路径,如果为 None 则返回内容而不保存 + + Returns: + str: 转换后的 Markdown 内容 + + Raises: + Exception: 转换过程中的错误 + """ + try: + path = self._validate_file(file_path) + self.logger.info(f"处理: {path}") + + if not self.markitdown_available: + raise ImportError("markitdown 库未安装,无法进行转换") + + # 导入 markitdown 库 + from markitdown import MarkItDown + + # 准备输出目录 + if output_path: + output_path = Path(output_path) + output_dir = output_path.parent + output_dir.mkdir(parents=True, exist_ok=True) + else: + # 创建临时目录作为输出目录 + temp_dir = tempfile.mkdtemp() + output_dir = Path(temp_dir) + output_path = output_dir / f"{path.stem}.md" + + # 图片目录 + image_dir = output_dir / self.config.image_dir + image_dir.mkdir(parents=True, exist_ok=True) + + # 创建 MarkItDown 实例并进行转换 + if self.config.docintel_endpoint: + md = MarkItDown(docintel_endpoint=self.config.docintel_endpoint) + elif self.config.llm_client and self.config.llm_model: + md = MarkItDown( + enable_plugins=self.config.enable_plugins, + llm_client=self.config.llm_client, + llm_model=self.config.llm_model + ) + else: + md = MarkItDown(enable_plugins=self.config.enable_plugins) + + # 执行转换 + result = md.convert(str(path)) + markdown_content = result.text_content + + # 清理文本 + markdown_content = self._cleanup_text(markdown_content) + + # 如果需要保存到文件 + if output_path: + with open(output_path, 'w', encoding='utf-8') as f: + f.write(markdown_content) + self.logger.info(f"转换成功,输出到: {output_path}") + + return markdown_content + + except Exception as e: + self.logger.error(f"转换失败: {e}") + raise + finally: + # 如果使用了临时目录且没有指定输出路径,则清理临时目录 + if 'temp_dir' in locals() and not output_path: + shutil.rmtree(temp_dir, ignore_errors=True) + + def convert_to_markdown_and_save( + self, + file_path: Union[str, Path], + output_path: Union[str, Path] + ) -> Path: + """将 PDF 转换为 Markdown 并保存到指定路径 + + Args: + file_path: PDF 文件路径 + output_path: 输出 Markdown 文件路径 + + Returns: + Path: 输出文件的 Path 对象 + + Raises: + Exception: 转换过程中的错误 + """ + self.convert_to_markdown(file_path, output_path) + return Path(output_path) + + def batch_convert( + self, + file_paths: List[Union[str, Path]], + output_dir: Union[str, Path] + ) -> List[Path]: + """批量转换多个 PDF 文件为 Markdown + + Args: + file_paths: PDF 文件路径列表 + output_dir: 输出目录路径 + + Returns: + List[Path]: 输出文件路径列表 + + Raises: + Exception: 转换过程中的错误 + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + output_paths = [] + for file_path in file_paths: + path = Path(file_path) + output_path = output_dir / f"{path.stem}.md" + + try: + self.convert_to_markdown(file_path, output_path) + output_paths.append(output_path) + self.logger.info(f"成功转换: {path} -> {output_path}") + except Exception as e: + self.logger.error(f"转换失败 
{path}: {e}") + + return output_paths + + +def main(): + """主函数:演示用法""" + # 配置 + config = MarkdownConverterConfig( + extract_images=True, + extract_tables=True, + extract_code_blocks=True, + extract_math=True, + enable_plugins=False, + text_cleanup={ + 'remove_extra_spaces': True, + 'normalize_whitespace': True, + 'remove_special_chars': False, + 'lowercase': False + } + ) + + # 创建转换器 + converter = MarkdownConverter(config) + + # 使用示例 + try: + # 替换为实际的文件路径 + sample_file = './crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf' + if Path(sample_file).exists(): + # 转换为 Markdown 并打印内容 + markdown_content = converter.convert_to_markdown(sample_file) + print("转换后的 Markdown 内容:") + print(markdown_content[:500] + "...") # 只打印前500个字符 + + # 转换并保存到文件 + output_file = f"./output_{Path(sample_file).stem}.md" + output_path = converter.convert_to_markdown_and_save(sample_file, output_file) + print(f"\n已保存到: {output_path}") + + # 使用LLM增强的示例 (需要添加相应的导入和配置) + # try: + # from openai import OpenAI + # client = OpenAI() + # llm_config = MarkdownConverterConfig( + # llm_client=client, + # llm_model="gpt-4o" + # ) + # llm_converter = MarkdownConverter(llm_config) + # llm_result = llm_converter.convert_to_markdown("example.jpg") + # print("LLM增强的结果:") + # print(llm_result[:500] + "...") + # except ImportError: + # print("未安装OpenAI库,跳过LLM示例") + else: + print(f"示例文件 {sample_file} 不存在") + + print("\n支持的格式:", converter.get_supported_formats()) + + except Exception as e: + print(f"错误: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/__init__.py b/crazy_functions/doc_fns/read_fns/unstructured_all/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py new file mode 100644 index 00000000..bfa0180f --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_metadata_extractor.py @@ -0,0 +1,493 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Set, Dict, Union, List +from dataclasses import dataclass, field +import logging +import os +import re + +from unstructured.partition.auto import partition +from unstructured.documents.elements import ( + Text, Title, NarrativeText, ListItem, Table, + Footer, Header, PageBreak, Image, Address +) + + +@dataclass +class PaperMetadata: + """论文元数据类""" + title: str = "" + authors: List[str] = field(default_factory=list) + affiliations: List[str] = field(default_factory=list) + journal: str = "" + volume: str = "" + issue: str = "" + year: str = "" + doi: str = "" + date: str = "" + publisher: str = "" + conference: str = "" + abstract: str = "" + keywords: List[str] = field(default_factory=list) + + +@dataclass +class ExtractorConfig: + """元数据提取器配置类""" + paragraph_separator: str = '\n\n' + text_cleanup: Dict[str, bool] = field(default_factory=lambda: { + 'remove_extra_spaces': True, + 'normalize_whitespace': True, + 'remove_special_chars': False, + 'lowercase': False + }) + + +class PaperMetadataExtractor: + """论文元数据提取器 + + 使用unstructured库从多种文档格式中提取论文的标题、作者、摘要等元数据信息。 + """ + + SUPPORTED_EXTENSIONS: Set[str] = { + '.pdf', '.docx', '.doc', '.txt', '.ppt', '.pptx', + '.xlsx', '.xls', '.md', '.org', '.odt', '.rst', + '.rtf', '.epub', '.html', '.xml', '.json' + } + + # 定义论文各部分的关键词模式 + SECTION_PATTERNS = { + 'abstract': 
r'\b(摘要|abstract|summary|概要|résumé|zusammenfassung|аннотация)\b', + 'keywords': r'\b(关键词|keywords|key\s+words|关键字|mots[- ]clés|schlüsselwörter|ключевые слова)\b', + } + + def __init__(self, config: Optional[ExtractorConfig] = None): + """初始化提取器 + + Args: + config: 提取器配置对象,如果为None则使用默认配置 + """ + self.config = config or ExtractorConfig() + self._setup_logging() + + def _setup_logging(self) -> None: + """配置日志记录器""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + # 添加文件处理器 + fh = logging.FileHandler('paper_metadata_extractor.log') + fh.setLevel(logging.ERROR) + self.logger.addHandler(fh) + + def _validate_file(self, file_path: Union[str, Path], max_size_mb: int = 100) -> Path: + """验证文件 + + Args: + file_path: 文件路径 + max_size_mb: 允许的最大文件大小(MB) + + Returns: + Path: 验证后的Path对象 + + Raises: + ValueError: 文件不存在、格式不支持或大小超限 + PermissionError: 没有读取权限 + """ + path = Path(file_path).resolve() + + if not path.exists(): + raise ValueError(f"文件不存在: {path}") + + if not path.is_file(): + raise ValueError(f"不是文件: {path}") + + if not os.access(path, os.R_OK): + raise PermissionError(f"没有读取权限: {path}") + + file_size_mb = path.stat().st_size / (1024 * 1024) + if file_size_mb > max_size_mb: + raise ValueError( + f"文件大小 ({file_size_mb:.1f}MB) 超过限制 {max_size_mb}MB" + ) + + if path.suffix.lower() not in self.SUPPORTED_EXTENSIONS: + raise ValueError( + f"不支持的文件格式: {path.suffix}. " + f"支持的格式: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}" + ) + + return path + + def _cleanup_text(self, text: str) -> str: + """清理文本 + + Args: + text: 原始文本 + + Returns: + str: 清理后的文本 + """ + if self.config.text_cleanup['remove_extra_spaces']: + text = ' '.join(text.split()) + + if self.config.text_cleanup['normalize_whitespace']: + text = text.replace('\t', ' ').replace('\r', '\n') + + if self.config.text_cleanup['lowercase']: + text = text.lower() + + return text.strip() + + @staticmethod + def get_supported_formats() -> List[str]: + """获取支持的文件格式列表""" + return sorted(PaperMetadataExtractor.SUPPORTED_EXTENSIONS) + + def extract_metadata(self, file_path: Union[str, Path], strategy: str = "fast") -> PaperMetadata: + """提取论文元数据 + + Args: + file_path: 文件路径 + strategy: 提取策略 ("fast" 或 "accurate") + + Returns: + PaperMetadata: 提取的论文元数据 + + Raises: + Exception: 提取过程中的错误 + """ + try: + path = self._validate_file(file_path) + self.logger.info(f"正在处理: {path}") + + # 使用unstructured库分解文档 + elements = partition( + str(path), + strategy=strategy, + include_metadata=True, + nlp=False, + ) + + # 提取元数据 + metadata = PaperMetadata() + + # 提取标题和作者 + self._extract_title_and_authors(elements, metadata) + + # 提取摘要和关键词 + self._extract_abstract_and_keywords(elements, metadata) + + # 提取其他元数据 + self._extract_additional_metadata(elements, metadata) + + return metadata + + except Exception as e: + self.logger.error(f"元数据提取失败: {e}") + raise + + def _extract_title_and_authors(self, elements, metadata: PaperMetadata) -> None: + """从文档中提取标题和作者信息 - 改进版""" + # 收集所有潜在的标题候选 + title_candidates = [] + all_text = [] + raw_text = [] + + # 首先收集文档前30个元素的文本,用于辅助判断 + for i, element in enumerate(elements[:30]): + if isinstance(element, (Text, Title, NarrativeText)): + text = str(element).strip() + if text: + all_text.append(text) + raw_text.append(text) + + # 打印出原始文本,用于调试 + print("原始文本前10行:") + for i, text in enumerate(raw_text[:10]): + print(f"{i}: {text}") + + # 1. 
尝试查找连续的标题片段并合并它们 + i = 0 + while i < len(all_text) - 1: + current = all_text[i] + next_text = all_text[i + 1] + + # 检查是否存在标题分割情况:一行以冒号结尾,下一行像是标题的延续 + if current.endswith(':') and len(current) < 50 and len(next_text) > 5 and next_text[0].isupper(): + # 合并这两行文本 + combined_title = f"{current} {next_text}" + # 查找合并前的文本并替换 + all_text[i] = combined_title + all_text.pop(i + 1) + # 给合并后的标题很高的分数 + title_candidates.append((combined_title, 15, i)) + else: + i += 1 + + # 2. 首先尝试从标题元素中查找 + for i, element in enumerate(elements[:15]): # 只检查前15个元素 + if isinstance(element, Title): + title_text = str(element).strip() + # 排除常见的非标题内容 + if title_text.lower() not in ['abstract', '摘要', 'introduction', '引言']: + # 计算标题分数(越高越可能是真正的标题) + score = self._evaluate_title_candidate(title_text, i, element) + title_candidates.append((title_text, score, i)) + + # 3. 特别处理常见的论文标题格式 + for i, text in enumerate(all_text[:15]): + # 特别检查"KIMI K1.5:"类型的前缀标题 + if re.match(r'^[A-Z][A-Z0-9\s\.]+(\s+K\d+(\.\d+)?)?:', text): + score = 12 # 给予很高的分数 + title_candidates.append((text, score, i)) + + # 如果下一行也是全大写,很可能是标题的延续 + if i+1 < len(all_text) and all_text[i+1].isupper() and len(all_text[i+1]) > 10: + combined_title = f"{text} {all_text[i+1]}" + title_candidates.append((combined_title, 15, i)) # 给合并标题更高分数 + + # 匹配全大写的标题行 + elif text.isupper() and len(text) > 10 and len(text) < 100: + score = 10 - i * 0.5 # 越靠前越可能是标题 + title_candidates.append((text, score, i)) + + # 对标题候选按分数排序并选取最佳候选 + if title_candidates: + title_candidates.sort(key=lambda x: x[1], reverse=True) + metadata.title = title_candidates[0][0] + title_position = title_candidates[0][2] + print(f"所有标题候选: {title_candidates[:3]}") + else: + # 如果没有找到合适的标题,使用一个备选策略 + for text in all_text[:10]: + if text.isupper() and len(text) > 10 and len(text) < 200: # 大写且适当长度的文本 + metadata.title = text + break + title_position = 0 + + # 提取作者信息 - 改进后的作者提取逻辑 + author_candidates = [] + + # 1. 特别处理"TECHNICAL REPORT OF"之后的行,通常是作者或团队 + for i, text in enumerate(all_text): + if "TECHNICAL REPORT" in text.upper() and i+1 < len(all_text): + team_text = all_text[i+1].strip() + if re.search(r'\b(team|group|lab)\b', team_text, re.IGNORECASE): + author_candidates.append((team_text, 15)) + + # 2. 
查找包含Team的文本 + for text in all_text[:20]: + if "Team" in text and len(text) < 30: + # 这很可能是团队名 + author_candidates.append((text, 12)) + + # 添加作者到元数据 + if author_candidates: + # 按分数排序 + author_candidates.sort(key=lambda x: x[1], reverse=True) + + # 去重 + seen_authors = set() + for author, _ in author_candidates: + if author.lower() not in seen_authors and not author.isdigit(): + seen_authors.add(author.lower()) + metadata.authors.append(author) + + # 如果没有找到作者,尝试查找隶属机构信息中的团队名称 + if not metadata.authors: + for text in all_text[:20]: + if re.search(r'\b(team|group|lab|laboratory|研究组|团队)\b', text, re.IGNORECASE): + if len(text) < 50: # 避免太长的文本 + metadata.authors.append(text.strip()) + break + + # 提取隶属机构信息 + for i, element in enumerate(elements[:30]): + element_text = str(element).strip() + if re.search(r'(university|institute|department|school|laboratory|college|center|centre|\d{5,}|^[a-zA-Z]+@|学院|大学|研究所|研究院)', element_text, re.IGNORECASE): + # 可能是隶属机构 + if element_text not in metadata.affiliations and len(element_text) > 10: + metadata.affiliations.append(element_text) + + def _evaluate_title_candidate(self, text, position, element): + """评估标题候选项的可能性分数""" + score = 0 + + # 位置因素:越靠前越可能是标题 + score += max(0, 10 - position) * 0.5 + + # 长度因素:标题通常不会太短也不会太长 + if 10 <= len(text) <= 150: + score += 3 + elif len(text) < 10: + score -= 2 + elif len(text) > 150: + score -= 3 + + # 格式因素 + if text.isupper(): # 全大写可能是标题 + score += 2 + if re.match(r'^[A-Z]', text): # 首字母大写 + score += 1 + if ':' in text: # 标题常包含冒号 + score += 1.5 + + # 内容因素 + if re.search(r'\b(scaling|learning|model|approach|method|system|framework|analysis)\b', text.lower()): + score += 2 # 包含常见的学术论文关键词 + + # 避免误判 + if re.match(r'^\d+$', text): # 纯数字 + score -= 10 + if re.search(r'^(http|www|doi)', text.lower()): # URL或DOI + score -= 5 + if len(text.split()) <= 2 and len(text) < 15: # 太短的短语 + score -= 3 + + # 元数据因素(如果有) + if hasattr(element, 'metadata') and element.metadata: + # 修复:正确处理ElementMetadata对象 + try: + # 尝试通过getattr安全地获取属性 + font_size = getattr(element.metadata, 'font_size', None) + if font_size is not None and font_size > 14: # 假设标准字体大小是12 + score += 3 + + font_weight = getattr(element.metadata, 'font_weight', None) + if font_weight == 'bold': + score += 2 # 粗体加分 + except (AttributeError, TypeError): + # 如果metadata的访问方式不正确,尝试其他可能的访问方式 + try: + metadata_dict = element.metadata.__dict__ if hasattr(element.metadata, '__dict__') else {} + if 'font_size' in metadata_dict and metadata_dict['font_size'] > 14: + score += 3 + if 'font_weight' in metadata_dict and metadata_dict['font_weight'] == 'bold': + score += 2 + except Exception: + # 如果所有尝试都失败,忽略元数据处理 + pass + + return score + + def _extract_abstract_and_keywords(self, elements, metadata: PaperMetadata) -> None: + """从文档中提取摘要和关键词""" + abstract_found = False + keywords_found = False + abstract_text = [] + + for i, element in enumerate(elements): + element_text = str(element).strip().lower() + + # 寻找摘要部分 + if not abstract_found and ( + isinstance(element, Title) and + re.search(self.SECTION_PATTERNS['abstract'], element_text, re.IGNORECASE) + ): + abstract_found = True + continue + + # 如果找到摘要部分,收集内容直到遇到关键词部分或新章节 + if abstract_found and not keywords_found: + # 检查是否遇到关键词部分或新章节 + if ( + isinstance(element, Title) or + re.search(self.SECTION_PATTERNS['keywords'], element_text, re.IGNORECASE) or + re.match(r'\b(introduction|引言|method|方法)\b', element_text, re.IGNORECASE) + ): + keywords_found = re.search(self.SECTION_PATTERNS['keywords'], element_text, re.IGNORECASE) + abstract_found = False # 
停止收集摘要 + else: + # 收集摘要文本 + if isinstance(element, (Text, NarrativeText)) and element_text: + abstract_text.append(element_text) + + # 如果找到关键词部分,提取关键词 + if keywords_found and not abstract_found and not metadata.keywords: + if isinstance(element, (Text, NarrativeText)): + # 清除可能的"关键词:"/"Keywords:"前缀 + cleaned_text = re.sub(r'^\s*(关键词|keywords|key\s+words)\s*[::]\s*', '', element_text, flags=re.IGNORECASE) + + # 尝试按不同分隔符分割 + for separator in [';', ';', ',', ',']: + if separator in cleaned_text: + metadata.keywords = [k.strip() for k in cleaned_text.split(separator) if k.strip()] + break + + # 如果未能分割,将整个文本作为一个关键词 + if not metadata.keywords and cleaned_text: + metadata.keywords = [cleaned_text] + + keywords_found = False # 已提取关键词,停止处理 + + # 设置摘要文本 + if abstract_text: + metadata.abstract = self.config.paragraph_separator.join(abstract_text) + + def _extract_additional_metadata(self, elements, metadata: PaperMetadata) -> None: + """提取其他元数据信息""" + for element in elements[:30]: # 只检查文档前部分 + element_text = str(element).strip() + + # 尝试匹配DOI + doi_match = re.search(r'(doi|DOI):\s*(10\.\d{4,}\/[a-zA-Z0-9.-]+)', element_text) + if doi_match and not metadata.doi: + metadata.doi = doi_match.group(2) + + # 尝试匹配日期 + date_match = re.search(r'(published|received|accepted|submitted):\s*(\d{1,2}\s+[a-zA-Z]+\s+\d{4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})', element_text, re.IGNORECASE) + if date_match and not metadata.date: + metadata.date = date_match.group(2) + + # 尝试匹配年份 + year_match = re.search(r'\b(19|20)\d{2}\b', element_text) + if year_match and not metadata.year: + metadata.year = year_match.group(0) + + # 尝试匹配期刊/会议名称 + journal_match = re.search(r'(journal|conference):\s*([^,;.]+)', element_text, re.IGNORECASE) + if journal_match: + if "journal" in journal_match.group(1).lower() and not metadata.journal: + metadata.journal = journal_match.group(2).strip() + elif not metadata.conference: + metadata.conference = journal_match.group(2).strip() + + +def main(): + """主函数:演示用法""" + # 创建提取器 + extractor = PaperMetadataExtractor() + + # 使用示例 + try: + # 替换为实际的文件路径 + sample_file = '/Users/boyin.liu/Documents/示例文档/论文/3.pdf' + if Path(sample_file).exists(): + metadata = extractor.extract_metadata(sample_file) + print("提取的元数据:") + print(f"标题: {metadata.title}") + print(f"作者: {', '.join(metadata.authors)}") + print(f"机构: {', '.join(metadata.affiliations)}") + print(f"摘要: {metadata.abstract[:200]}...") + print(f"关键词: {', '.join(metadata.keywords)}") + print(f"DOI: {metadata.doi}") + print(f"日期: {metadata.date}") + print(f"年份: {metadata.year}") + print(f"期刊: {metadata.journal}") + print(f"会议: {metadata.conference}") + else: + print(f"示例文件 {sample_file} 不存在") + + print("\n支持的格式:", extractor.get_supported_formats()) + + except Exception as e: + print(f"错误: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py new file mode 100644 index 00000000..e5ee7cb2 --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/unstructured_all/paper_structure_extractor.py @@ -0,0 +1,1220 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Set, Dict, Union, List, Tuple, Any +from dataclasses import dataclass, field +import logging +import os +import re + +from unstructured.partition.auto import partition +from unstructured.documents.elements import ( + Text, Title, NarrativeText, ListItem, Table, + Footer, Header, 
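+    # Element classes produced by unstructured's partition(); Title/Text/
+    # NarrativeText drive section detection, while Table and Image drive
+    # the figure/table extraction below.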
PageBreak, Image, Address +) + +# 引入元数据提取器 +from crazy_functions.doc_fns.read_fns.unstructured_all.paper_metadata_extractor import PaperMetadata, PaperMetadataExtractor + + +@dataclass +class PaperSection: + """论文章节数据类""" + section_type: str # 章节类型,如"abstract", "introduction", "method", "result", "discussion", "conclusion", "references"等 + title: str # 章节标题 + content: str # 章节内容 + level: int = 0 # 标题级别,0为主标题,1为一级标题,以此类推 + subsections: List['PaperSection'] = field(default_factory=list) # 子章节列表 + + +@dataclass +class Figure: + """论文图表数据类""" + id: str # 图表ID,如"Figure 1" + caption: str # 图表标题 + content: str # 图表描述内容 + position: int # 在文档中的位置索引 + + +@dataclass +class Formula: + """论文公式数据类""" + id: str # 公式ID,如"(1)" + content: str # 公式内容 + position: int # 在文档中的位置索引 + + +@dataclass +class Reference: + """参考文献数据类""" + id: str = "" # 引用编号,如"[1]" + text: str = "" # 完整引用文本 + title: str = "" # 文献标题 + authors: List[str] = field(default_factory=list) # 作者列表 + year: str = "" # 出版年份 + source: str = "" # 来源(期刊、会议等) + + +@dataclass +class StructuredPaper: + """结构化论文数据类""" + metadata: PaperMetadata = field(default_factory=PaperMetadata) + sections: List[PaperSection] = field(default_factory=list) + figures: List[Figure] = field(default_factory=list) + tables: List[Figure] = field(default_factory=list) + formulas: List[Formula] = field(default_factory=list) + references: List[Reference] = field(default_factory=list) + full_text: str = "" + keywords: List[str] = field(default_factory=list) + + +@dataclass +class ExtractorConfig: + """提取器配置类""" + extract_figures: bool = True + extract_tables: bool = True + extract_formulas: bool = True + extract_references: bool = True + paragraph_separator: str = '\n\n' + text_cleanup: Dict[str, bool] = field(default_factory=lambda: { + 'remove_extra_spaces': True, + 'normalize_whitespace': True, + 'remove_special_chars': False, + 'lowercase': False + }) + + +class PaperStructureExtractor: + """论文结构提取器 + + 从各种文档格式中提取论文的完整结构化信息,包括元数据、章节结构、图表、公式、参考文献等。 + """ + + # 定义论文各部分的关键词模式 + PAPER_SECTION_PATTERNS = { + 'abstract': r'\b(摘要|abstract|summary|概要|résumé|zusammenfassung|аннотация)\b', + 'keywords': r'\b(关键词|keywords|key\s+words|关键字|mots[- ]clés|schlüsselwörter|ключевые слова)\b', + 'introduction': r'\b(引言|介绍|绪论|introduction|background|引言:|概述|einleitung|введение)\b', + 'related_work': r'\b(相关工作|related\s+work|literature\s+review|研究现状|prior\s+work|verwandte arbeiten|предыдущие работы)\b', + 'method': r'\b(方法|材料与方法|methodology|materials\s+and\s+methods|methods|approach|experimental|实验|算法|algorithm|方法:|研究方法|methoden|методы)\b', + 'result': r'\b(结果|results|findings|observations|实验结果|结果与分析|ergebnisse|результаты)\b', + 'discussion': r'\b(讨论|discussion|analysis|interpretation|分析|讨论与分析|diskussion|обсуждение)\b', + 'conclusion': r'\b(结论|总结|conclusion|summary|concluding\s+remarks|结语|总结与展望|schlussfolgerung|заключение)\b', + 'references': r'\b(参考文献|references|bibliography|引用|citation|文献|literatur|литература)\b', + 'acknowledgement': r'\b(致谢|acknowledgement|acknowledgment|鸣谢|acknowledgements|danksagung|благодарности)\b', + 'appendix': r'\b(附录|appendix|supplementary|补充材料|appendices|anhang|приложение)\b', + 'table': r'\b(表\s*\d+|table\s*\d+|tabelle\s*\d+|таблица\s*\d+)\b', + 'figure': r'\b(图\s*\d+|figure\s*\d+|fig.\s*\d+|abbildung\s*\d+|рисунок\s*\d+)\b' + } + + SUPPORTED_EXTENSIONS = PaperMetadataExtractor.SUPPORTED_EXTENSIONS + + def __init__(self, config: Optional[ExtractorConfig] = None): + """初始化提取器 + + Args: + config: 提取器配置对象,如果为None则使用默认配置 + """ + self.config = config or 
ExtractorConfig() + self.metadata_extractor = PaperMetadataExtractor() + self._setup_logging() + + def _setup_logging(self) -> None: + """配置日志记录器""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + # 添加文件处理器 + fh = logging.FileHandler('paper_structure_extractor.log') + fh.setLevel(logging.ERROR) + self.logger.addHandler(fh) + + def _cleanup_text(self, text: str) -> str: + """清理文本 + + Args: + text: 原始文本 + + Returns: + str: 清理后的文本 + """ + if self.config.text_cleanup['remove_extra_spaces']: + text = ' '.join(text.split()) + + if self.config.text_cleanup['normalize_whitespace']: + text = text.replace('\t', ' ').replace('\r', '\n') + + if self.config.text_cleanup['remove_special_chars']: + # 只保留字母、数字、基本标点和中文字符 + text = re.sub(r'[^\w\s.,;:!?,。;:!?、\u4e00-\u9fff]', '', text) + + if self.config.text_cleanup['lowercase']: + text = text.lower() + + return text.strip() + + @staticmethod + def get_supported_formats() -> List[str]: + """获取支持的文件格式列表""" + return sorted(PaperStructureExtractor.SUPPORTED_EXTENSIONS) + + def extract_paper_structure(self, file_path: Union[str, Path], strategy: str = "fast") -> StructuredPaper: + """提取论文的完整结构化信息 + + Args: + file_path: 文件路径 + strategy: 提取策略 ("fast" 或 "accurate") + + Returns: + StructuredPaper: 结构化的论文数据 + + Raises: + Exception: 提取过程中的错误 + """ + try: + path = Path(file_path).resolve() + self.logger.info(f"正在处理论文结构: {path}") + + # 创建结构化论文对象 + paper = StructuredPaper() + + # 提取元数据 + paper.metadata = self.metadata_extractor.extract_metadata(path, strategy) + + # 使用unstructured库分解文档 + elements = partition( + str(path), + strategy=strategy, + include_metadata=True, + nlp=False, + ) + + # 提取关键词 + paper.keywords = paper.metadata.keywords + + # 提取章节结构 + paper.sections = self._extract_sections(elements) + + # 提取图表 + if self.config.extract_figures: + paper.figures, paper.tables = self._extract_figures_and_tables(elements) + + # 提取公式 + if self.config.extract_formulas: + paper.formulas = self._extract_formulas(elements) + + # 提取参考文献 + if self.config.extract_references: + paper.references = self._extract_references(elements) + + # 提取完整文本 + paper.full_text = self._extract_full_text(elements) + + return paper + + except Exception as e: + self.logger.error(f"结构提取失败: {e}") + raise + + def _extract_sections(self, elements) -> List[PaperSection]: + """提取论文的章节结构 + + Args: + elements: 文档元素列表 + + Returns: + List[PaperSection]: 章节列表 + """ + # 第一遍:识别所有标题元素 + title_elements = [] + for i, element in enumerate(elements): + if isinstance(element, Title): + title_text = str(element).strip() + + # 添加过滤条件,排除非章节标题 + if self._is_likely_section_title(title_text, element, i, elements): + section_type = self._identify_section_type(title_text) + level = self._estimate_title_level(element, elements) + title_elements.append((i, title_text, section_type, level, element)) + + # 按层级排序,确保层级低的(数字大的)在后面 + title_elements.sort(key=lambda x: (x[0], x[3])) + + # 第二遍:创建章节内容 + sections = [] + for i, (index, title_text, section_type, level, element) in enumerate(title_elements): + # 提取章节内容 + content = "" + if i < len(title_elements) - 1: + # 提取到下一章节开始 + next_index = title_elements[i+1][0] + content = self._extract_content_between_indices(elements, index+1, next_index) + else: + # 提取到文档结束 + content = self._extract_content_after_index(elements, index+1) + + # 创建章节对象 + section = PaperSection( + section_type=section_type, + title=title_text, + content=content, + level=level, + subsections=[] + ) + 
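+            # Sections are first collected as a flat list; their levels are
+            # used below by _build_section_hierarchy to nest subsections.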
sections.append(section) + + # 构建章节层次结构 + hierarchical_sections = self._build_section_hierarchy(sections) + return hierarchical_sections + + def _is_likely_section_title(self, title_text: str, element, index: int, elements) -> bool: + """判断标题是否可能是章节标题""" + title_lower = title_text.lower() + + # 首先检查是否在参考文献部分 + if self._is_in_references_section(index, elements): + # 参考文献部分的标题处理策略: + # 1. 只有特定格式的标题才被接受 + # 2. 通常参考文献中的内容不应被识别为标题 + + # 检查是否是有效的参考文献标题格式 + valid_ref_title_patterns = [ + r'^references$', + r'^bibliography$', + r'^参考文献$', + r'^\d+\.\s*references$', + r'^文献$', + r'^引用文献$' + ] + + is_valid_ref_title = any(re.match(pattern, title_lower) for pattern in valid_ref_title_patterns) + + # 在参考文献部分,除非是明确的子分类标题,否则都不认为是标题 + if not is_valid_ref_title: + # 检查特定格式:常见的参考文献子类别 + ref_subcategory_patterns = [ + r'^primary\s+sources$', + r'^secondary\s+sources$', + r'^books$', + r'^journals$', + r'^conference\s+papers$', + r'^web\s+sources$', + r'^further\s+reading$', + r'^monographs$' + ] + + is_ref_subcategory = any(re.match(pattern, title_lower) for pattern in ref_subcategory_patterns) + + # 如果不是子类别标题,在参考文献部分很可能不是标题 + if not is_ref_subcategory: + # 检查是否包含出版物特征(会议、期刊、年份等) + pub_features = [ + r'conference', r'proceedings', r'journal', r'transactions', + r'symposium', r'workshop', r'international', r'annual', + r'\d{4}', r'pp\.', r'vol\.', r'pages', r'ieee', r'acm' + ] + + has_pub_features = any(re.search(pattern, title_lower) for pattern in pub_features) + + if has_pub_features: + return False + + # 检查文本长度和格式特征 + if len(title_text) > 50 or title_text.count(' ') > 10: + return False + + # 检查是否包含DOI、arXiv等标识 + if re.search(r'doi|arxiv|http|url|issn|isbn', title_lower): + return False + + # 检查是否为数学表达式(例如"max θ")- 保留现有的模式检测 + math_expr_patterns = [ + r'^(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+[a-zA-Z\u0370-\u03FF\u0400-\u04FF θΘ]+$', + r'^E\s*\(', # 期望值表达式开头 + r'^∑|∏|∫|∂|∇|∆', # 以数学符号开头 + r'^\s*\([a-zA-Z0-9]\)\s*$', # 如 (a), (1) 等单个字母/数字的标识 + ] + + # 如果匹配任何数学表达式模式,不太可能是章节标题 + for pattern in math_expr_patterns: + if re.search(pattern, title_text): + return False + + # 检查标题文本本身是否过短(短标题通常不是章节标题,除非是明确的关键词) + if len(title_text) < 4 and not re.match(r'^(abstract|introduction|methods?|results?|discussion|conclusion|references)$', title_lower, re.IGNORECASE): + return False + + # 标题中包含括号、大量符号等可能是公式 + if re.search(r'[)}]n$|[([{)\]}].*[([{)\]}]|\d+[=><≥≤]|[a-z]\s*=', title_text): + return False + + # =================== 增强后续内容长度检查 =================== + # 查找下一个非空元素 + next_elements = [] + total_followup_content = "" + next_title_index = -1 + + # 收集标题后的内容,直到遇到另一个标题或超过限制 + for i in range(index+1, min(index+10, len(elements))): + if str(elements[i]).strip(): + next_elements.append(elements[i]) + if not isinstance(elements[i], Title): + total_followup_content += str(elements[i]) + else: + next_title_index = i + break + + # 核心检查:标题后内容长度判断 + # 1. 如果后面没有内容,这不太可能是标题 + if not next_elements: + return False + + # 2. 
如果后面第一个元素不是标题但内容很短(少于100字符) + if next_elements and not isinstance(next_elements[0], Title): + first_element_length = len(str(next_elements[0])) + # 检查是否存在第二个非标题元素,如果没有或内容同样很短 + if (len(next_elements) == 1 or + (len(next_elements) > 1 and not isinstance(next_elements[1], Title) and + len(str(next_elements[1])) < 50)): + # 如果后续内容总长度小于阈值,可能不是真正的标题 + if first_element_length < 100 and len(total_followup_content) < 150: + # 只有常见章节标题可以例外 + section_type = self._identify_section_type(title_text) + main_sections = ['abstract', 'introduction', 'method', 'result', + 'discussion', 'conclusion', 'references', 'acknowledgement'] + if section_type not in main_sections: + # 额外检查:如果紧接着的内容包含数学符号,更可能是公式的一部分 + if re.search(r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡∀∃∄⊂⊃∈∉]|i\s*=|x\s*[ij]|y\s*[ij*]|\(\d+\)', str(next_elements[0])): + return False + # 检查标题文本是否包含可疑的数学符号或编号 + if re.search(r'[(){}\[\]∑∏∫i]|^\w{1,2}$', title_text): + return False + + # 最后根据总体内容长度判断 + if len(total_followup_content) < 150: + return False + + # 3. 如果后面第一个元素是标题,检查级别关系 + elif next_elements and isinstance(next_elements[0], Title): + # 获取当前和下一个标题的级别 + current_level = self._estimate_title_level(element, elements) + next_level = self._estimate_title_level(next_elements[0], elements) + + # 如果下一个标题级别不是子标题(级别更大),当前标题可能是有问题的 + if next_level <= current_level: + # 检查前后是否有更多数学内容 + if self._surrounding_has_math_symbols(index, elements): + return False + + # 对于非主要章节标题特别严格 + section_type = self._identify_section_type(title_text) + if section_type not in ['abstract', 'introduction', 'method', 'result', 'discussion', 'conclusion', 'references']: + # 检查标题文本是否匹配常见章节编号模式 + if not re.match(r'^\d+(\.\d+)*\.\s+', title_text): + return False + + # 定义明确的非章节标题模式 + non_section_patterns = [ + r'received|accepted|submitted|revised|published', + r'key\s*words|keywords', + r'^(table|表)\s*\d+', + r'^(figure|fig\.|图)\s*\d+', + r'^p[- ]value', # P值通常不是章节 + r'^(age|sex|gender|stage)(\s+|:)', # 表格中的变量名 + r'male\s+female', # 表格内容 + r'≤|≥', # 表格中的比较符号 + r'^not applicable\.?$', # "Not applicable" 文本 + r'^[t](\d+)', # T1, T2等肿瘤分期不是章节 + r'^[nm](\d+)', # N0, M1等肿瘤分期不是章节 + ] + + # 如果匹配任何非章节模式,返回False + for pattern in non_section_patterns: + if re.search(pattern, title_lower, re.IGNORECASE): + return False + + # 检查是否为表格内容的更强化逻辑 + + # 1. 检查前后文本模式 - 表格行通常有一定的模式 + + # 检查前面的元素 - 如果前面几个元素都是Title且长度相似,可能是表格 + similar_title_count = 0 + if index > 1: + for i in range(max(0, index-5), index): + if isinstance(elements[i], Title): + prev_title_text = str(elements[i]).strip() + # 检查长度是否相似 + if 0.7 <= len(prev_title_text) / len(title_text) <= 1.3: + similar_title_count += 1 + # 检查格式是否相似(例如都是由空格分隔的几个词) + if len(prev_title_text.split()) == len(title_text.split()): + similar_title_count += 1 + + # 检查后面的元素 - 如果后面几个元素都是Title且长度相似,可能是表格 + if index < len(elements) - 1: + for i in range(index+1, min(index+5, len(elements))): + if isinstance(elements[i], Title): + next_title_text = str(elements[i]).strip() + # 检查长度是否相似 + if 0.7 <= len(next_title_text) / len(title_text) <= 1.3: + similar_title_count += 1 + # 检查格式是否相似 + if len(next_title_text.split()) == len(title_text.split()): + similar_title_count += 1 + + # 如果周围有多个相似的Title元素,可能是表格内容 + if similar_title_count >= 4: + return False + + # 2. 
检查内容特征 - 表格行通常有特定的特征 + + # 检查是否像表格数据行 + if len(title_text) < 40: # 表格行通常不会太长 + words = title_text.split() + + # 表格可能格式: "项目 数值 数值" 或 "组别 n 百分比" 等 + if 2 <= len(words) <= 6: + # 检查是否包含数字或百分比 - 表格行特征 + has_numbers = any(re.search(r'\d', word) for word in words) + has_percentages = '%' in title_text + + # 检查短词占比 - 表格行通常是短词 + short_words_ratio = sum(1 for word in words if len(word) <= 5) / len(words) + + # 综合判断 + if (has_numbers or has_percentages) and short_words_ratio > 0.6: + # 再检查内容长度 - 表格行后通常没有长内容 + followup_content_length = self._calculate_followup_content_length(index, elements, max_elements=3) + if followup_content_length < 100: + return False + + # 3. 检查前后内容长度 + + # 计算前面内容长度 + preceding_content_length = 0 + for i in range(max(0, index-3), index): + if isinstance(elements[i], (Text, NarrativeText)): + preceding_content_length += len(str(elements[i])) + + # 计算后面内容长度 + followup_content_length = self._calculate_followup_content_length(index, elements) + + # 真正的章节标题前面通常是另一章节的结尾(有少量文本)或文档开始,后面有大量文本 + if preceding_content_length > 200 and followup_content_length < 150: + # 如果前面有大量文本,后面文本很少,可能不是章节标题 + return False + + # 标题应该有足够长的后续内容(除非是参考文献等特殊章节) + section_type = self._identify_section_type(title_text) + main_sections = ['abstract', 'introduction', 'method', 'result', + 'discussion', 'conclusion', 'references', 'acknowledgement'] + + if section_type in ['references', 'acknowledgement']: + return True # 特殊章节不需要内容长度检查 + + # 其他章节,根据章节类型和编号情况进行判断 + if section_type in main_sections: + return followup_content_length >= 200 # 主要章节要求200字符以上 + elif re.match(r'^\d+(\.\d+)*\.?\s+', title_text): # 带编号的章节 + return followup_content_length >= 150 # 编号章节要求150字符以上 + else: + return followup_content_length >= 300 # 其他可能章节要求300字符以上 + + def _calculate_followup_content_length(self, index: int, elements, max_elements: int = 10) -> int: + """计算标题后面的内容长度 + + Args: + index: 标题在元素列表中的索引 + elements: 所有元素列表 + max_elements: 最多检查后续多少个元素 + + Returns: + int: 内容长度 + """ + content_length = 0 + for i in range(index + 1, min(index + max_elements + 1, len(elements))): + if isinstance(elements[i], Title): + # 如果遇到另一个标题,停止计算 + break + if isinstance(elements[i], (Text, NarrativeText)): + content_length += len(str(elements[i])) + return content_length + + def _identify_section_type(self, title_text: str) -> str: + """根据标题文本识别章节类型""" + title_lower = title_text.lower() + + for section_type, pattern in self.PAPER_SECTION_PATTERNS.items(): + if re.search(pattern, title_lower): + return section_type + + # 尝试识别编号章节 + if re.match(r'^(\d+\.|\d+\s+)', title_lower): + # 如果是数字开头,可能是正文章节 + return "content" + + return "other" + + def _estimate_title_level(self, title_element, all_elements) -> int: + """估计标题的层级""" + title_text = str(title_element).strip() + + # 通过标题文本中的编号格式判断层级 + # 查找诸如 "1."、"1.1"、"1.1.1" 等模式 + level_patterns = [ + (r'^(\d+\.?\s+)', 1), # 1. 或 1 开头为一级标题 + (r'^(\d+\.\d+\.?\s+)', 2), # 1.1. 或 1.1 开头为二级标题 + (r'^(\d+\.\d+\.\d+\.?\s+)', 3), # 1.1.1. 或 1.1.1 开头为三级标题 + (r'^(\d+\.\d+\.\d+\.\d+\.?\s+)', 4), # 1.1.1.1. 
+        for pattern, level in level_patterns:
+            if re.match(pattern, title_text):
+                return level
+
+        # Check whether the title is a common major section heading
+        main_sections = {'abstract', 'introduction', 'method', 'result', 'discussion', 'conclusion', 'references'}
+        if self._identify_section_type(title_text) in main_sections:
+            return 1
+
+        # Check the font size (if present in the metadata)
+        if hasattr(title_element, 'metadata') and title_element.metadata:
+            try:
+                # Try to read the font-size information
+                font_size = getattr(title_element.metadata, 'font_size', None)
+                if font_size is not None:
+                    # Decide the level from the font size (larger fonts mean higher-level headings)
+                    if font_size > 16:
+                        return 1
+                    elif font_size > 14:
+                        return 2
+                    else:
+                        return 3
+            except (AttributeError, TypeError):
+                pass
+
+        # Default to a level-1 heading
+        return 1
+
+    def _extract_content_between_indices(self, elements, start_index: int, end_index: int) -> str:
+        """Extract the content within the given index range"""
+        content_parts = []
+
+        for i in range(start_index, end_index):
+            element = elements[i]
+            if isinstance(element, (Text, NarrativeText, ListItem, Table)):
+                content_parts.append(self._cleanup_text(str(element)))
+
+        return self.config.paragraph_separator.join(content_parts)
+
+    def _extract_content_after_index(self, elements, start_index: int) -> str:
+        """Extract the content from the given index to the end of the document"""
+        content_parts = []
+
+        for i in range(start_index, len(elements)):
+            element = elements[i]
+            if isinstance(element, (Text, NarrativeText, ListItem, Table)):
+                content_parts.append(self._cleanup_text(str(element)))
+
+        return self.config.paragraph_separator.join(content_parts)
+
+    def _build_section_hierarchy(self, sections: List[PaperSection]) -> List[PaperSection]:
+        """Build the hierarchical structure of the sections"""
+        if not sections:
+            return []
+
+        # Walk the sections in document order
+        root_sections = []
+        current_parents = {0: None}  # Current parent node for each level
+
+        for section in sections:
+            # Find the parent node of the current section
+            parent_level = None
+            for level in sorted([k for k in current_parents.keys() if k < section.level], reverse=True):
+                parent_level = level
+                break
+
+            if parent_level is None:
+                # Top-level node
+                root_sections.append(section)
+            else:
+                # Attach as a child node
+                parent = current_parents[parent_level]
+                if parent:
+                    parent.subsections.append(section)
+                else:
+                    root_sections.append(section)
+
+            # Update the current parent for this level
+            current_parents[section.level] = section
+
+            # Clear the cached parents of all deeper levels
+            deeper_levels = [k for k in current_parents.keys() if k > section.level]
+            for level in deeper_levels:
+                current_parents.pop(level, None)
+
+        return root_sections
+
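+    # Illustrative sketch (hypothetical data): for flat sections with levels
+    # [1, 2, 2, 1] - "1 Intro", "1.1 Background", "1.2 Scope", "2 Methods" -
+    # _build_section_hierarchy returns "1 Intro" and "2 Methods" as roots and
+    # attaches "1.1" and "1.2" to the subsections of "1 Intro".
+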
+    def _extract_figures_and_tables(self, elements) -> Tuple[List[Figure], List[Figure]]:
+        """Extract figure and table information from the document"""
+        figures = []
+        tables = []
+
+        for i, element in enumerate(elements):
+            element_text = str(element).strip()
+
+            # Look for figure/table markers
+            fig_match = re.match(r'^(figure|fig\.|图)\s*(\d+)[.:](.*)', element_text, re.IGNORECASE)
+            table_match = re.match(r'^(table|表)\s*(\d+)[.:](.*)', element_text, re.IGNORECASE)
+
+            if fig_match:
+                fig_id = f"{fig_match.group(1)} {fig_match.group(2)}"
+                caption = fig_match.group(3).strip()
+
+                # Look for the figure description (usually right below the figure marker)
+                description = ""
+                for j in range(i+1, min(i+5, len(elements))):
+                    next_text = str(elements[j]).strip()
+                    if isinstance(elements[j], (Title, Table)) or re.match(r'^(figure|fig\.|table|图|表)\s*\d+', next_text, re.IGNORECASE):
+                        break
+                    description += next_text + " "
+
+                figures.append(Figure(
+                    id=fig_id,
+                    caption=caption,
+                    content=description.strip(),
+                    position=i
+                ))
+
+            elif table_match:
+                table_id = f"{table_match.group(1)} {table_match.group(2)}"
+                caption = table_match.group(3).strip()
+
+                # For tables, try to capture the table content itself
+                table_content = ""
+                if i+1 < len(elements) and isinstance(elements[i+1], Table):
+                    table_content = str(elements[i+1])
+
+                tables.append(Figure(
+                    id=table_id,
+                    caption=caption,
+                    content=table_content,
+                    position=i
+                ))
+
+            # Check whether the element itself is a table
+            elif isinstance(element, Table):
+                # Look for the table caption (usually right before the table)
+                caption = ""
+                if i > 0:
+                    prev_text = str(elements[i-1]).strip()
+                    if re.match(r'^(table|表)\s*\d+', prev_text, re.IGNORECASE):
+                        caption = prev_text
+
+                if not caption:
+                    caption = f"Table {len(tables) + 1}"
+
+                tables.append(Figure(
+                    id=f"Table {len(tables) + 1}",
+                    caption=caption,
+                    content=element_text,
+                    position=i
+                ))
+
+            # Check whether the element itself is an image
+            elif isinstance(element, Image):
+                # Look for the figure caption (usually right before or after the image)
+                caption = ""
+                for j in range(max(0, i-2), min(i+3, len(elements))):
+                    if j != i:
+                        j_text = str(elements[j]).strip()
+                        if re.match(r'^(figure|fig\.|图)\s*\d+', j_text, re.IGNORECASE):
+                            caption = j_text
+                            break
+
+                if not caption:
+                    caption = f"Figure {len(figures) + 1}"
+
+                figures.append(Figure(
+                    id=f"Figure {len(figures) + 1}",
+                    caption=caption,
+                    content="[Image]",
+                    position=i
+                ))
+
+        return figures, tables
+
+    def _surrounding_has_math_symbols(self, index: int, elements, window: int = 3) -> bool:
+        """Check whether the surrounding elements contain a high share of math symbols
+
+        Args:
+            index: index of the current element
+            elements: all elements
+            window: size of the window to inspect
+
+        Returns:
+            bool: whether the surroundings contain a high share of math symbols
+        """
+        math_symbols = r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡∀∃∄⊂⊃∈∉θΘαβγδ\[\]\{\}]'
+
+        # Inspect `window` elements on each side
+        start = max(0, index - window)
+        end = min(len(elements), index + window + 1)
+
+        math_symbol_count = 0
+        total_text = ""
+
+        for i in range(start, end):
+            if i == index:
+                continue  # Skip the current element
+
+            if isinstance(elements[i], (Text, NarrativeText, Title)):
+                text = str(elements[i])
+                total_text += text
+                math_symbol_count += len(re.findall(math_symbols, text))
+
+        # Compute the density of math symbols
+        if total_text:
+            density = math_symbol_count / len(total_text)
+            return density > 0.05  # A density above 5% is treated as mathematical content
+
+        return False
+
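+    # Illustrative sketch (hypothetical numbers): with a window of 3 elements on
+    # each side whose concatenated text is 200 characters long and contains 15
+    # matches of math_symbols, the density is 15 / 200 = 0.075 > 0.05, so the
+    # element is treated as sitting inside mathematical content.
+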
+    def _extract_formulas(self, elements) -> List[Formula]:
+        """Extract formulas from the document"""
+        formulas = []
+        formula_pattern = r'^\s*\((\d+)\)\s*'
+
+        # Indices of elements that look like titles but are actually formulas
+        formula_title_indices = set()
+
+        # First pass: identify formulas that were misread as titles
+        for i, element in enumerate(elements):
+            if isinstance(element, Title):
+                title_text = str(element).strip()
+
+                # Check whether it matches a math-expression pattern
+                math_expr_patterns = [
+                    r'^(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+[a-zA-Z\u0370-\u03FF\u0400-\u04FF θΘ]+$',
+                    r'^E\s*\(',  # Expectation expressions
+                    r'^∑|∏|∫|∂|∇|∆',  # Starts with a math symbol
+                ]
+
+                is_math_expr = any(re.search(pattern, title_text) for pattern in math_expr_patterns)
+
+                if is_math_expr:
+                    # Decide whether it really is a title
+                    # 1. Check the length of the following elements
+                    next_is_short = False
+                    for j in range(i+1, min(i+3, len(elements))):
+                        if isinstance(elements[j], (Text, NarrativeText)) and len(str(elements[j])) < 50:
+                            next_is_short = True
+                            break
+
+                    # 2. Check whether the surroundings contain math symbols
+                    surrounding_has_math = self._surrounding_has_math_symbols(i, elements)
+
+                    if next_is_short or surrounding_has_math:
+                        formula_title_indices.add(i)
+
+        # Second pass: extract all formulas, including those misidentified as titles
+        for i, element in enumerate(elements):
+            element_text = str(element).strip()
+            is_formula = False
+            formula_id = ""
+
+            # Handle formulas misidentified as titles
+            if i in formula_title_indices:
+                is_formula = True
+                formula_id = f"Formula-{len(formulas)+1}"
+            else:
+                # Regular formula recognition, same as before
+                formula_match = re.match(formula_pattern, element_text)
+
+                if formula_match:
+                    formula_id = f"({formula_match.group(1)})"
+                    # Remove the formula number
+                    element_text = re.sub(formula_pattern, '', element_text)
+                    is_formula = True
+
+            if is_formula:
+                # Check whether the following elements should be merged (e.g. when the title is "max θ",
+                # the next element is usually the rest of the formula)
+                merged_content = element_text
+                j = i + 1
+                while j < min(i+3, len(elements)):
+                    next_elem = elements[j]
+                    next_text = str(next_elem).strip()
+
+                    # A short next element containing math symbols is probably part of the formula
+                    if len(next_text) < 50 and re.search(r'[+\-*/=<>≤≥≈≠∑∏∫∂√∞∝∇≡]', next_text):
+                        merged_content += " " + next_text
+                        j += 1
+                    else:
+                        break
+
+                formulas.append(Formula(
+                    id=formula_id,
+                    content=merged_content,
+                    position=i
+                ))
+
+        return formulas
+
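+    # Illustrative sketch (hypothetical parse): a PDF line misread as the title
+    # "max θ", followed by a short element "= E[R(θ)]", is flagged in the first
+    # pass and merged in the second, yielding roughly
+    # Formula(id="Formula-1", content="max θ = E[R(θ)]").
+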
+    def _extract_references(self, elements) -> List[Reference]:
+        """Extract the references from the document"""
+        references = []
+
+        # First, find the references section
+        ref_section_start = -1
+        for i, element in enumerate(elements):
+            if isinstance(element, Title) and re.search(self.PAPER_SECTION_PATTERNS['references'], str(element), re.IGNORECASE):
+                ref_section_start = i
+                break
+
+        if ref_section_start == -1:
+            # No explicit references section found; try the end of the document,
+            # since references usually sit in the last 20%
+            start_pos = int(len(elements) * 0.8)
+            for i in range(start_pos, len(elements)):
+                element_text = str(elements[i]).strip()
+                # Common reference formatting traits
+                if re.match(r'^\[\d+\]|\(\d+\)|^\d+\.\s+[A-Z]', element_text):
+                    ref_section_start = i
+                    break
+
+        if ref_section_start != -1:
+            # Extract the reference list
+            current_ref = None
+            inside_ref = False  # Whether we are inside a reference entry
+
+            for i in range(ref_section_start + 1, len(elements)):
+                element = elements[i]
+
+                # Ignore title elements - they may be misidentified parts of the references section
+                if isinstance(element, Title):
+                    # Check whether this title really ends the references section
+                    title_text = str(element).lower().strip()
+                    if re.search(r'^(appendix|appendices|supplementary|acknowledgements?|附录|致谢)$', title_text):
+                        # The next major section has started; stop extracting references
+                        break
+
+                    # For titles that may belong to a reference, merge their content into the current entry
+                    if current_ref and inside_ref:
+                        current_ref.text += " " + str(element)
+                    continue
+
+                element_text = str(element).strip()
+                if not element_text:
+                    continue
+
+                # Check whether this is a new reference entry
+                ref_start_match = re.match(r'^\[(\d+)\]|\((\d+)\)|^(\d+)\.\s+', element_text)
+
+                if ref_start_match:
+                    # Save the previous reference, if any
+                    if current_ref and current_ref.text:
+                        references.append(current_ref)
+                        inside_ref = False
+
+                    # Extract the citation ID
+                    ref_id = ""
+                    if ref_start_match.group(1):  # "[1]" format
+                        ref_id = f"[{ref_start_match.group(1)}]"
+                        # Remove the ID prefix
+                        element_text = re.sub(r'^\[\d+\]\s*', '', element_text)
+                    elif ref_start_match.group(2):  # "(1)" format
+                        ref_id = f"({ref_start_match.group(2)})"
+                        # Remove the ID prefix
+                        element_text = re.sub(r'^\(\d+\)\s*', '', element_text)
+                    elif ref_start_match.group(3):  # "1." format
+                        ref_id = f"{ref_start_match.group(3)}."
+                        # Remove the ID prefix
+                        element_text = re.sub(r'^\d+\.\s+', '', element_text)
+
+                    # Create a new reference
+                    current_ref = Reference(id=ref_id, text=element_text)
+                    inside_ref = True
+
+                    # Try to extract authors and year
+                    author_year_match = re.match(r'^([^,]+),\s*(?:\()?(\d{4})(?:\))?', element_text)
+                    if author_year_match:
+                        authors_text = author_year_match.group(1).strip()
+                        # Try to split multiple authors
+                        authors = [a.strip() for a in re.split(r',|and|&|;|、|等', authors_text) if a.strip()]
+                        current_ref.authors = authors
+                        current_ref.year = author_year_match.group(2)
+
+                elif current_ref and inside_ref:
+                    # Continue the current reference
+                    current_ref.text += " " + element_text
+
+            # Add the last reference
+            if current_ref and current_ref.text:
+                references.append(current_ref)
+
+        return references
+
+    def _extract_full_text(self, elements) -> str:
+        """Extract the full text of the document"""
+        text_parts = []
+
+        for element in elements:
+            if isinstance(element, (Text, NarrativeText, Title, ListItem, Table)):
+                text = str(element).strip()
+                if text:
+                    text_parts.append(self._cleanup_text(text))
+
+        return self.config.paragraph_separator.join(text_parts)
+
+    def generate_markdown(self, paper: StructuredPaper) -> str:
+        """Convert the structured paper data to Markdown
+
+        Args:
+            paper: structured paper data object
+
+        Returns:
+            str: the complete paper as Markdown text
+        """
+        md_parts = []
+
+        # Title and author information
+        md_parts.append(f"# {paper.metadata.title}\n")
+
+        if paper.metadata.authors:
+            authors_str = ", ".join(paper.metadata.authors)
+            md_parts.append(f"**作者:** {authors_str}\n")
+
+        # Publication information
+        pub_info = []
+        if hasattr(paper.metadata, 'journal') and paper.metadata.journal:
+            pub_info.append(paper.metadata.journal)
+        if hasattr(paper.metadata, 'publication_date') and paper.metadata.publication_date:
+            pub_info.append(paper.metadata.publication_date)
+        elif hasattr(paper.metadata, 'date') and paper.metadata.date:
+            pub_info.append(paper.metadata.date)
+        elif hasattr(paper.metadata, 'year') and paper.metadata.year:
+            pub_info.append(paper.metadata.year)
+
+        if pub_info:
+            md_parts.append(f"**发表信息:** {', '.join(pub_info)}\n")
+
+        # DOI and URL
+        if hasattr(paper.metadata, 'doi') and paper.metadata.doi:
+            md_parts.append(f"**DOI:** {paper.metadata.doi}\n")
+        if hasattr(paper.metadata, 'url') and paper.metadata.url:
+            md_parts.append(f"**URL:** {paper.metadata.url}\n")
+
+        # Abstract
+        abstract_section = next((s for s in paper.sections if s.section_type == 'abstract'), None)
+        if abstract_section:
+            md_parts.append(f"## 摘要\n\n{abstract_section.content}\n")
+        elif hasattr(paper.metadata, 'abstract') and paper.metadata.abstract:
+            md_parts.append(f"## 摘要\n\n{paper.metadata.abstract}\n")
+
+        # Keywords
+        if paper.keywords:
+            md_parts.append(f"**关键词:** {', '.join(paper.keywords)}\n")
+
+        # Section content
+        md_parts.append(self._format_sections_markdown(paper.sections))
+
+        # Figures and tables
+        if paper.figures:
+            md_parts.append("## 图\n")
+            for fig in paper.figures:
+                md_parts.append(f"### {fig.id}: {fig.caption}\n\n{fig.content}\n")
+
+        if paper.tables:
+            md_parts.append("## 表\n")
+            for table in paper.tables:
+                md_parts.append(f"### {table.id}: {table.caption}\n\n{table.content}\n")
+
+        # Formulas
+        if paper.formulas:
+            md_parts.append("## 公式\n")
+            for formula in paper.formulas:
+                # Wrap the formula content in a code block instead of emitting it as a heading
+                formatted_content = self._format_formula_content(formula.content)
+                md_parts.append(f"**{formula.id}**\n\n```math\n{formatted_content}\n```\n")
+
+        # References
+        if paper.references:
+            md_parts.append("## 参考文献\n")
+            for ref in paper.references:
+                md_parts.append(f"{ref.id} {ref.text}\n")
+
+        return "\n".join(md_parts)
+
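+    # Minimal usage sketch (illustrative; the path is hypothetical):
+    #   >>> extractor = PaperStructureExtractor()
+    #   >>> paper = extractor.extract_paper_structure("paper.pdf")
+    #   >>> Path("paper.md").write_text(extractor.generate_markdown(paper), encoding="utf-8")
+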
"""递归格式化章节内容为Markdown + + Args: + sections: 章节列表 + level: 当前章节级别 + + Returns: + str: 格式化后的Markdown文本 + """ + if not sections: + return "" + + md_parts = [] + for section in sections: + # 计算标题级别(注意Markdown最多支持6级标题) + header_level = min(section.level + 2, 6) # +2是因为文章标题是h1,摘要是h2 + header_marks = '#' * header_level + + # 忽略已经作为摘要处理的部分 + if level == 0 and section.section_type == 'abstract': + continue + + # 添加章节标题和内容 + md_parts.append(f"{header_marks} {section.title}\n") + if section.content: + md_parts.append(f"{section.content}\n") + + # 递归处理子章节 + if section.subsections: + md_parts.append(self._format_sections_markdown( + section.subsections, level + 1)) + + return "\n".join(md_parts) + + def _format_formula_content(self, content: str) -> str: + """ + 格式化公式内容,确保不会被误解为Markdown语法 + + Args: + content: 原始公式内容 + + Returns: + str: 格式化后的公式内容 + """ + # 移除可能导致Markdown格式错误的前缀 + content = re.sub(r'^#+\s*', '', content) + + # 清理(cid:X)这样的特殊字符序列,这些通常是PDF解析错误 + content = re.sub(r'\(cid:\d+\)', '', content) + + # 将多行公式合并成单行(如果需要) + content = re.sub(r'\s*\n\s*', ' ', content) + + # 如果公式包含"max"、"min"等关键字,确保它们不被分割 + # 这里特别处理类似"max θ"的情况 + content = re.sub(r'(max|min|sup|inf|lim|arg\s*max|arg\s*min)\s+([a-zA-Z\u0370-\u03FF\u0400-\u04FF]+)', r'\1_{\2}', content) + + return content.strip() + + def _is_in_references_section(self, index: int, elements) -> bool: + """判断元素是否位于参考文献部分 + + Args: + index: 当前元素索引 + elements: 所有元素列表 + + Returns: + bool: 是否在参考文献部分 + """ + # 方法1:查找前面是否有明确的参考文献标题 + for i in range(index-1, max(0, index-100), -1): + if isinstance(elements[i], Title): + title_text = str(elements[i]).lower().strip() + if re.search(r'^(references|bibliography|参考文献|引用|文献)(\s|$)', title_text): + return True + # 检查编号形式 + if re.match(r'^\d+\.\s*(references|bibliography|参考文献)', title_text): + return True + + # 方法2:基于位置启发式(通常参考文献在论文末尾) + if index > len(elements) * 0.75: # 如果在文档后四分之一 + # 搜索前后文本是否包含参考文献特征 + ref_features = 0 + window = 20 # 查看周围20个元素 + + start = max(0, index - window) + end = min(len(elements), index + window) + + for i in range(start, end): + if i == index: + continue + + text = str(elements[i]).lower() + + # 检查参考文献特征 + if re.search(r'\[\d+\]|\(\d{4}\)|et\s+al\.', text): + ref_features += 1 + if re.search(r'proceedings|journal|conference|transactions|vol\.|pp\.', text): + ref_features += 1 + if re.search(r'doi:|arxiv:|https?://|ieee|acm|springer', text): + ref_features += 1 + + # 如果周围文本具有足够的参考文献特征 + if ref_features >= 5: + return True + + return False + + +def main(): + """主函数:演示用法""" + # 创建提取器 + extractor = PaperStructureExtractor() + + # 使用示例 + try: + # 替换为实际的文件路径 + sample_file = '/Users/boyin.liu/Documents/示例文档/论文/3.pdf' + if Path(sample_file).exists(): + paper = extractor.extract_paper_structure(sample_file) + + print("\n===== 论文结构化信息 =====") + print(f"标题: {paper.metadata.title}") + print(f"作者: {', '.join(paper.metadata.authors)}") + + print("\n--- 章节结构 ---") + for i, section in enumerate(paper.sections): + print(f"{i+1}. 
+    def _is_in_references_section(self, index: int, elements) -> bool:
+        """Determine whether an element sits inside the references section
+
+        Args:
+            index: index of the current element
+            elements: list of all elements
+
+        Returns:
+            bool: whether the element is in the references section
+        """
+        # Method 1: look backwards for an explicit references heading
+        for i in range(index-1, max(0, index-100), -1):
+            if isinstance(elements[i], Title):
+                title_text = str(elements[i]).lower().strip()
+                if re.search(r'^(references|bibliography|参考文献|引用|文献)(\s|$)', title_text):
+                    return True
+                # Check the numbered form as well
+                if re.match(r'^\d+\.\s*(references|bibliography|参考文献)', title_text):
+                    return True
+
+        # Method 2: position-based heuristic (references usually sit at the end of a paper)
+        if index > len(elements) * 0.75:  # In the last quarter of the document
+            # Search the surrounding text for reference-like traits
+            ref_features = 0
+            window = 20  # Look at the 20 elements around the current one
+
+            start = max(0, index - window)
+            end = min(len(elements), index + window)
+
+            for i in range(start, end):
+                if i == index:
+                    continue
+
+                text = str(elements[i]).lower()
+
+                # Check for reference-like traits
+                if re.search(r'\[\d+\]|\(\d{4}\)|et\s+al\.', text):
+                    ref_features += 1
+                if re.search(r'proceedings|journal|conference|transactions|vol\.|pp\.', text):
+                    ref_features += 1
+                if re.search(r'doi:|arxiv:|https?://|ieee|acm|springer', text):
+                    ref_features += 1
+
+            # Enough reference-like traits in the surrounding text
+            if ref_features >= 5:
+                return True
+
+        return False
+
+
+def main():
+    """Entry point: usage demo"""
+    # Create the extractor
+    extractor = PaperStructureExtractor()
+
+    # Usage example
+    try:
+        # Replace with a real file path
+        sample_file = '/Users/boyin.liu/Documents/示例文档/论文/3.pdf'
+        if Path(sample_file).exists():
+            paper = extractor.extract_paper_structure(sample_file)
+
+            print("\n===== 论文结构化信息 =====")
+            print(f"标题: {paper.metadata.title}")
+            print(f"作者: {', '.join(paper.metadata.authors)}")
+
+            print("\n--- 章节结构 ---")
+            for i, section in enumerate(paper.sections):
+                print(f"{i+1}. {section.title} ({section.section_type})")
+                if section.subsections:
+                    for j, subsection in enumerate(section.subsections):
+                        print(f"  {i+1}.{j+1} {subsection.title}")
+
+            print("\n--- 图表 ---")
+            print(f"图: {len(paper.figures)}")
+            for i, fig in enumerate(paper.figures[:3]):
+                print(f"图 {i+1}: {fig.caption[:50]}...")
+
+            print(f"\n表: {len(paper.tables)}")
+            for i, table in enumerate(paper.tables[:3]):
+                print(f"表 {i+1}: {table.caption[:50]}...")
+
+            print(f"\n--- 公式: {len(paper.formulas)} ---")
+            for i, formula in enumerate(paper.formulas[:3]):
+                print(f"公式 {formula.id}: {formula.content[:30]}...")
+
+            print(f"\n--- 参考文献: {len(paper.references)} ---")
+            for i, ref in enumerate(paper.references[:5]):
+                print(f"{ref.id} {ref.text[:50]}...")
+
+            print("\n--- 摘要 ---")
+            abstract_section = next((s for s in paper.sections if s.section_type == 'abstract'), None)
+            if abstract_section:
+                print(abstract_section.content[:200] + "...")
+            else:
+                print(paper.metadata.abstract[:200] + "...")
+
+        else:
+            print(f"示例文件 {sample_file} 不存在")
+
+        print("\n支持的格式:", extractor.get_supported_formats())
+
+    except Exception as e:
+        print(f"错误: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py
new file mode 100644
index 00000000..78c48eec
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_md.py
@@ -0,0 +1,86 @@
+from pathlib import Path
+from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import PaperStructureExtractor
+
+def extract_and_save_as_markdown(paper_path, output_path=None):
+    """
+    Extract the paper structure and save it as Markdown
+
+    Args:
+        paper_path: path of the paper file
+        output_path: path of the output Markdown file; if omitted, the input
+            file name is reused with a .md extension
+
+    Returns:
+        The path of the saved Markdown file
+    """
+    # Create the extractor
+    extractor = PaperStructureExtractor()
+
+    # Resolve the file path
+    paper_path = Path(paper_path)
+
+    # If no output path is given, reuse the input name with a .md extension
+    if output_path is None:
+        output_path = paper_path.with_suffix('.md')
+    else:
+        output_path = Path(output_path)
+
+    # Make sure the output directory exists
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    print(f"正在处理论文: {paper_path}")
+
+    try:
+        # Extract the paper structure
+        paper = extractor.extract_paper_structure(paper_path)
+
+        # Generate the Markdown content
+        markdown_content = extractor.generate_markdown(paper)
+
+        # Save it to the file
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(markdown_content)
+
+        print(f"已成功保存Markdown文件: {output_path}")
+
+        # Print summary information
+        print("\n论文摘要信息:")
+        print(f"标题: {paper.metadata.title}")
+        print(f"作者: {', '.join(paper.metadata.authors)}")
+        print(f"关键词: {', '.join(paper.keywords)}")
+        print(f"章节数: {len(paper.sections)}")
+        print(f"图表数: {len(paper.figures)}")
+        print(f"表格数: {len(paper.tables)}")
+        print(f"公式数: {len(paper.formulas)}")
+        print(f"参考文献数: {len(paper.references)}")
+
+        return output_path
+
+    except Exception as e:
+        print(f"处理论文时出错: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+# Usage example
+if __name__ == "__main__":
+    # Replace with a real paper file path
+    sample_paper = "crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf"
+
+    # You can specify an output path, or rely on the default
+    # output_file = "/path/to/output/paper_structure.md"
+    # extract_and_save_as_markdown(sample_paper, output_file)
+
+    # Use the default output path (same name as the input, with a .md extension)
+    extract_and_save_as_markdown(sample_paper)
+
+    # # Example of batch-processing several papers
+    # paper_dir = Path("/path/to/papers/folder")
+    # output_dir = Path("/path/to/output/folder")
+    #
+    # # Make sure the output directory exists
+    # output_dir.mkdir(parents=True, exist_ok=True)
+    #
+    # # Process every PDF file in the directory
+    # for paper_file in paper_dir.glob("*.pdf"):
+    #     output_file = output_dir / f"{paper_file.stem}.md"
+    #     extract_and_save_as_markdown(paper_file, output_file)
\ No newline at end of file
diff --git a/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py
new file mode 100644
index 00000000..7b39696b
--- /dev/null
+++ b/crazy_functions/doc_fns/read_fns/unstructured_all/unstructured_reader.py
@@ -0,0 +1,275 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional, Set, Dict, Union, List
+from dataclasses import dataclass, field
+import logging
+import os
+
+from unstructured.partition.auto import partition
+from unstructured.documents.elements import (
+    Text, Title, NarrativeText, ListItem, Table,
+    Footer, Header, PageBreak, Image, Address
+)
+
+
+@dataclass
+class TextExtractorConfig:
+    """Configuration class for the generic document extractor
+
+    Attributes:
+        extract_headers_footers: whether to extract headers and footers
+        extract_tables: whether to extract table content
+        extract_lists: whether to extract list content
+        extract_titles: whether to extract titles
+        paragraph_separator: separator placed between paragraphs
+        text_cleanup: dictionary of text-cleanup options
+    """
+    extract_headers_footers: bool = False
+    extract_tables: bool = True
+    extract_lists: bool = True
+    extract_titles: bool = True
+    paragraph_separator: str = '\n\n'
+    text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+        'remove_extra_spaces': True,
+        'normalize_whitespace': True,
+        'remove_special_chars': False,
+        'lowercase': False
+    })
+
+
+class UnstructuredTextExtractor:
+    """Generic document text extractor
+
+    Uses the unstructured library to extract text from many document formats,
+    behind a single interface with configurable options.
+    """
+
+    SUPPORTED_EXTENSIONS: Set[str] = {
+        # Document formats
+        '.pdf', '.docx', '.doc', '.txt',
+        # Presentations
+        '.ppt', '.pptx',
+        # Spreadsheets
+        '.xlsx', '.xls', '.csv',
+        # Images
+        '.png', '.jpg', '.jpeg', '.tiff',
+        # E-mail
+        '.eml', '.msg', '.p7s',
+        # Markdown
+        ".md",
+        # Org Mode
+        ".org",
+        # Open Office
+        ".odt",
+        # reStructured Text
+        ".rst",
+        # Rich Text
+        ".rtf",
+        # TSV
+        ".tsv",
+        # EPUB
+        '.epub',
+        # Other formats
+        '.html', '.xml', '.json',
+    }
+
+    def __init__(self, config: Optional[TextExtractorConfig] = None):
+        """Initialize the extractor
+
+        Args:
+            config: extractor configuration object; the defaults are used when None
+        """
+        self.config = config or TextExtractorConfig()
+        self._setup_logging()
+
+    def _setup_logging(self) -> None:
+        """Configure the logger"""
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        self.logger = logging.getLogger(__name__)
+
+        # Add a file handler
+        fh = logging.FileHandler('text_extractor.log')
+        fh.setLevel(logging.ERROR)
+        self.logger.addHandler(fh)
+
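+    # Minimal configuration sketch (illustrative; shows the dataclass options only):
+    #   >>> config = TextExtractorConfig(extract_headers_footers=False,
+    #   ...                              extract_tables=True)
+    #   >>> extractor = UnstructuredTextExtractor(config)
+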
" + f"Supported: {', '.join(sorted(self.SUPPORTED_EXTENSIONS))}" + ) + + return path + + def _cleanup_text(self, text: str) -> str: + """清理文本 + + Args: + text: 原始文本 + + Returns: + str: 清理后的文本 + """ + if self.config.text_cleanup['remove_extra_spaces']: + text = ' '.join(text.split()) + + if self.config.text_cleanup['normalize_whitespace']: + text = text.replace('\t', ' ').replace('\r', '\n') + + if self.config.text_cleanup['lowercase']: + text = text.lower() + + return text.strip() + + def _should_extract_element(self, element) -> bool: + """判断是否应该提取某个元素 + + Args: + element: 文档元素 + + Returns: + bool: 是否应该提取 + """ + if isinstance(element, (Text, NarrativeText)): + return True + + if isinstance(element, Title) and self.config.extract_titles: + return True + + if isinstance(element, ListItem) and self.config.extract_lists: + return True + + if isinstance(element, Table) and self.config.extract_tables: + return True + + if isinstance(element, (Header, Footer)) and self.config.extract_headers_footers: + return True + + return False + + @staticmethod + def get_supported_formats() -> List[str]: + """获取支持的文件格式列表""" + return sorted(UnstructuredTextExtractor.SUPPORTED_EXTENSIONS) + + def extract_text( + self, + file_path: Union[str, Path], + strategy: str = "fast" + ) -> str: + """提取文本 + + Args: + file_path: 文件路径 + strategy: 提取策略 ("fast" 或 "accurate") + + Returns: + str: 提取的文本内容 + + Raises: + Exception: 提取过程中的错误 + """ + try: + path = self._validate_file(file_path) + self.logger.info(f"Processing: {path}") + + # 修改这里:添加 nlp=False 参数来禁用 NLTK + elements = partition( + str(path), + strategy=strategy, + include_metadata=True, + nlp=True, + ) + + # 其余代码保持不变 + text_parts = [] + for element in elements: + if self._should_extract_element(element): + text = str(element) + cleaned_text = self._cleanup_text(text) + if cleaned_text: + if isinstance(element, (Header, Footer)): + prefix = "[Header] " if isinstance(element, Header) else "[Footer] " + text_parts.append(f"{prefix}{cleaned_text}") + else: + text_parts.append(cleaned_text) + + return self.config.paragraph_separator.join(text_parts) + + except Exception as e: + self.logger.error(f"Extraction failed: {e}") + raise + + + +def main(): + """主函数:演示用法""" + # 配置 + config = TextExtractorConfig( + extract_headers_footers=True, + extract_tables=True, + extract_lists=True, + extract_titles=True, + text_cleanup={ + 'remove_extra_spaces': True, + 'normalize_whitespace': True, + 'remove_special_chars': False, + 'lowercase': False + } + ) + + # 创建提取器 + extractor = UnstructuredTextExtractor(config) + + # 使用示例 + try: + # 替换为实际的文件路径 + sample_file = './crazy_functions/doc_fns/read_fns/paper/2501.12599v1.pdf' + if Path(sample_file).exists() or True: + text = extractor.extract_text(sample_file) + print("提取的文本:") + print(text) + else: + print(f"示例文件 {sample_file} 不存在") + + print("\n支持的格式:", extractor.get_supported_formats()) + + except Exception as e: + print(f"错误: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crazy_functions/doc_fns/read_fns/web_reader.py b/crazy_functions/doc_fns/read_fns/web_reader.py new file mode 100644 index 00000000..33c78286 --- /dev/null +++ b/crazy_functions/doc_fns/read_fns/web_reader.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, Optional, Union +from urllib.parse import urlparse +import logging +import trafilatura +import requests +from pathlib import Path + + +@dataclass +class WebExtractorConfig: + """网页内容提取器配置类 + + 
+    Attributes:
+        extract_comments: whether to extract comments
+        extract_tables: whether to extract tables
+        extract_links: whether to keep link information
+        paragraph_separator: paragraph separator
+        timeout: network request timeout (seconds)
+        max_retries: maximum number of retries
+        user_agent: custom User-Agent
+        text_cleanup: text-cleanup options
+    """
+    extract_comments: bool = False
+    extract_tables: bool = True
+    extract_links: bool = False
+    paragraph_separator: str = '\n\n'
+    timeout: int = 10
+    max_retries: int = 3
+    user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+    text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
+        'remove_extra_spaces': True,
+        'normalize_whitespace': True,
+        'remove_special_chars': False,
+        'lowercase': False
+    })
+
+
+class WebTextExtractor:
+    """Web page text extractor
+
+    Uses the trafilatura library to extract the main text of a web page,
+    discarding ads, navigation, and other unrelated content.
+    """
+
+    def __init__(self, config: Optional[WebExtractorConfig] = None):
+        """Initialize the extractor
+
+        Args:
+            config: extractor configuration object; the defaults are used when None
+        """
+        self.config = config or WebExtractorConfig()
+        self._setup_logging()
+
+    def _setup_logging(self) -> None:
+        """Configure the logger"""
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        self.logger = logging.getLogger(__name__)
+
+        # Add a file handler
+        fh = logging.FileHandler('web_extractor.log')
+        fh.setLevel(logging.ERROR)
+        self.logger.addHandler(fh)
+
+    def _validate_url(self, url: str) -> bool:
+        """Check whether a URL is well-formed
+
+        Args:
+            url: web page URL
+
+        Returns:
+            bool: whether the URL is valid
+        """
+        try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except Exception:
+            return False
+
+    def _download_webpage(self, url: str) -> Optional[str]:
+        """Download a web page
+
+        Args:
+            url: web page URL
+
+        Returns:
+            Optional[str]: the page HTML
+
+        Raises:
+            Exception: raised once all retries have failed
+        """
+        headers = {'User-Agent': self.config.user_agent}
+
+        for attempt in range(self.config.max_retries):
+            try:
+                response = requests.get(
+                    url,
+                    headers=headers,
+                    timeout=self.config.timeout
+                )
+                response.raise_for_status()
+                return response.text
+            except requests.RequestException as e:
+                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt == self.config.max_retries - 1:
+                    raise Exception(f"Failed to download webpage after {self.config.max_retries} attempts: {e}")
+        return None
+
+    def _cleanup_text(self, text: str) -> str:
+        """Clean up text
+
+        Args:
+            text: raw text
+
+        Returns:
+            str: the cleaned text
+        """
+        if not text:
+            return ""
+
+        if self.config.text_cleanup['remove_extra_spaces']:
+            text = ' '.join(text.split())
+
+        if self.config.text_cleanup['normalize_whitespace']:
+            text = text.replace('\t', ' ').replace('\r', '\n')
+
+        if self.config.text_cleanup['lowercase']:
+            text = text.lower()
+
+        return text.strip()
+
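+    # Illustrative failure path (hypothetical): with max_retries=3, a flaky URL
+    # makes _download_webpage log a warning for each failed attempt and raise
+    # after the third, so extract_text never proceeds with a missing page body.
+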
+    def extract_text(self, url: str) -> str:
+        """Extract the text content of a web page
+
+        Args:
+            url: web page URL
+
+        Returns:
+            str: the extracted text content
+
+        Raises:
+            ValueError: raised when the URL is invalid
+            Exception: raised when extraction fails
+        """
+        try:
+            if not self._validate_url(url):
+                raise ValueError(f"Invalid URL: {url}")
+
+            self.logger.info(f"Processing URL: {url}")
+
+            # Download the page
+            html_content = self._download_webpage(url)
+            if not html_content:
+                raise Exception("Failed to download webpage")
+
+            # Configure the trafilatura extraction options
+            extract_config = {
+                'include_comments': self.config.extract_comments,
+                'include_tables': self.config.extract_tables,
+                'include_links': self.config.extract_links,
+                'no_fallback': False,  # Allow the fallback extractors
+            }
+
+            # Extract the text
+            extracted_text = trafilatura.extract(
+                html_content,
+                **extract_config
+            )
+
+            if not extracted_text:
+                raise Exception("No content could be extracted")
+
+            # Clean up the text
+            cleaned_text = self._cleanup_text(extracted_text)
+
+            return cleaned_text
+
+        except Exception as e:
+            self.logger.error(f"Extraction failed: {e}")
+            raise
+
+
+def main():
+    """Entry point: usage demo"""
+    # Configuration
+    config = WebExtractorConfig(
+        extract_comments=False,
+        extract_tables=True,
+        extract_links=False,
+        timeout=10,
+        text_cleanup={
+            'remove_extra_spaces': True,
+            'normalize_whitespace': True,
+            'remove_special_chars': False,
+            'lowercase': False
+        }
+    )
+
+    # Create the extractor
+    extractor = WebTextExtractor(config)
+
+    # Usage example
+    try:
+        # Replace with a real URL
+        sample_url = 'https://arxiv.org/abs/2412.00036'
+        text = extractor.extract_text(sample_url)
+        print("提取的文本:")
+        print(text)
+
+    except Exception as e:
+        print(f"错误: {e}")
+
+
+if __name__ == "__main__":
+    main()
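+
+# Illustrative batch sketch (commented out; the URLs are hypothetical), mirroring
+# the commented batch example in unstructured_md.py:
+# extractor = WebTextExtractor()
+# for url in ['https://example.com/a', 'https://example.com/b']:
+#     try:
+#         print(extractor.extract_text(url))
+#     except Exception as e:
+#         print(f"Skipped {url}: {e}")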