This commit is contained in:
lbykkkk
2024-11-23 17:59:17 +08:00
parent ea4cd95645
commit 724940a9d8
5 changed files with 134 additions and 1733 deletions

View File

@@ -5,75 +5,28 @@ This module provides functionality for parsing and extracting structured informa
including metadata, document structure, and content. It uses modular design and clean architecture principles.
"""
import re
from abc import ABC, abstractmethod
from enum import Enum
import logging
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from copy import deepcopy
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, SectionLevel, EnhancedSectionExtractor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from enum import Enum
import logging
from copy import deepcopy
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
logger = logging.getLogger(__name__)
class SectionLevel(Enum):
    """Depth of a LaTeX sectioning command in the document outline.

    Member names, lower-cased, are the LaTeX command names; the integer
    values order the levels so that a smaller value is a shallower heading.
    """

    CHAPTER = 0        # \chapter
    SECTION = 1        # \section
    SUBSECTION = 2     # \subsection
    SUBSUBSECTION = 3  # \subsubsection
    PARAGRAPH = 4      # \paragraph
    SUBPARAGRAPH = 5   # \subparagraph
@dataclass
class Section:
    """One heading of the document outline together with its nested headings."""

    # Depth of this heading (a SectionLevel member).
    level: 'SectionLevel'
    # Heading text.
    title: str
    # Text that belongs directly to this heading.
    content: str = ''
    # Child sections nested under this heading.
    subsections: List['Section'] = field(default_factory=list)

    def merge(self, other: 'Section') -> 'Section':
        """Return a new section that combines this section with ``other``.

        Neither input is modified: the result starts as a deep copy of
        ``self`` and receives deep copies of anything taken from ``other``.
        The two ``content`` strings are joined by ``_merge_content``. Each
        subsection of ``other`` whose title matches a subsection already
        present in ``self`` is merged recursively into the first such
        subsection; all remaining subsections are appended in order.

        Args:
            other: Section with the same title and level as this one.

        Returns:
            The merged section.

        Raises:
            ValueError: If the titles or levels differ. The same check is
                applied recursively to same-titled subsections.
        """
        if self.title != other.title or self.level != other.level:
            raise ValueError("Can only merge sections with same title and level")

        merged = deepcopy(self)
        merged.content = self._merge_content(self.content, other.content)

        # Map each title to the index of its first occurrence so matching
        # subsections are found in O(1) instead of rescanning the list.
        # The map is a snapshot of the pre-existing subsections only:
        # subsections appended below are deliberately not added to it.
        first_index = {}
        for position, existing in enumerate(merged.subsections):
            first_index.setdefault(existing.title, position)

        for incoming in other.subsections:
            position = first_index.get(incoming.title)
            if position is None:
                merged.subsections.append(deepcopy(incoming))
            else:
                merged.subsections[position] = merged.subsections[position].merge(incoming)

        return merged

    @staticmethod
    def _merge_content(content1: str, content2: str) -> str:
        """Join two content strings, skipping whichever one is empty.

        Returns the other string unchanged when one is empty; otherwise
        returns both separated by a blank line.
        """
        if not content1:
            return content2
        if not content2:
            return content1
        return f"{content1}\n\n{content2}"
def read_tex_file(file_path):
    """Read a text file, trying a list of encodings until one decodes it.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents. ``None`` is returned only if every
        encoding fails, which cannot happen while ``latin1`` is in the list.

    Raises:
        OSError: If the file cannot be opened (not caught here).
    """
    # latin1 maps every possible byte to a character, so it never raises
    # UnicodeDecodeError. It has to be the last resort: placed earlier it
    # would hide every encoding listed after it.
    encodings = ['utf-8', 'gbk', 'gb2312', 'ascii', 'latin1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    return None
@dataclass
class DocumentStructure:
@@ -114,7 +67,7 @@ class DocumentStructure:
if other_section.title in sections_map:
# Merge existing section
idx = next(i for i, s in enumerate(merged.toc)
if s.title == other_section.title)
if s.title == other_section.title)
merged.toc[idx] = merged.toc[idx].merge(other_section)
else:
# Add new section
@@ -132,11 +85,69 @@ class DocumentStructure:
# Combine non-empty abstracts with a separator
return f"{abstract1}\n\n{abstract2}"
def generate_toc_tree(self, indent_char: str = " ", abstract_preview_length: int = 0) -> str:
    """
    Generate a tree-like string representation of the table of contents including abstract.

    Args:
        indent_char: String repeated once per nesting level in front of each
            section title, and used once in front of each abstract line.
            Default is a single space.
        abstract_preview_length: Maximum number of abstract characters to
            show before the preview is cut and "..." is appended. Default
            is 0, in which case a non-empty abstract is shown as "..." only.

    Returns:
        str: A formatted string showing the hierarchical document structure with abstract
    """
    max_line_width = 80  # wrap width for the abstract preview

    def _format_section(section: "Section", level: int = 0) -> str:
        # One line for this section, followed by its subsections one level deeper.
        current_line = f"{indent_char * level} {section.title}\n"
        nested = "".join(_format_section(subsec, level + 1)
                         for subsec in section.subsections)
        return current_line + nested

    result = []

    # Add document title if it exists
    if self.title:
        result.append(f"{self.title}\n")

    # Add abstract if it exists
    if self.abstract:
        result.append("\n□ Abstract:")
        abstract_preview = self.abstract[:abstract_preview_length]
        if len(self.abstract) > abstract_preview_length:
            abstract_preview += "..."

        # Greedy word wrap: keep adding words while the line fits.
        wrapped_lines = []
        current_line = ""
        for word in abstract_preview.split():
            if len(current_line) + len(word) + 1 <= max_line_width:
                current_line = (current_line + " " + word).strip()
            else:
                # Only flush a line that has content; an over-long first
                # word must not produce an empty line in front of it.
                if current_line:
                    wrapped_lines.append(current_line)
                current_line = word
        if current_line:
            wrapped_lines.append(current_line)

        for line in wrapped_lines:
            result.append(f"\n{indent_char}{line}")
        result.append("\n")  # Add extra newline after abstract

    # Add table of contents header and all sections if there are any
    if self.toc:
        result.append("\n◈ Table of Contents:\n")
        result.extend(_format_section(section, 0) for section in self.toc)

    return "".join(result)
class BaseExtractor(ABC):
"""Base class for LaTeX content extractors."""
@@ -145,7 +156,6 @@ class BaseExtractor(ABC):
"""Extract specific content from LaTeX document."""
pass
class TitleExtractor(BaseExtractor):
"""Extracts title from LaTeX document."""
@@ -169,7 +179,6 @@ class TitleExtractor(BaseExtractor):
return clean_latex_commands(title)
return ''
class AbstractExtractor(BaseExtractor):
"""Extracts abstract from LaTeX document."""
@@ -193,70 +202,13 @@ class AbstractExtractor(BaseExtractor):
return clean_latex_commands(abstract)
return ''
class SectionExtractor:
    """Extracts document structure including sections and their content."""

    def __init__(self):
        # Regex source string (not a compiled pattern) matching any sectioning command.
        self.section_pattern = self._compile_section_pattern()

    def _compile_section_pattern(self) -> str:
        """Build the regex source that matches every sectioning command.

        Group 1 captures the command name (one of the lower-cased
        ``SectionLevel`` member names), group 2 the braced title. A trailing
        ``*`` and a bracketed optional argument are accepted and ignored.
        """
        section_types = '|'.join(level.name.lower() for level in SectionLevel)
        return fr'\\({section_types})\*?(?:\[.*?\])?\{{(.*?)\}}'

    def extract(self, content: str) -> List[Section]:
        """Extract sections from ``content`` and nest them by level.

        Args:
            content: LaTeX source to scan.

        Returns:
            The top-level sections; deeper sections are attached to their
            parents through ``subsections``.
        """
        sections = []
        section_stack = []
        matches = list(re.finditer(self.section_pattern, content, re.IGNORECASE))

        for i, match in enumerate(matches):
            level = SectionLevel[match.group(1).upper()]
            section_title = match.group(2)
            next_match = matches[i + 1] if i + 1 < len(matches) else None

            # Keep the body in its own variable: the match offsets refer to
            # the full document, so ``content`` must not be rebound here.
            section_body = self._extract_section_content(content, match, next_match)

            new_section = Section(
                level=level,
                title=clean_latex_commands(section_title),
                content=clean_latex_commands(section_body)
            )
            self._update_section_hierarchy(sections, section_stack, new_section)

        return sections

    def _extract_section_content(self, content: str, current_match: re.Match,
                                 next_match: Optional[re.Match]) -> str:
        """Extract content between current section and next section.

        When there is no next section the content runs to the end of the
        string. Surrounding whitespace is stripped.
        """
        start_pos = current_match.end()
        end_pos = next_match.start() if next_match else len(content)
        return content[start_pos:end_pos].strip()

    def _update_section_hierarchy(self, sections: List[Section],
                                  stack: List[Section], new_section: Section):
        """Attach ``new_section`` to its parent, or to the top-level list.

        ``stack`` holds the chain of currently open sections. Entries at the
        same or a deeper level than ``new_section`` are closed (popped); the
        remaining top of the stack, if any, becomes the parent. Both
        ``sections`` and ``stack`` are modified in place.
        """
        while stack and stack[-1].level.value >= new_section.level.value:
            stack.pop()

        if stack:
            stack[-1].subsections.append(new_section)
        else:
            sections.append(new_section)

        stack.append(new_section)
class EssayStructureParser:
"""Main class for parsing LaTeX documents."""
def __init__(self):
    """Instantiate the title, abstract and section extractors."""
    self.title_extractor = TitleExtractor()
    self.abstract_extractor = AbstractExtractor()
    # Only the enhanced extractor is kept: a plain SectionExtractor assigned
    # here first would be overwritten immediately and never used.
    self.section_extractor = EnhancedSectionExtractor()  # Using the enhanced extractor
def parse(self, content: str) -> DocumentStructure:
"""Parse LaTeX document and extract structured information."""
@@ -276,17 +228,8 @@ class EssayStructureParser:
"""Preprocess LaTeX content for parsing."""
# Remove comments
content = re.sub(r'(?<!\\)%.*$', '', content, flags=re.MULTILINE)
# # Handle input/include commands
# content = re.sub(r'\\(?:input|include){.*?}', '', content)
#
# # Normalize newlines and whitespace
# content = re.sub(r'\r\n?', '\n', content)
# content = re.sub(r'\n\s*\n', '\n', content)
return content
def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100):
"""Print document structure in a readable format."""
print(f"Title: {doc.title}\n")
@@ -306,51 +249,32 @@ def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100
for section in doc.toc:
print_section(section)
# Example usage:
if __name__ == "__main__":
# Demo script: parses LaTeX sources, merges the resulting structures and prints them.
# NOTE(review): the statements below look like two variants of the demo interleaved
# (inline sample strings vs. files under test_cache/): `main_tex` and `parser` are
# each assigned twice. Confirm which variant is intended.
# Sample main.tex
main_tex = r"""
\documentclass{article}
\title{Research Paper}
\begin{document}
\begin{abstract}
Main abstract introducing the research.
\end{abstract}
\section{Introduction}
Overview of the topic...
\section{Background}
Part 1 of background...
\end{document}
"""
# Sample background.tex
background_tex = r"""
\section{Background}
Part 2 of background...
\subsection{Related Work}
Discussion of related work...
\section{Methodology}
Research methods...
"""
# Parse both files
parser = EssayStructureParser() # Parser class defined earlier in this module
# Test with a file
file_path = 'test_cache/2411.03663/neurips_2024.tex'
# Rebinds main_tex: the inline sample above is replaced by the file contents.
main_tex = read_tex_file(file_path)
# Parse main file
parser = EssayStructureParser()
main_doc = parser.parse(main_tex)
background_doc = parser.parse(background_tex)
# Merge the inline background sample into the main document
merged_doc = main_doc.merge(background_doc)
# Merge other documents
file_path_list = [
"test_cache/2411.03663/1_intro.tex",
"test_cache/2411.03663/0_abstract.tex",
"test_cache/2411.03663/2_pre.tex",
"test_cache/2411.03663/3_method.tex",
"test_cache/2411.03663/4_experiment.tex",
"test_cache/2411.03663/5_related_work.tex",
"test_cache/2411.03663/6_conclu.tex"
]
# Fold each additional file into main_doc; main_doc is rebound on every iteration.
for file_path in file_path_list:
tex_content = read_tex_file(file_path)
additional_doc = parser.parse(tex_content)
main_doc = main_doc.merge(additional_doc)
# Example of how sections are merged:
# NOTE(review): main_doc has already been rebound by the loop above, so the
# "Original" label no longer refers to the freshly parsed main file. The [0]
# lookups raise IndexError when no section is titled "Background".
print("Original Background section content:",
[s for s in main_doc.toc if s.title == "Background"][0].content)
print("\nMerged Background section content:",
[s for s in merged_doc.toc if s.title == "Background"][0].content)
print("\nMerged structure:")
pretty_print_structure(merged_doc) # Helper defined earlier in this module
# Example of appending sections
# NOTE(review): the signature of DocumentStructure.merge is not visible here —
# confirm that it accepts a `strategy` keyword.
appended_doc = main_doc.merge(background_doc, strategy='append')
print("\nAppended structure (may have duplicate sections):")
pretty_print_structure(appended_doc)
# `tree` is assigned but not used by any later statement in this block.
tree= main_doc.generate_toc_tree()
pretty_print_structure(main_doc)