# gpt_academic/crazy_functions/rag_fns/arxiv_fns/tex_utils.py
import re
import os
import logging
from pathlib import Path
from typing import List, Tuple, Dict, Set, Optional, Callable
from crazy_functions.rag_fns.arxiv_fns.latex_patterns import LaTeXPatterns


class TexUtils:
    """TeX document processor."""

    def __init__(self):
        """Initialize the TeX processor."""
        self.logger = logging.getLogger(__name__)
        # Initialize LaTeX environment and command patterns
        self._init_patterns()
        self.latex_only_patterns = LaTeXPatterns.latex_only_patterns

    def _init_patterns(self):
        """Initialize the LaTeX pattern-matching rules."""
        # Special environment patterns
        self.special_envs = LaTeXPatterns.special_envs
        # Section patterns
        self.section_patterns = LaTeXPatterns.section_patterns
        # Include patterns
        self.include_patterns = LaTeXPatterns.include_patterns
        # Metadata patterns
        self.metadata_patterns = LaTeXPatterns.metadata_patterns

    def read_file(self, file_path: str) -> Optional[str]:
        """
        Read the contents of a TeX file, trying multiple encodings.

        Args:
            file_path: Path to the file

        Returns:
            Optional[str]: File contents, or None if every encoding fails
        """
        encodings = ['utf-8', 'latin1', 'gbk', 'gb2312', 'ascii']
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        self.logger.warning(f"Failed to read {file_path} with all encodings")
        return None
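
    # Illustrative use (hypothetical path, not part of this module): read_file
    # tries each encoding in order and only gives up after all of them raise
    # UnicodeDecodeError.
    #   utils = TexUtils()
    #   text = utils.read_file("paper/main.tex")  # None if undecodable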

    def find_main_tex_file(self, directory: str) -> Optional[str]:
        """
        Find the main TeX file in a directory.

        Args:
            directory: Directory path

        Returns:
            Optional[str]: Path to the main file, or None
        """
        tex_files = list(Path(directory).rglob("*.tex"))
        if not tex_files:
            return None
        # Search by priority: a \documentclass declaration, then a file named main.tex
        for tex_file in tex_files:
            content = self.read_file(str(tex_file))
            if content:
                if r'\documentclass' in content:
                    return str(tex_file)
                if tex_file.name.lower() == 'main.tex':
                    return str(tex_file)
        # Fall back to the largest .tex file
        return str(max(tex_files, key=lambda x: x.stat().st_size))

    def resolve_includes(self, tex_file: str, processed: Set[str] = None) -> List[str]:
        """
        Resolve include references in a TeX file.

        Args:
            tex_file: Path to the TeX file
            processed: Set of files already processed

        Returns:
            List[str]: List of related file paths
        """
        if processed is None:
            processed = set()
        if tex_file in processed:
            return []
        processed.add(tex_file)
        result = [tex_file]
        content = self.read_file(tex_file)
        if not content:
            return result
        base_dir = Path(tex_file).parent
        for pattern in self.include_patterns:
            for match in re.finditer(pattern, content):
                included_file = match.group(2)
                if not included_file.endswith('.tex'):
                    included_file += '.tex'
                full_path = str(base_dir / included_file)
                if os.path.exists(full_path) and full_path not in processed:
                    result.extend(self.resolve_includes(full_path, processed))
        return result

    def resolve_references(self, tex_file: str, path_dir: str = None) -> str:
        """
        Resolve bibliography references in a TeX file and return the contents
        of all referenced entries, keeping only the title, author and journal
        fields. If no .bib file is found in the directory of tex_file, path_dir
        is searched instead.

        Args:
            tex_file: Path to the TeX file
            path_dir: Additional search path for bibliography files

        Returns:
            str: All reference entries as one string, restricted to the
                selected fields, with a blank line between entries
        """
        all_references = []  # Collected reference entries
        content = self.read_file(tex_file)
        if not content:
            return ""
        # Patterns covering the common ways a bibliography can be referenced
        bib_patterns = [
            r'\\bibliography\{([^}]+)\}',
            r'\\addbibresource\{([^}]+)\}',
            r'\\bibliographyfile\{([^}]+)\}',
            r'\\begin\{thebibliography\}',
            r'\\bibinput\{([^}]+)\}',
            r'\\newrefsection\{([^}]+)\}'
        ]
        base_dir = Path(tex_file).parent
        found_in_tex_dir = False
        # First look for explicitly referenced .bib files next to the TeX file
        for pattern in bib_patterns:
            for match in re.finditer(pattern, content):
                if not match.groups():
                    continue
                bib_files = match.group(1).split(',')
                for bib_file in bib_files:
                    bib_file = bib_file.strip()
                    if not bib_file.endswith('.bib'):
                        bib_file += '.bib'
                    full_path = str(base_dir / bib_file)
                    if os.path.exists(full_path):
                        found_in_tex_dir = True
                        bib_content = self.read_file(full_path)
                        if bib_content:
                            processed_refs = self._process_bib_content(bib_content)
                            all_references.extend(processed_refs)
        # If no .bib file was found next to the TeX file and an extra search
        # path was provided, scan that path recursively
        if not found_in_tex_dir and path_dir:
            search_dir = Path(path_dir)
            try:
                for bib_path in search_dir.glob('**/*.bib'):
                    bib_content = self.read_file(str(bib_path))
                    if bib_content:
                        processed_refs = self._process_bib_content(bib_content)
                        all_references.extend(processed_refs)
            except Exception as e:
                self.logger.warning(f"Error searching in path_dir: {e}")
        # Join all reference entries, separated by blank lines
        return "\n\n".join(all_references)

    def _process_bib_content(self, content: str) -> List[str]:
        """
        Process the contents of a .bib file, extracting selected fields from
        each reference entry.

        Args:
            content: Contents of the .bib file

        Returns:
            List[str]: Processed reference entries
        """
        processed_refs = []
        # Match a complete reference entry
        ref_pattern = r'@\w+\{[^@]*\}'
        # Match the entry type and citation key
        entry_start_pattern = r'@(\w+)\{([^,]*?),'
        # Match a field
        field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}'
        # Find every reference entry
        for ref_match in re.finditer(ref_pattern, content, re.DOTALL):
            ref_content = ref_match.group(0)
            # Extract the entry type and citation key
            entry_match = re.match(entry_start_pattern, ref_content)
            if not entry_match:
                continue
            entry_type, cite_key = entry_match.groups()
            # Extract the fields we care about
            needed_fields = {'title': None, 'author': None, 'journal': None}
            for field_match in re.finditer(field_pattern, ref_content):
                field_name, field_value = field_match.groups()
                field_name = field_name.lower()
                if field_name in needed_fields:
                    needed_fields[field_name] = field_value.strip()
            # Rebuild a reduced reference entry
            if any(needed_fields.values()):  # At least one wanted field is present
                ref_lines = [f"@{entry_type}{{{cite_key},"]
                for field_name, field_value in needed_fields.items():
                    if field_value:
                        ref_lines.append(f"  {field_name}={{{field_value}}},")
                ref_lines[-1] = ref_lines[-1][:-1]  # Drop the trailing comma
                ref_lines.append("}")
                processed_refs.append("\n".join(ref_lines))
        return processed_refs
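
    # Illustrative input/output for _process_bib_content (made-up entry):
    #   @article{smith2020demo,
    #     title={A Demo},
    #     author={Smith, J.},
    #     journal={Demo Journal},
    #     year={2020}
    #   }
    # becomes (year is dropped; only title/author/journal survive):
    #   @article{smith2020demo,
    #     title={A Demo},
    #     author={Smith, J.},
    #     journal={Demo Journal}
    #   }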

    def _extract_inline_references(self, content: str) -> str:
        """
        Extract references written directly inside a TeX file.

        Args:
            content: Contents of the TeX file

        Returns:
            str: The extracted bibliography block, or an empty string if none is found
        """
        # Locate the bibliography environment
        bib_start = r'\\begin\{thebibliography\}'
        bib_end = r'\\end\{thebibliography\}'
        start_match = re.search(bib_start, content)
        end_match = re.search(bib_end, content)
        if start_match and end_match:
            return content[start_match.start():end_match.end()]
        return ""

    def _preprocess_content(self, content: str) -> str:
        """Preprocess TeX content."""
        # Strip comments
        content = re.sub(r'(?m)%.*$', '', content)
        # Normalize whitespace
        # content = re.sub(r'\s+', ' ', content)
        content = re.sub(r'\n\s*\n', '\n\n', content)
        return content.strip()
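
    # Minimal usage sketch (hypothetical directory layout, not part of this
    # module): locate the main file, then gather its includes and references.
    #   utils = TexUtils()
    #   main = utils.find_main_tex_file("arxiv_src/2411.00001")
    #   if main:
    #       files = utils.resolve_includes(main)   # main file plus included .tex files
    #       refs = utils.resolve_references(main)  # reduced .bib entries as one string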