# gpt_academic/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py

from dataclasses import dataclass, field
from typing import Set, Dict, Pattern, Optional
import re
from enum import Enum
import logging
from functools import lru_cache


class EnvType(Enum):
    """Classification of a LaTeX environment, deciding how it is processed."""
    # Keep the environment body; most preserved environments are wrapped in
    # [BEGIN_<name>] / [END_<name>] markers by the cleaner.
    PRESERVE = "preserve"
    # Drop the environment together with its body.
    REMOVE = "remove"
    # Discard the begin/end wrapper and keep processing the body.
    EXTRACT = "extract"


def _default_preserve_envs() -> Set[str]:
    """Build a fresh set of environment names that are preserved by default."""
    math_envs = {
        'equation', 'equation*', 'align', 'align*', 'displaymath',
        'math', 'eqnarray', 'gather', 'gather*', 'multline', 'multline*',
    }
    float_envs = {
        'table', 'table*', 'tabular', 'tabularx', 'array', 'matrix',
        'figure', 'figure*', 'subfigure',
    }
    code_envs = {'algorithm', 'algorithmic', 'lstlisting'}
    theorem_envs = {
        'theorem', 'proof', 'definition', 'lemma', 'corollary',
        'proposition', 'example', 'remark',
    }
    return math_envs | float_envs | code_envs | theorem_envs


def _default_preserve_commands() -> Set[str]:
    """Build a fresh set of command names whose argument text is kept."""
    reference_cmds = {'caption', 'label', 'ref', 'cite', 'citep', 'citet', 'eqref'}
    formatting_cmds = {
        'emph', 'textbf', 'textit', 'underline', 'texttt', 'footnote',
        'section', 'subsection', 'subsubsection', 'paragraph',
    }
    math_operator_cmds = {'frac', 'sum', 'int', 'prod', 'lim', 'sup', 'inf'}
    return reference_cmds | formatting_cmds | math_operator_cmds


def _default_remove_commands() -> Set[str]:
    """Build a fresh set of command names removed together with their arguments."""
    setup_cmds = {
        'documentclass', 'usepackage', 'input', 'include', 'includeonly',
        'bibliography', 'bibliographystyle', 'frontmatter', 'mainmatter',
    }
    layout_cmds = {
        'pagestyle', 'thispagestyle', 'vspace', 'hspace', 'vfill', 'hfill',
        'newpage', 'clearpage', 'pagebreak', 'linebreak', 'newline',
        'setlength', 'setcounter', 'addtocounter', 'renewcommand',
        'newcommand', 'makeatletter', 'makeatother', 'pagenumbering',
    }
    margin_cmds = {
        'marginpar', 'marginparsep', 'columnsep', 'columnseprule',
        'twocolumn', 'onecolumn', 'minipage', 'parbox',
    }
    return setup_cmds | layout_cmds | margin_cmds


def _default_latex_chars() -> Dict[str, str]:
    """Build a fresh mapping from LaTeX sequences to their plain-text form.

    Insertion order is significant to consumers that apply the replacements
    one after another, so the pairs are listed in a fixed order.
    """
    ordered_pairs = [
        ('~', ' '),
        ('\\&', '&'),
        ('\\%', '%'),
        ('\\_', '_'),
        ('\\$', '$'),
        ('\\#', '#'),
        ('\\{', '{'),
        ('\\}', '}'),
        ('``', '"'),
        ("''", '"'),
        ('\\textbackslash', '\\'),
        ('\\ldots', '...'),
        ('\\dots', '...'),
        ('\\textasciitilde', '~'),
        ('\\textasciicircum', '^'),
        ('\\quad', ' '),
        ('\\qquad', ' '),
        ('\\,', ''),
        ('\\;', ''),
        ('\\:', ''),
        ('\\!', ''),
        ('\\space', ' '),
        ('\\noindent', ''),
    ]
    return dict(ordered_pairs)


def _default_inline_math_delimiters() -> Set[str]:
    """Build a fresh set of math delimiter tokens."""
    return {'$', '\\(', '\\)', '\\[', '\\]'}


@dataclass
class LatexConfig:
    """Settings that drive LaTeX cleaning.

    Every field has its own freshly built default, so instances never share
    mutable state; pass a keyword argument to replace a default wholesale.
    """

    # Environment names whose content is preserved.
    preserve_envs: Set[str] = field(default_factory=_default_preserve_envs)

    # Command names whose argument text is kept when the command is stripped.
    preserve_commands: Set[str] = field(default_factory=_default_preserve_commands)

    # Command names that are deleted along with their arguments.
    remove_commands: Set[str] = field(default_factory=_default_remove_commands)

    # LaTeX escape sequence -> plain-text replacement.
    latex_chars: Dict[str, str] = field(default_factory=_default_latex_chars)

    # Tokens that delimit math content.
    inline_math_delimiters: Set[str] = field(
        default_factory=_default_inline_math_delimiters
    )


class LatexCleaner:
    """Strip LaTeX markup from text while keeping its readable content.

    Cleaning happens in stages (see :meth:`clean_text`): comments are dropped,
    begin/end environments are preserved, removed or unwrapped, commands are
    stripped, and finally special characters and whitespace are normalized.
    """

    # One begin/end environment pair. The optional ``*`` lets starred variants
    # such as ``equation*`` be recognized; the backreference requires the
    # closing name to equal the opening one.
    _ENV_PATTERN = re.compile(r'\\begin{(\w+\*?)}(.*?)\\end{\1}', re.DOTALL)

    # Environments whose body is discarded entirely (compared without ``*``).
    _REMOVED_ENVS = frozenset({'verbatim', 'comment'})

    def __init__(self, config: Optional["LatexConfig"] = None):
        """Create a cleaner.

        Args:
            config: Processing rules; a default ``LatexConfig`` is used when
                omitted.
        """
        self.config = config or LatexConfig()
        self.logger = logging.getLogger(__name__)

    # Cached on a static function rather than on a bound method so the cache
    # never holds references to cleaner instances.
    @staticmethod
    @lru_cache(maxsize=128)
    def _get_env_pattern(env_name: str) -> Pattern:
        """Return a compiled pattern matching one ``env_name`` environment.

        The name is escaped, so a starred name such as ``equation*`` matches
        literally instead of ``*`` acting as a regex quantifier. Group 1 of
        the pattern is the environment body.
        """
        name = re.escape(env_name)
        return re.compile(
            r'\\begin{' + name + r'}(.*?)\\end{' + name + r'}',
            re.DOTALL,
        )

    def _get_env_type(self, env_name: str) -> "EnvType":
        """Classify an environment name, ignoring any trailing ``*``."""
        base_name = env_name.rstrip('*')
        preserved = {name.rstrip('*') for name in self.config.preserve_envs}
        if base_name in preserved:
            return EnvType.PRESERVE
        if base_name in self._REMOVED_ENVS:
            return EnvType.REMOVE
        return EnvType.EXTRACT

    def _process_environment(self, match: re.Match) -> str:
        """Return the replacement text for one matched environment.

        Args:
            match: A match of the environment pattern; group 1 is the
                environment name and group 2 its body.

        Returns:
            The body wrapped in ``[BEGIN_name]``/``[END_name]`` markers for
            preserved environments (bare for ``math``/``displaymath``), a
            single space for removed ones, or the recursively processed body
            otherwise. If processing fails, the raw body is returned.
        """
        # Read the groups before the try block so the fallback below can
        # always refer to them.
        env_name = match.group(1)
        content = match.group(2)
        try:
            env_type = self._get_env_type(env_name)

            if env_type == EnvType.PRESERVE:
                # Math wrappers carry no extra information; keep only the body.
                if env_name in {'math', 'displaymath'}:
                    return f" {content} "
                return f" [BEGIN_{env_name}] {content} [END_{env_name}] "
            if env_type == EnvType.REMOVE:
                return ' '
            # Unwrap the environment and keep processing what is inside it.
            return self._clean_nested_environments(content)
        except Exception as e:
            # Deliberately best-effort: one bad environment must not abort
            # cleaning of the whole document.
            self.logger.error("Error processing environment %s: %s", env_name, e)
            return content

    def _clean_nested_environments(self, text: str) -> str:
        """Replace every begin/end environment in ``text``.

        Unwrapped environments are processed recursively through
        :meth:`_process_environment`.
        """
        return self._ENV_PATTERN.sub(self._process_environment, text)

    def _clean_commands(self, text: str) -> str:
        """Strip LaTeX commands, keeping the text of preserved ones.

        Steps, in order:
          1. Commands in ``config.remove_commands`` are deleted together with
             their optional and braced arguments.
          2. Other commands with a braced argument are replaced by that
             argument when the command is preserved, otherwise by a space.
          3. Inline math delimiters are dropped around their content.
          4. Remaining argument-less commands are deleted, except those that
             have a replacement in ``config.latex_chars`` (they are left for
             :meth:`_normalize_text` to translate).
        """
        # Longest names first, and a lookahead that forbids a following
        # letter, so that e.g. ``include`` can never consume the prefix of
        # ``includegraphics`` or ``includeonly``.
        escaped_names = sorted(
            (re.escape(cmd) for cmd in self.config.remove_commands if cmd),
            key=len,
            reverse=True,
        )
        if escaped_names:
            removal_pattern = (
                r'\\(?:' + '|'.join(escaped_names) + r')(?![a-zA-Z])'
                r'\*?(?:\[.*?\])?(?:{.*?})*'
            )
            text = re.sub(removal_pattern, '', text)

        def handle_command(match: re.Match) -> str:
            cmd = match.group(1)
            content = match.group(2)
            keep = (
                cmd in self.config.inline_math_delimiters
                or cmd in self.config.preserve_commands
            )
            return content if keep else ' '

        # Commands that carry a braced argument.
        text = re.sub(r'\\(\w+)\*?(?:\[.*?\])?{(.*?)}', handle_command, text)

        text = self._preserve_inline_math(text)

        # Argument-less commands: delete unless a replacement is configured.
        known_sequences = self.config.latex_chars

        def strip_unknown(match: re.Match) -> str:
            token = match.group(0)
            return token if token in known_sequences else ''

        return re.sub(r'\\[a-zA-Z]+\*?(?:\[\])?', strip_unknown, text)

    def _preserve_inline_math(self, text: str) -> str:
        r"""Drop math delimiters while keeping the math content.

        Handles ``$$...$$``, ``$...$``, ``\(...\)`` and ``\[...\]``. A dollar
        sign directly preceded by a backslash is an escaped literal and is
        never treated as a delimiter. Display forms may span several lines;
        ``$...$`` must stay on one line.
        """
        # Display math first, otherwise the single-dollar rule would pair the
        # two dollars of one "$$" with each other.
        text = re.sub(r'(?<!\\)\$\$(.+?)(?<!\\)\$\$', r' \1 ', text, flags=re.DOTALL)
        text = re.sub(r'(?<!\\)\$(.+?)(?<!\\)\$', r' \1 ', text)
        # Separate patterns so an opening "\(" can only close with "\)".
        text = re.sub(r'\\\((.+?)\\\)', r' \1 ', text, flags=re.DOTALL)
        text = re.sub(r'\\\[(.+?)\\\]', r' \1 ', text, flags=re.DOTALL)
        return text

    def _normalize_text(self, text: str) -> str:
        """Apply character replacements and tidy whitespace.

        Replacements from ``config.latex_chars`` are applied one after another
        in mapping order. Whitespace runs collapse to one space, environment
        markers get exactly one space on each side, and empty ``{}``, ``[]``
        and ``()`` pairs are removed.
        """
        for sequence, replacement in self.config.latex_chars.items():
            text = text.replace(sequence, replacement)

        text = re.sub(r'\s+', ' ', text)
        # Marker names may end in "*" for starred environments.
        text = re.sub(r'\s*\[BEGIN_(\w+\*?)\]\s*', r' [BEGIN_\1] ', text)
        text = re.sub(r'\s*\[END_(\w+\*?)\]\s*', r' [END_\1] ', text)

        text = re.sub(r'{\s*}|\[\s*\]|\(\s*\)', '', text)

        return text.strip()

    def clean_text(self, text: str) -> str:
        """Clean LaTeX ``text`` while preserving meaningful content.

        Args:
            text: LaTeX source to clean.

        Returns:
            The cleaned, whitespace-normalized text.

        Raises:
            ValueError: If ``text`` is empty.
            Exception: Any error raised during cleaning is logged and
                re-raised unchanged.
        """
        if not text:
            raise ValueError("Input text cannot be empty")

        try:
            # Drop "%" comments; an escaped "\%" is a literal percent sign.
            text = re.sub(r'(?<!\\)%.*?(?=\n|$)', '', text, flags=re.MULTILINE)

            text = self._clean_nested_environments(text)

            text = self._clean_commands(text)
            return self._normalize_text(text)

        except Exception as e:
            self.logger.error(f"Error cleaning text: {e}")
            raise


def clean_latex_commands(text: str) -> str:
    """Clean ``text`` using a small, fixed preset instead of the full defaults.

    The preset preserves only the ``equation`` and ``theorem`` environments
    and the ``textbf``, ``emph`` and ``label`` commands, and replaces only
    ``~`` and the escaped ampersand; the remaining settings keep their
    ``LatexConfig`` defaults.

    Raises:
        ValueError: If ``text`` is empty.
    """
    preset = {
        'preserve_envs': {'equation', 'theorem'},
        'preserve_commands': {'textbf', 'emph', "label"},
        'latex_chars': {'~': ' ', '\\&': '&'},
    }
    cleaner = LatexCleaner(LatexConfig(**preset))
    return cleaner.clean_text(text)


# Example usage:
if __name__ == "__main__":
    # Demo 1: the convenience wrapper on a snippet that mixes a formatted
    # word, inline math and a display equation.
    sample = r"""
    \textbf{Important} result: $E=mc^2$ and
    \begin{equation}
    F = ma
    \end{equation}
    """
    print(clean_latex_commands(sample))

    # Demo 2: an explicitly configured cleaner.
    custom_cleaner = LatexCleaner(
        LatexConfig(
            preserve_envs={'equation', 'theorem'},
            preserve_commands={'textbf', 'emph'},
            latex_chars={'~': ' ', '\\&': '&'},
        )
    )
    print(custom_cleaner.clean_text(r"\textbf{Custom} cleaning"))