This commit is contained in:
lbykkkk
2024-12-01 17:35:57 +08:00
parent cf51d4b205
commit b3aef6b393
13 changed files with 398 additions and 234 deletions

View File

@@ -1,10 +1,12 @@
import re
import os
import logging
import os
import re
from pathlib import Path
from typing import List, Tuple, Dict, Set, Optional, Callable
from typing import List, Set, Optional
from crazy_functions.rag_fns.arxiv_fns.latex_patterns import LaTeXPatterns
class TexUtils:
"""TeX文档处理器类"""
@@ -21,9 +23,6 @@ class TexUtils:
self._init_patterns()
self.latex_only_patterns = LaTeXPatterns.latex_only_patterns
def _init_patterns(self):
"""初始化LaTeX模式匹配规则"""
# 特殊环境模式
@@ -234,6 +233,7 @@ class TexUtils:
processed_refs.append("\n".join(ref_lines))
return processed_refs
def _extract_inline_references(self, content: str) -> str:
"""
从tex文件内容中提取直接写在文件中的参考文献
@@ -255,6 +255,7 @@ class TexUtils:
return content[start_match.start():end_match.end()]
return ""
def _preprocess_content(self, content: str) -> str:
"""预处理TeX内容"""
# 移除注释
@@ -263,9 +264,3 @@ class TexUtils:
# content = re.sub(r'\s+', ' ', content)
content = re.sub(r'\n\s*\n', '\n\n', content)
return content.strip()