# gpt_academic/crazy_functions/doc_fns/read_fns/web_reader.py

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
import trafilatura


@dataclass
class WebExtractorConfig:
"""网页内容提取器配置类
Attributes:
extract_comments: 是否提取评论
extract_tables: 是否提取表格
extract_links: 是否保留链接信息
paragraph_separator: 段落分隔符
timeout: 网络请求超时时间(秒)
max_retries: 最大重试次数
user_agent: 自定义User-Agent
text_cleanup: 文本清理选项
"""
extract_comments: bool = False
extract_tables: bool = True
extract_links: bool = False
paragraph_separator: str = '\n\n'
timeout: int = 10
max_retries: int = 3
user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
'remove_extra_spaces': True,
'normalize_whitespace': True,
'remove_special_chars': False,
'lowercase': False
})
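
# A minimal sketch of building a custom configuration (field names as defined
# above; the particular values here are illustrative only):
#
#     config = WebExtractorConfig(
#         extract_links=True,
#         timeout=20,
#         text_cleanup={
#             'remove_extra_spaces': True,
#             'normalize_whitespace': True,
#             'remove_special_chars': False,
#             'lowercase': True,
#         },
#     )
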
class WebTextExtractor:
"""网页文本内容提取器
使用trafilatura库提取网页中的主要文本内容去除广告、导航等无关内容。
"""
def __init__(self, config: Optional[WebExtractorConfig] = None):
"""初始化提取器
Args:
config: 提取器配置对象如果为None则使用默认配置
"""
self.config = config or WebExtractorConfig()
self._setup_logging()

    def _setup_logging(self) -> None:
"""配置日志记录器"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
self.logger = logging.getLogger(__name__)
        # Also write errors to a dedicated log file
fh = logging.FileHandler('web_extractor.log')
fh.setLevel(logging.ERROR)
self.logger.addHandler(fh)
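
    # Note: logging.basicConfig() above configures the root logger process-wide.
    # A more self-contained sketch would attach handlers to this logger only:
    #
    #     handler = logging.StreamHandler()
    #     handler.setFormatter(logging.Formatter(
    #         '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    #     self.logger.addHandler(handler)
    #     self.logger.setLevel(logging.INFO)
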
def _validate_url(self, url: str) -> bool:
"""验证URL格式是否有效
Args:
url: 网页URL
Returns:
bool: URL是否有效
"""
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except Exception:
return False
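
    # Expected behavior of the check above (a quick sketch):
    #     _validate_url('https://example.com/page')  -> True
    #     _validate_url('example.com')               -> False (no scheme)
    #     _validate_url('https://')                  -> False (no netloc)
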
def _download_webpage(self, url: str) -> Optional[str]:
"""下载网页内容
Args:
url: 网页URL
Returns:
Optional[str]: 网页HTML内容失败返回None
Raises:
Exception: 下载失败时抛出异常
"""
headers = {'User-Agent': self.config.user_agent}
for attempt in range(self.config.max_retries):
try:
response = requests.get(
url,
headers=headers,
timeout=self.config.timeout
)
response.raise_for_status()
return response.text
except requests.RequestException as e:
self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
if attempt == self.config.max_retries - 1:
raise Exception(f"Failed to download webpage after {self.config.max_retries} attempts: {e}")
return None
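
    # The retry loop above retries immediately. If rate limiting is a concern,
    # an exponential backoff could be inserted before the next attempt; a sketch:
    #
    #     import time
    #     time.sleep(min(2 ** attempt, 10))  # 1s, 2s, 4s, ... capped at 10s
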
def _cleanup_text(self, text: str) -> str:
"""清理文本
Args:
text: 原始文本
Returns:
str: 清理后的文本
"""
if not text:
return ""
if self.config.text_cleanup['remove_extra_spaces']:
text = ' '.join(text.split())
if self.config.text_cleanup['normalize_whitespace']:
text = text.replace('\t', ' ').replace('\r', '\n')
if self.config.text_cleanup['lowercase']:
text = text.lower()
return text.strip()
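
    # A quick sketch of the cleanup under the default options:
    #     input:  "Hello\t  world\r\n\r\nSecond   paragraph"
    #     output: "Hello world\n\nSecond paragraph"
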
def extract_text(self, url: str) -> str:
"""提取网页文本内容
Args:
url: 网页URL
Returns:
str: 提取的文本内容
Raises:
ValueError: URL无效时抛出
Exception: 提取失败时抛出
"""
try:
if not self._validate_url(url):
raise ValueError(f"Invalid URL: {url}")
self.logger.info(f"Processing URL: {url}")
            # Download the page
html_content = self._download_webpage(url)
if not html_content:
raise Exception("Failed to download webpage")
            # Configure trafilatura extraction options
extract_config = {
'include_comments': self.config.extract_comments,
'include_tables': self.config.extract_tables,
'include_links': self.config.extract_links,
                'no_fallback': False,  # allow trafilatura's fallback extractors
}
            # Extract the main text
extracted_text = trafilatura.extract(
html_content,
**extract_config
)
if not extracted_text:
raise Exception("No content could be extracted")
            # Clean up the extracted text
cleaned_text = self._cleanup_text(extracted_text)
return cleaned_text
except Exception as e:
self.logger.error(f"Extraction failed: {e}")
raise
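

# If page metadata (title, author, date) is also wanted, trafilatura exposes a
# richer entry point; a sketch, assuming a recent trafilatura version:
#
#     meta = trafilatura.bare_extraction(html_content)
#     # returns a document object with fields such as title, author, and date

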
def main():
"""主函数:演示用法"""
# 配置
config = WebExtractorConfig(
extract_comments=False,
extract_tables=True,
extract_links=False,
timeout=10,
text_cleanup={
'remove_extra_spaces': True,
'normalize_whitespace': True,
'remove_special_chars': False,
'lowercase': False
}
)
    # Create the extractor
extractor = WebTextExtractor(config)
    # Example usage
try:
        # Replace with an actual URL
sample_url = 'https://arxiv.org/abs/2412.00036'
text = extractor.extract_text(sample_url)
print("提取的文本:")
print(text)
except Exception as e:
print(f"错误: {e}")
if __name__ == "__main__":
main()