from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
import trafilatura


@dataclass
class WebExtractorConfig:
    """Configuration for the web content extractor.

    Attributes:
        extract_comments: Whether to extract comments.
        extract_tables: Whether to extract tables.
        extract_links: Whether to keep link information.
        paragraph_separator: Separator inserted between paragraphs.
        timeout: Network request timeout in seconds.
        max_retries: Maximum number of retry attempts.
        user_agent: Custom User-Agent header.
        text_cleanup: Text cleanup options.
    """
    extract_comments: bool = False
    extract_tables: bool = True
    extract_links: bool = False
    paragraph_separator: str = '\n\n'
    timeout: int = 10
    max_retries: int = 3
    user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
        'remove_extra_spaces': True,
        'normalize_whitespace': True,
        'remove_special_chars': False,
        'lowercase': False
    })
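
# A minimal usage sketch (illustrative, not part of the original module):
# override only the fields you need and keep the defaults for the rest.
#
#     config = WebExtractorConfig(timeout=30, extract_links=True)
#     config.text_cleanup['lowercase'] = True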


class WebTextExtractor:
    """Extracts the main text content from a web page.

    Uses the trafilatura library to pull out the page's primary text while
    discarding ads, navigation, and other irrelevant content.
    """

    def __init__(self, config: Optional[WebExtractorConfig] = None):
        """Initialize the extractor.

        Args:
            config: Extractor configuration; if None, defaults are used.
        """
        self.config = config or WebExtractorConfig()
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure the logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Also send ERROR-level records to a log file.
        fh = logging.FileHandler('web_extractor.log')
        fh.setLevel(logging.ERROR)
        self.logger.addHandler(fh)
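
        # Note: logging.basicConfig configures the root logger only once per
        # process; the file handler above, however, is added to this module's
        # logger on every call, so constructing many extractors would attach
        # duplicate handlers and repeat log lines.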

    def _validate_url(self, url: str) -> bool:
        """Check whether a URL is well formed.

        Args:
            url: The web page URL.

        Returns:
            bool: True if the URL has both a scheme and a network location.
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
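
    # Expected behavior (illustrative examples, not from the original source):
    #   _validate_url('https://example.com/page') -> True   (scheme and netloc set)
    #   _validate_url('example.com')              -> False  (urlparse treats the
    #                                                        bare host as a path)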

    def _download_webpage(self, url: str) -> Optional[str]:
        """Download the page content.

        Args:
            url: The web page URL.

        Returns:
            Optional[str]: The page HTML, or None on failure.

        Raises:
            Exception: If all download attempts fail.
        """
        headers = {'User-Agent': self.config.user_agent}

        for attempt in range(self.config.max_retries):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    timeout=self.config.timeout
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt == self.config.max_retries - 1:
                    raise Exception(
                        f"Failed to download webpage after "
                        f"{self.config.max_retries} attempts: {e}"
                    ) from e
        return None  # Reached only if max_retries is 0.
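
    # Retries above fire immediately. A common refinement (an assumption, not
    # in the original) is exponential backoff between attempts, e.g.:
    #
    #     import time
    #     time.sleep(2 ** attempt)   # wait 1s, 2s, 4s, ... before retrying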

    def _cleanup_text(self, text: str) -> str:
        """Clean up extracted text.

        Args:
            text: The raw text.

        Returns:
            str: The cleaned text.
        """
        if not text:
            return ""

        # Normalize tabs and carriage returns first, so the space-collapsing
        # step below sees only plain spaces and newlines.
        if self.config.text_cleanup.get('normalize_whitespace'):
            text = text.replace('\t', ' ').replace('\r', '\n')

        # Collapse runs of whitespace within each line while preserving the
        # line breaks that separate paragraphs.
        if self.config.text_cleanup.get('remove_extra_spaces'):
            text = '\n'.join(' '.join(line.split()) for line in text.split('\n'))

        # Drop everything except word characters and whitespace.
        if self.config.text_cleanup.get('remove_special_chars'):
            text = re.sub(r'[^\w\s]', '', text)

        if self.config.text_cleanup.get('lowercase'):
            text = text.lower()

        return text.strip()
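
    # Illustrative input/output for _cleanup_text with the default flags
    # (an assumption, not a test from the original source):
    #   '  Hello\tworld \n Next  para  '  ->  'Hello world\nNext para'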

    def extract_text(self, url: str) -> str:
        """Extract the text content of a web page.

        Args:
            url: The web page URL.

        Returns:
            str: The extracted text.

        Raises:
            ValueError: If the URL is invalid.
            Exception: If extraction fails.
        """
        try:
            if not self._validate_url(url):
                raise ValueError(f"Invalid URL: {url}")

            self.logger.info(f"Processing URL: {url}")

            # Download the page.
            html_content = self._download_webpage(url)
            if not html_content:
                raise Exception("Failed to download webpage")

            # Configure the trafilatura extraction options.
            extract_config = {
                'include_comments': self.config.extract_comments,
                'include_tables': self.config.extract_tables,
                'include_links': self.config.extract_links,
                'no_fallback': False,  # Allow the fallback extractors.
            }

            # Extract the main text.
            extracted_text = trafilatura.extract(
                html_content,
                **extract_config
            )

            if not extracted_text:
                raise Exception("No content could be extracted")

            # Clean up and return the text.
            return self._cleanup_text(extracted_text)

        except Exception as e:
            self.logger.error(f"Extraction failed: {e}")
            raise
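

# A convenience wrapper sketch (an assumption, not part of the original API):
# extract several URLs in one pass, mapping each URL to its text, or to None
# if extraction failed for that page.
def extract_many(urls, config: Optional[WebExtractorConfig] = None) -> Dict[str, Optional[str]]:
    extractor = WebTextExtractor(config)
    results: Dict[str, Optional[str]] = {}
    for url in urls:
        try:
            results[url] = extractor.extract_text(url)
        except Exception as e:
            logging.getLogger(__name__).warning(f"Skipping {url}: {e}")
            results[url] = None
    return results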


def main():
    """Entry point: demonstrate usage."""
    # Configuration.
    config = WebExtractorConfig(
        extract_comments=False,
        extract_tables=True,
        extract_links=False,
        timeout=10,
        text_cleanup={
            'remove_extra_spaces': True,
            'normalize_whitespace': True,
            'remove_special_chars': False,
            'lowercase': False
        }
    )

    # Create the extractor.
    extractor = WebTextExtractor(config)

    # Usage example.
    try:
        # Replace with a real target URL.
        sample_url = 'https://arxiv.org/abs/2412.00036'
        text = extractor.extract_text(sample_url)
        print("Extracted text:")
        print(text)

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()