from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
import trafilatura


@dataclass
class WebExtractorConfig:
    """Configuration for the web content extractor.

    Attributes:
        extract_comments: Whether to extract comments.
        extract_tables: Whether to extract tables.
        extract_links: Whether to keep link information.
        paragraph_separator: Separator inserted between paragraphs.
        timeout: Network request timeout in seconds.
        max_retries: Maximum number of retry attempts.
        user_agent: Custom User-Agent header.
        text_cleanup: Text cleanup options.
    """
    extract_comments: bool = False
    extract_tables: bool = True
    extract_links: bool = False
    paragraph_separator: str = '\n\n'
    timeout: int = 10
    max_retries: int = 3
    user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
        'remove_extra_spaces': True,
        'normalize_whitespace': True,
        'remove_special_chars': False,
        'lowercase': False
    })
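
# A minimal usage sketch (illustrative, not part of the original module):
# override only the fields you need and keep the defaults for the rest.
#
#     config = WebExtractorConfig(timeout=30, extract_links=True)
#     config.text_cleanup['lowercase'] = True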


class WebTextExtractor:
    """Extracts the main text content from a web page.

    Uses the trafilatura library to pull out the page's primary text while
    discarding ads, navigation, and other irrelevant content.
    """

    def __init__(self, config: Optional[WebExtractorConfig] = None):
        """Initialize the extractor.

        Args:
            config: Extractor configuration; if None, defaults are used.
        """
        self.config = config or WebExtractorConfig()
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure the logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Also send ERROR-level records to a log file.
        fh = logging.FileHandler('web_extractor.log')
        fh.setLevel(logging.ERROR)
        self.logger.addHandler(fh)
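
        # Note: logging.basicConfig configures the root logger only once per
        # process; the file handler above, however, is added to this module's
        # logger on every call, so constructing many extractors would attach
        # duplicate handlers and repeat log lines.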

    def _validate_url(self, url: str) -> bool:
        """Check whether a URL is well formed.

        Args:
            url: The web page URL.

        Returns:
            bool: True if the URL has both a scheme and a network location.
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
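
    # Expected behavior (illustrative examples, not from the original source):
    #   _validate_url('https://example.com/page') -> True   (scheme and netloc set)
    #   _validate_url('example.com')              -> False  (urlparse treats the
    #                                                        bare host as a path)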

    def _download_webpage(self, url: str) -> Optional[str]:
        """Download the page content.

        Args:
            url: The web page URL.

        Returns:
            Optional[str]: The page HTML, or None on failure.

        Raises:
            Exception: If all download attempts fail.
        """
        headers = {'User-Agent': self.config.user_agent}

        for attempt in range(self.config.max_retries):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    timeout=self.config.timeout
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt == self.config.max_retries - 1:
                    raise Exception(
                        f"Failed to download webpage after "
                        f"{self.config.max_retries} attempts: {e}"
                    ) from e
        return None  # Reached only if max_retries is 0.
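
    # Retries above fire immediately. A common refinement (an assumption, not
    # in the original) is exponential backoff between attempts, e.g.:
    #
    #     import time
    #     time.sleep(2 ** attempt)   # wait 1s, 2s, 4s, ... before retrying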

    def _cleanup_text(self, text: str) -> str:
        """Clean up extracted text.

        Args:
            text: The raw text.

        Returns:
            str: The cleaned text.
        """
        if not text:
            return ""

        # Normalize tabs and carriage returns first, so the space-collapsing
        # step below sees only plain spaces and newlines.
        if self.config.text_cleanup.get('normalize_whitespace'):
            text = text.replace('\t', ' ').replace('\r', '\n')

        # Collapse runs of whitespace within each line while preserving the
        # line breaks that separate paragraphs.
        if self.config.text_cleanup.get('remove_extra_spaces'):
            text = '\n'.join(' '.join(line.split()) for line in text.split('\n'))

        # Drop everything except word characters and whitespace.
        if self.config.text_cleanup.get('remove_special_chars'):
            text = re.sub(r'[^\w\s]', '', text)

        if self.config.text_cleanup.get('lowercase'):
            text = text.lower()

        return text.strip()
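
    # Illustrative input/output for _cleanup_text with the default flags
    # (an assumption, not a test from the original source):
    #   '  Hello\tworld \n Next  para  '  ->  'Hello world\nNext para'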

    def extract_text(self, url: str) -> str:
        """Extract the text content of a web page.

        Args:
            url: The web page URL.

        Returns:
            str: The extracted text.

        Raises:
            ValueError: If the URL is invalid.
            Exception: If extraction fails.
        """
        try:
            if not self._validate_url(url):
                raise ValueError(f"Invalid URL: {url}")

            self.logger.info(f"Processing URL: {url}")

            # Download the page.
            html_content = self._download_webpage(url)
            if not html_content:
                raise Exception("Failed to download webpage")

            # Configure the trafilatura extraction options.
            extract_config = {
                'include_comments': self.config.extract_comments,
                'include_tables': self.config.extract_tables,
                'include_links': self.config.extract_links,
                'no_fallback': False,  # Allow the fallback extractors.
            }

            # Extract the main text.
            extracted_text = trafilatura.extract(
                html_content,
                **extract_config
            )

            if not extracted_text:
                raise Exception("No content could be extracted")

            # Clean up and return the text.
            return self._cleanup_text(extracted_text)

        except Exception as e:
            self.logger.error(f"Extraction failed: {e}")
            raise
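

# A convenience wrapper sketch (an assumption, not part of the original API):
# extract several URLs in one pass, mapping each URL to its text, or to None
# if extraction failed for that page.
def extract_many(urls, config: Optional[WebExtractorConfig] = None) -> Dict[str, Optional[str]]:
    extractor = WebTextExtractor(config)
    results: Dict[str, Optional[str]] = {}
    for url in urls:
        try:
            results[url] = extractor.extract_text(url)
        except Exception as e:
            logging.getLogger(__name__).warning(f"Skipping {url}: {e}")
            results[url] = None
    return results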


def main():
    """Entry point: demonstrate usage."""
    # Configuration.
    config = WebExtractorConfig(
        extract_comments=False,
        extract_tables=True,
        extract_links=False,
        timeout=10,
        text_cleanup={
            'remove_extra_spaces': True,
            'normalize_whitespace': True,
            'remove_special_chars': False,
            'lowercase': False
        }
    )

    # Create the extractor.
    extractor = WebTextExtractor(config)

    # Usage example.
    try:
        # Replace with a real target URL.
        sample_url = 'https://arxiv.org/abs/2412.00036'
        text = extractor.extract_text(sample_url)
        print("Extracted text:")
        print(text)

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()