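"""Sci-Hub PDF downloader.

Fetches a paper's PDF by DOI: probes a list of Sci-Hub mirrors for a
reachable one, extracts the PDF link from the returned page, downloads
the file (optionally through a local SOCKS5 proxy), and validates the
bytes with PyPDF2 before saving.

Third-party dependencies: requests, beautifulsoup4, loguru, PyPDF2
(plus the requests[socks] extra when the SOCKS proxy is enabled).
"""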
import io
import time
from pathlib import Path

import requests
import PyPDF2
from bs4 import BeautifulSoup
from loguru import logger


class SciHub:
    # Updated mirror list, including more currently reachable mirrors
    MIRRORS = [
        'https://sci-hub.se/',
        'https://sci-hub.st/',
        'https://sci-hub.ru/',
        'https://sci-hub.wf/',
        'https://sci-hub.ee/',
        'https://sci-hub.ren/',
        'https://sci-hub.tf/',
        'https://sci-hub.si/',
        'https://sci-hub.do/',
        'https://sci-hub.hkvisa.net/',
        'https://sci-hub.mksa.top/',
        'https://sci-hub.shop/',
        'https://sci-hub.yncjkj.com/',
        'https://sci-hub.41610.org/',
        'https://sci-hub.automic.us/',
        'https://sci-hub.et-fine.com/',
        'https://sci-hub.pooh.mu/',
        'https://sci-hub.bban.top/',
        'https://sci-hub.usualwant.com/',
        'https://sci-hub.unblockit.kim/'
    ]

    def __init__(self, doi: str, path: Path, url=None, timeout=60, use_proxy=True):
        self.timeout = timeout
        self.path = path
        self.doi = str(doi)
        self.use_proxy = use_proxy
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': self.doi
        }
        self.url = url if url else self.MIRRORS[0]
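        # Note: the socks5h:// scheme resolves DNS through the proxy and
        # requires the PySocks extra (pip install "requests[socks]").
        # localhost:10880 is whatever local SOCKS5 proxy you happen to run.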
        self.proxies = {
            "http": "socks5h://localhost:10880",
            "https": "socks5h://localhost:10880",
        } if use_proxy else None

    def _test_proxy_connection(self):
        """Check whether the proxy connection is usable."""
        if not self.use_proxy:
            return True

        try:
            # Probe the proxy with a lightweight request
            test_response = requests.get(
                'https://httpbin.org/ip',
                proxies=self.proxies,
                timeout=10
            )
            if test_response.status_code == 200:
                logger.info("Proxy connection test succeeded")
                return True
        except Exception as e:
            logger.warning(f"Proxy connection test failed: {str(e)}")
        return False

    def _check_pdf_validity(self, content):
        """Check whether the downloaded bytes form a valid PDF."""
        try:
            # Use PyPDF2 to verify the PDF can be opened and read
            pdf = PyPDF2.PdfReader(io.BytesIO(content))
            return len(pdf.pages) > 0
        except Exception as e:
            logger.error(f"Invalid PDF file: {str(e)}")
            return False

    def _send_request(self):
        """Send the request to a Sci-Hub mirror and return the response."""
        # Test the proxy connection first
        if self.use_proxy and not self._test_proxy_connection():
            logger.warning("Proxy unavailable, falling back to a direct connection")
            self.use_proxy = False
            self.proxies = None

        last_exception = None
        working_mirrors = []

        # Probe which mirrors are reachable
        logger.info("Testing mirror availability...")
        for mirror in self.MIRRORS:
            try:
                test_response = requests.get(
                    mirror,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=10
                )
                if test_response.status_code == 200:
                    working_mirrors.append(mirror)
                    logger.info(f"Mirror {mirror} is available")
                    if len(working_mirrors) >= 5:  # Five working mirrors are enough
                        break
            except Exception as e:
                logger.debug(f"Mirror {mirror} is unavailable: {str(e)}")
                continue

        if not working_mirrors:
            raise Exception("No available mirror found")

        logger.info(f"Found {len(working_mirrors)} available mirrors, starting download attempts...")

        # Try to download from the available mirrors
        for mirror in working_mirrors:
            try:
                res = requests.post(
                    mirror,
                    headers=self.headers,
                    data=self.payload,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if res.ok:
                    logger.info(f"Using mirror: {mirror}")
                    self.url = mirror  # Remember the mirror that worked
                    time.sleep(1)  # Short pause to avoid hammering the mirror
                    return res
            except Exception as e:
                logger.error(f"Mirror {mirror} failed: {str(e)}")
                last_exception = e
                continue

        if last_exception:
            raise last_exception
        raise Exception("All available mirrors failed to serve the request")

    def _extract_url(self, response):
        """Extract the PDF download link from the mirror's response."""
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # Try several selectors to locate the PDF link
            pdf_element = soup.find(id='pdf')
            if pdf_element:
                content_url = pdf_element.get('src')
            else:
                # Fall back to the first iframe
                pdf_element = soup.find('iframe')
                if pdf_element:
                    content_url = pdf_element.get('src')
                else:
                    # Look for direct links to a PDF
                    pdf_links = soup.find_all('a', href=lambda x: x and '.pdf' in x)
                    if pdf_links:
                        content_url = pdf_links[0].get('href')
                    else:
                        raise AttributeError("No PDF link found")

            if content_url:
                content_url = content_url.replace('#navpanes=0&view=FitH', '')
                if not content_url.endswith('.pdf') and 'pdf' not in content_url.lower():
                    raise AttributeError("The extracted link is not a PDF file")
        except AttributeError:
            logger.error(f"Paper {self.doi} not found")
            return None

        # Normalize the link: mirrors return protocol-relative ('//host/...'),
        # site-relative ('/downloads/...'), or absolute URLs.
        current_mirror = self.url.rstrip('/')
        if content_url.startswith('//'):
            return 'https:' + content_url
        elif content_url.startswith('/'):
            return current_mirror + content_url
        elif content_url.startswith('http'):
            return content_url
        else:
            return current_mirror + '/' + content_url

    def _download_pdf(self, pdf_url):
        """Download the PDF and verify its integrity."""
        try:
            # Try several download strategies
            download_methods = [
                # Strategy 1: plain GET
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout),
                # Strategy 2: add a Referer header
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': self.url}),
                # Strategy 3: use the file's own host as the Referer
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': pdf_url.split('/downloads')[0] if '/downloads' in pdf_url else self.url})
            ]

            for i, download_method in enumerate(download_methods):
                try:
                    logger.info(f"Trying download strategy {i + 1}/3...")
                    response = download_method()
                    if response.status_code == 200:
                        content = response.content
                        # Reject suspiciously small files and invalid PDFs
                        if len(content) > 1000 and self._check_pdf_validity(content):
                            logger.info(f"PDF downloaded successfully, size: {len(content)} bytes")
                            return content
                        else:
                            logger.warning("The downloaded file may not be a valid PDF")
                    elif response.status_code == 403:
                        logger.warning("Access denied (403 Forbidden), trying the next strategy")
                        continue
                    else:
                        logger.warning(f"Download failed, status code: {response.status_code}")
                        continue
                except Exception as e:
                    logger.warning(f"Download strategy {i + 1} failed: {str(e)}")
                    continue

            # If every strategy failed, try rebuilding the URL on other mirrors
            try:
                logger.info("Trying alternative mirrors for the download...")
                # Extract the file part from the original URL
                if '/downloads/' in pdf_url:
                    file_part = pdf_url.split('/downloads/')[-1]
                    alternative_mirrors = [
                        f"https://sci-hub.se/downloads/{file_part}",
                        f"https://sci-hub.st/downloads/{file_part}",
                        f"https://sci-hub.ru/downloads/{file_part}",
                        f"https://sci-hub.wf/downloads/{file_part}",
                        f"https://sci-hub.ee/downloads/{file_part}",
                        f"https://sci-hub.ren/downloads/{file_part}",
                        f"https://sci-hub.tf/downloads/{file_part}"
                    ]

                    for alt_url in alternative_mirrors:
                        try:
                            response = requests.get(
                                alt_url,
                                proxies=self.proxies,
                                timeout=self.timeout,
                                headers={**self.headers, 'Referer': alt_url.split('/downloads')[0]}
                            )
                            if response.status_code == 200:
                                content = response.content
                                if len(content) > 1000 and self._check_pdf_validity(content):
                                    logger.info(f"Downloaded via alternative mirror: {alt_url}")
                                    return content
                        except Exception as e:
                            logger.debug(f"Alternative mirror {alt_url} failed: {str(e)}")
                            continue

            except Exception as e:
                logger.error(f"All download strategies failed: {str(e)}")

            return None

        except Exception as e:
            logger.error(f"Failed to download PDF: {str(e)}")
            return None

    def fetch(self):
        """Fetch the paper's PDF, with retries and validation."""
        for attempt in range(3):  # Up to three attempts
            try:
                logger.info(f"Attempt {attempt + 1} to download paper: {self.doi}")

                # Get the PDF download link
                response = self._send_request()
                pdf_url = self._extract_url(response)
                if pdf_url is None:
                    logger.warning(f"Attempt {attempt + 1}: no PDF download link found")
                    continue

                logger.info(f"Found PDF download link: {pdf_url}")

                # Download and validate the PDF
                pdf_content = self._download_pdf(pdf_url)
                if pdf_content is None:
                    logger.warning(f"Attempt {attempt + 1}: PDF download failed")
                    continue

                # Save the PDF file
                pdf_name = f"{self.doi.replace('/', '_').replace(':', '_')}.pdf"
                pdf_path = self.path.joinpath(pdf_name)
                pdf_path.write_bytes(pdf_content)

                logger.info(f"Successfully downloaded paper: {pdf_name}, size: {len(pdf_content)} bytes")
                return str(pdf_path)

            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:  # Not the final attempt
                    wait_time = (attempt + 1) * 3  # Back off progressively
                    logger.info(f"Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                    continue

        raise Exception(f"Failed to download paper {self.doi}; all retries exhausted")

# Usage Example
if __name__ == '__main__':
    # Create a directory to save the PDFs
    save_path = Path('./downloaded_papers')
    save_path.mkdir(exist_ok=True)

    # Sample DOI
    sample_doi = '10.3897/rio.7.e67379'

    try:
        # Initialize the SciHub downloader, trying proxy mode first
        logger.info("Trying proxy mode...")
        downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=True)

        # Start the download
        result = downloader.fetch()
        print(f"Paper saved to: {result}")

    except Exception as e:
        print(f"Proxy mode failed: {str(e)}")
        try:
            # If proxy mode fails, fall back to a direct connection
            logger.info("Trying direct connection mode...")
            downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=False)
            result = downloader.fetch()
            print(f"Paper saved to: {result}")
        except Exception as e2:
            print(f"Direct connection mode also failed: {str(e2)}")
            print("Check your network connection or try another DOI")