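"""Sci-Hub PDF downloader.

Fetches a paper's PDF by DOI: probes a list of Sci-Hub mirrors for a
reachable one, extracts the PDF link from the returned page, downloads
the file (optionally through a local SOCKS5 proxy), and validates the
bytes with PyPDF2 before saving.

Third-party dependencies: requests, beautifulsoup4, loguru, PyPDF2
(plus the requests[socks] extra when the SOCKS proxy is enabled).
"""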
import io
import time
from pathlib import Path

import requests
import PyPDF2
from bs4 import BeautifulSoup
from loguru import logger


class SciHub:
    # Updated mirror list, including more currently reachable mirrors
    MIRRORS = [
        'https://sci-hub.se/',
        'https://sci-hub.st/',
        'https://sci-hub.ru/',
        'https://sci-hub.wf/',
        'https://sci-hub.ee/',
        'https://sci-hub.ren/',
        'https://sci-hub.tf/',
        'https://sci-hub.si/',
        'https://sci-hub.do/',
        'https://sci-hub.hkvisa.net/',
        'https://sci-hub.mksa.top/',
        'https://sci-hub.shop/',
        'https://sci-hub.yncjkj.com/',
        'https://sci-hub.41610.org/',
        'https://sci-hub.automic.us/',
        'https://sci-hub.et-fine.com/',
        'https://sci-hub.pooh.mu/',
        'https://sci-hub.bban.top/',
        'https://sci-hub.usualwant.com/',
        'https://sci-hub.unblockit.kim/'
    ]

    def __init__(self, doi: str, path: Path, url=None, timeout=60, use_proxy=True):
        self.timeout = timeout
        self.path = path
        self.doi = str(doi)
        self.use_proxy = use_proxy
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': self.doi
        }
        self.url = url if url else self.MIRRORS[0]
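        # Note: the socks5h:// scheme resolves DNS through the proxy and
        # requires the PySocks extra (pip install "requests[socks]").
        # localhost:10880 is whatever local SOCKS5 proxy you happen to run.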
        self.proxies = {
            "http": "socks5h://localhost:10880",
            "https": "socks5h://localhost:10880",
        } if use_proxy else None

    def _test_proxy_connection(self):
        """Check whether the proxy connection is usable."""
        if not self.use_proxy:
            return True

        try:
            # Probe the proxy with a lightweight request
            test_response = requests.get(
                'https://httpbin.org/ip',
                proxies=self.proxies,
                timeout=10
            )
            if test_response.status_code == 200:
                logger.info("Proxy connection test succeeded")
                return True
        except Exception as e:
            logger.warning(f"Proxy connection test failed: {str(e)}")
        return False

    def _check_pdf_validity(self, content):
        """Check whether the downloaded bytes form a valid PDF."""
        try:
            # Use PyPDF2 to verify the PDF can be opened and read
            pdf = PyPDF2.PdfReader(io.BytesIO(content))
            return len(pdf.pages) > 0
        except Exception as e:
            logger.error(f"Invalid PDF file: {str(e)}")
            return False

    def _send_request(self):
        """Send the request to a Sci-Hub mirror and return the response."""
        # Test the proxy connection first
        if self.use_proxy and not self._test_proxy_connection():
            logger.warning("Proxy unavailable, falling back to a direct connection")
            self.use_proxy = False
            self.proxies = None

        last_exception = None
        working_mirrors = []

        # Probe which mirrors are reachable
        logger.info("Testing mirror availability...")
        for mirror in self.MIRRORS:
            try:
                test_response = requests.get(
                    mirror,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=10
                )
                if test_response.status_code == 200:
                    working_mirrors.append(mirror)
                    logger.info(f"Mirror {mirror} is available")
                    if len(working_mirrors) >= 5:  # Five working mirrors are enough
                        break
            except Exception as e:
                logger.debug(f"Mirror {mirror} is unavailable: {str(e)}")
                continue

        if not working_mirrors:
            raise Exception("No available mirror found")

        logger.info(f"Found {len(working_mirrors)} available mirrors, starting download attempts...")

        # Try to download from the available mirrors
        for mirror in working_mirrors:
            try:
                res = requests.post(
                    mirror,
                    headers=self.headers,
                    data=self.payload,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if res.ok:
                    logger.info(f"Using mirror: {mirror}")
                    self.url = mirror  # Remember the mirror that worked
                    time.sleep(1)  # Short pause to avoid hammering the mirror
                    return res
            except Exception as e:
                logger.error(f"Mirror {mirror} failed: {str(e)}")
                last_exception = e
                continue

        if last_exception:
            raise last_exception
        raise Exception("All available mirrors failed to serve the request")

    def _extract_url(self, response):
        """Extract the PDF download link from the mirror's response."""
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # Try several selectors to locate the PDF link
            pdf_element = soup.find(id='pdf')
            if pdf_element:
                content_url = pdf_element.get('src')
            else:
                # Fall back to the first iframe
                pdf_element = soup.find('iframe')
                if pdf_element:
                    content_url = pdf_element.get('src')
                else:
                    # Look for direct links to a PDF
                    pdf_links = soup.find_all('a', href=lambda x: x and '.pdf' in x)
                    if pdf_links:
                        content_url = pdf_links[0].get('href')
                    else:
                        raise AttributeError("No PDF link found")

            if content_url:
                content_url = content_url.replace('#navpanes=0&view=FitH', '')
                if not content_url.endswith('.pdf') and 'pdf' not in content_url.lower():
                    raise AttributeError("The extracted link is not a PDF file")
        except AttributeError:
            logger.error(f"Paper {self.doi} not found")
            return None

        # Normalize the link: mirrors return protocol-relative ('//host/...'),
        # site-relative ('/downloads/...'), or absolute URLs.
        current_mirror = self.url.rstrip('/')
        if content_url.startswith('//'):
            return 'https:' + content_url
        elif content_url.startswith('/'):
            return current_mirror + content_url
        elif content_url.startswith('http'):
            return content_url
        else:
            return current_mirror + '/' + content_url

    def _download_pdf(self, pdf_url):
        """Download the PDF and verify its integrity."""
        try:
            # Try several download strategies
            download_methods = [
                # Strategy 1: plain GET
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout),
                # Strategy 2: add a Referer header
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': self.url}),
                # Strategy 3: use the file's own host as the Referer
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': pdf_url.split('/downloads')[0] if '/downloads' in pdf_url else self.url})
            ]

            for i, download_method in enumerate(download_methods):
                try:
                    logger.info(f"Trying download strategy {i + 1}/3...")
                    response = download_method()
                    if response.status_code == 200:
                        content = response.content
                        # Reject suspiciously small files and invalid PDFs
                        if len(content) > 1000 and self._check_pdf_validity(content):
                            logger.info(f"PDF downloaded successfully, size: {len(content)} bytes")
                            return content
                        else:
                            logger.warning("The downloaded file may not be a valid PDF")
                    elif response.status_code == 403:
                        logger.warning("Access denied (403 Forbidden), trying the next strategy")
                        continue
                    else:
                        logger.warning(f"Download failed, status code: {response.status_code}")
                        continue
                except Exception as e:
                    logger.warning(f"Download strategy {i + 1} failed: {str(e)}")
                    continue

            # If every strategy failed, try rebuilding the URL on other mirrors
            try:
                logger.info("Trying alternative mirrors for the download...")
                # Extract the file part from the original URL
                if '/downloads/' in pdf_url:
                    file_part = pdf_url.split('/downloads/')[-1]
                    alternative_mirrors = [
                        f"https://sci-hub.se/downloads/{file_part}",
                        f"https://sci-hub.st/downloads/{file_part}",
                        f"https://sci-hub.ru/downloads/{file_part}",
                        f"https://sci-hub.wf/downloads/{file_part}",
                        f"https://sci-hub.ee/downloads/{file_part}",
                        f"https://sci-hub.ren/downloads/{file_part}",
                        f"https://sci-hub.tf/downloads/{file_part}"
                    ]

                    for alt_url in alternative_mirrors:
                        try:
                            response = requests.get(
                                alt_url,
                                proxies=self.proxies,
                                timeout=self.timeout,
                                headers={**self.headers, 'Referer': alt_url.split('/downloads')[0]}
                            )
                            if response.status_code == 200:
                                content = response.content
                                if len(content) > 1000 and self._check_pdf_validity(content):
                                    logger.info(f"Downloaded via alternative mirror: {alt_url}")
                                    return content
                        except Exception as e:
                            logger.debug(f"Alternative mirror {alt_url} failed: {str(e)}")
                            continue

            except Exception as e:
                logger.error(f"All download strategies failed: {str(e)}")

            return None

        except Exception as e:
            logger.error(f"Failed to download PDF: {str(e)}")
            return None

    def fetch(self):
        """Fetch the paper's PDF, with retries and validation."""
        for attempt in range(3):  # Up to three attempts
            try:
                logger.info(f"Attempt {attempt + 1} to download paper: {self.doi}")

                # Get the PDF download link
                response = self._send_request()
                pdf_url = self._extract_url(response)
                if pdf_url is None:
                    logger.warning(f"Attempt {attempt + 1}: no PDF download link found")
                    continue

                logger.info(f"Found PDF download link: {pdf_url}")

                # Download and validate the PDF
                pdf_content = self._download_pdf(pdf_url)
                if pdf_content is None:
                    logger.warning(f"Attempt {attempt + 1}: PDF download failed")
                    continue

                # Save the PDF file
                pdf_name = f"{self.doi.replace('/', '_').replace(':', '_')}.pdf"
                pdf_path = self.path.joinpath(pdf_name)
                pdf_path.write_bytes(pdf_content)

                logger.info(f"Successfully downloaded paper: {pdf_name}, size: {len(pdf_content)} bytes")
                return str(pdf_path)

            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:  # Not the final attempt
                    wait_time = (attempt + 1) * 3  # Back off progressively
                    logger.info(f"Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                    continue

        raise Exception(f"Failed to download paper {self.doi}; all retries exhausted")

# Usage Example
if __name__ == '__main__':
    # Create a directory to save the PDFs
    save_path = Path('./downloaded_papers')
    save_path.mkdir(exist_ok=True)

    # Sample DOI
    sample_doi = '10.3897/rio.7.e67379'

    try:
        # Initialize the SciHub downloader, trying proxy mode first
        logger.info("Trying proxy mode...")
        downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=True)

        # Start the download
        result = downloader.fetch()
        print(f"Paper saved to: {result}")

    except Exception as e:
        print(f"Proxy mode failed: {str(e)}")
        try:
            # If proxy mode fails, fall back to a direct connection
            logger.info("Trying direct connection mode...")
            downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=False)
            result = downloader.fetch()
            print(f"Paper saved to: {result}")
        except Exception as e2:
            print(f"Direct connection mode also failed: {str(e2)}")
            print("Check your network connection or try another DOI")