Files
gpt_academic/crazy_functions/review_fns/data_sources/elsevier_source.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

449 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, Optional, Dict, Union
from datetime import datetime
import aiohttp
import asyncio
from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
import json
from tqdm import tqdm
import random
class ElsevierSource(DataSource):
"""Elsevier (Scopus) API实现"""
# 定义API密钥列表
API_KEYS = [
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
]
def __init__(self, api_key: str = None):
"""初始化
Args:
api_key: Elsevier API密钥如果不提供则从预定义列表中随机选择
"""
self.api_key = api_key or random.choice(self.API_KEYS)
self._initialize()
def _initialize(self) -> None:
"""初始化基础URL和请求头"""
self.base_url = "https://api.elsevier.com/content"
self.headers = {
"X-ELS-APIKey": self.api_key,
"Accept": "application/json",
"Content-Type": "application/json",
# 添加更多必要的头部信息
"X-ELS-Insttoken": "", # 如果有机构令牌
}
async def _make_request(self, url: str, params: Dict = None) -> Optional[Dict]:
"""发送HTTP请求
Args:
url: 请求URL
params: 查询参数
Returns:
JSON响应
"""
try:
async with aiohttp.ClientSession(headers=self.headers) as session:
async with session.get(url, params=params) as response:
if response.status == 200:
return await response.json()
else:
# 添加更详细的错误信息
error_text = await response.text()
print(f"请求失败: {response.status}")
print(f"错误详情: {error_text}")
if response.status == 401:
print(f"使用的API密钥: {self.api_key}")
# 尝试切换到另一个API密钥
new_key = random.choice([k for k in self.API_KEYS if k != self.api_key])
print(f"尝试切换到新的API密钥: {new_key}")
self.api_key = new_key
self.headers["X-ELS-APIKey"] = new_key
# 重试请求
return await self._make_request(url, params)
return None
except Exception as e:
print(f"请求发生错误: {str(e)}")
return None
async def search(
self,
query: str,
limit: int = 100,
sort_by: str = "relevance",
start_year: int = None
) -> List[PaperMetadata]:
"""搜索论文"""
try:
params = {
"query": query,
"count": min(limit, 100),
"view": "STANDARD",
# 移除dc:description字段因为它在STANDARD视图中不可用
"field": "dc:title,dc:creator,prism:doi,prism:coverDate,citedby-count,prism:publicationName"
}
# 添加年份过滤
if start_year:
params["date"] = f"{start_year}-present"
# 添加排序
if sort_by == "date":
params["sort"] = "-coverDate"
elif sort_by == "cited":
params["sort"] = "-citedby-count"
# 发送搜索请求
response = await self._make_request(
f"{self.base_url}/search/scopus",
params=params
)
if not response or "search-results" not in response:
return []
# 解析搜索结果
entries = response["search-results"].get("entry", [])
papers = [paper for paper in (self._parse_entry(entry) for entry in entries) if paper is not None]
# 尝试为每篇论文获取摘要
for paper in papers:
if paper.doi:
paper.abstract = await self.fetch_abstract(paper.doi) or ""
return papers
except Exception as e:
print(f"搜索论文时发生错误: {str(e)}")
return []
def _parse_entry(self, entry: Dict) -> Optional[PaperMetadata]:
"""解析Scopus API返回的条目"""
try:
# 获取作者列表
authors = []
creator = entry.get("dc:creator")
if creator:
authors = [creator]
# 获取发表年份
year = None
if "prism:coverDate" in entry:
try:
year = int(entry["prism:coverDate"][:4])
except:
pass
# 简化venue信息
venue_info = {
'source_id': entry.get("source-id"),
'issn': entry.get("prism:issn")
}
return PaperMetadata(
title=entry.get("dc:title", ""),
authors=authors,
abstract=entry.get("dc:description", ""), # 从响应中获取摘要
year=year,
doi=entry.get("prism:doi"),
url=entry.get("prism:url"),
citations=int(entry.get("citedby-count", 0)),
venue=entry.get("prism:publicationName"),
institutions=[], # 移除机构信息
venue_type="",
venue_name=entry.get("prism:publicationName"),
venue_info=venue_info
)
except Exception as e:
print(f"解析条目时发生错误: {str(e)}")
return None
async def get_citations(self, doi: str, limit: int = 100) -> List[PaperMetadata]:
"""获取引用该论文的文献"""
try:
params = {
"query": f"REF({doi})",
"count": min(limit, 100),
"view": "STANDARD"
}
response = await self._make_request(
f"{self.base_url}/search/scopus",
params=params
)
if not response or "search-results" not in response:
return []
entries = response["search-results"].get("entry", [])
return [self._parse_entry(entry) for entry in entries]
except Exception as e:
print(f"获取引用文献时发生错误: {str(e)}")
return []
async def get_references(self, doi: str) -> List[PaperMetadata]:
"""获取该论文引用的文献"""
try:
response = await self._make_request(
f"{self.base_url}/abstract/doi/{doi}/references",
params={"view": "STANDARD"}
)
if not response or "references" not in response:
return []
references = response["references"].get("reference", [])
papers = [paper for paper in (self._parse_reference(ref) for ref in references) if paper is not None]
return papers
except Exception as e:
print(f"获取参考文献时发生错误: {str(e)}")
return []
def _parse_reference(self, ref: Dict) -> Optional[PaperMetadata]:
"""解析参考文献数据"""
try:
authors = []
if "author-list" in ref:
author_list = ref["author-list"].get("author", [])
if isinstance(author_list, list):
authors = [f"{author.get('ce:given-name', '')} {author.get('ce:surname', '')}"
for author in author_list]
else:
authors = [f"{author_list.get('ce:given-name', '')} {author_list.get('ce:surname', '')}"]
year = None
if "prism:coverDate" in ref:
try:
year = int(ref["prism:coverDate"][:4])
except:
pass
return PaperMetadata(
title=ref.get("ce:title", ""),
authors=authors,
abstract="", # 参考文献通常不包含摘要
year=year,
doi=ref.get("prism:doi"),
url=None,
citations=None,
venue=ref.get("prism:publicationName"),
institutions=[],
venue_type="unknown",
venue_name=ref.get("prism:publicationName"),
venue_info={}
)
except Exception as e:
print(f"解析参考文献时发生错误: {str(e)}")
return None
async def search_by_author(
self,
author: str,
limit: int = 100,
start_year: int = None
) -> List[PaperMetadata]:
"""按作者搜索论文"""
query = f"AUTHOR-NAME({author})"
return await self.search(query, limit=limit, start_year=start_year)
async def search_by_affiliation(
self,
affiliation: str,
limit: int = 100,
start_year: int = None
) -> List[PaperMetadata]:
"""按机构搜索论文"""
query = f"AF-ID({affiliation})"
return await self.search(query, limit=limit, start_year=start_year)
async def search_by_venue(
self,
venue: str,
limit: int = 100,
start_year: int = None
) -> List[PaperMetadata]:
"""按期刊/会议搜索论文"""
query = f"SRCTITLE({venue})"
return await self.search(query, limit=limit, start_year=start_year)
async def test_api_access(self):
"""测试API访问权限"""
print(f"\n测试API密钥: {self.api_key}")
# 测试1: 基础搜索
basic_params = {
"query": "test",
"count": 1,
"view": "STANDARD"
}
print("\n1. 测试基础搜索...")
response = await self._make_request(
f"{self.base_url}/search/scopus",
params=basic_params
)
if response:
print("基础搜索成功")
print("可用字段:", list(response.get("search-results", {}).get("entry", [{}])[0].keys()))
# 测试2: 测试单篇文章访问
print("\n2. 测试文章详情访问...")
test_doi = "10.1016/j.artint.2021.103535" # 一个示例DOI
response = await self._make_request(
f"{self.base_url}/abstract/doi/{test_doi}",
params={"view": "STANDARD"} # 改为STANDARD视图
)
if response:
print("文章详情访问成功")
else:
print("文章详情访问失败")
async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
"""获取论文详细信息
注意当前API权限不支持获取详细信息返回None
Args:
paper_id: 论文ID
Returns:
None因为当前API权限不支持此功能
"""
return None
async def fetch_abstract(self, doi: str) -> Optional[str]:
"""获取论文摘要
使用Scopus Abstract API获取论文摘要
Args:
doi: 论文的DOI
Returns:
摘要文本如果获取失败则返回None
"""
try:
# 使用Abstract API而不是Search API
response = await self._make_request(
f"{self.base_url}/abstract/doi/{doi}",
params={
"view": "FULL" # 使用FULL视图
}
)
if response and "abstracts-retrieval-response" in response:
# 从coredata中获取摘要
coredata = response["abstracts-retrieval-response"].get("coredata", {})
return coredata.get("dc:description", "")
return None
except Exception as e:
print(f"获取摘要时发生错误: {str(e)}")
return None
async def example_usage():
"""ElsevierSource使用示例"""
elsevier = ElsevierSource()
try:
# 首先测试API访问权限
print("\n=== 测试API访问权限 ===")
await elsevier.test_api_access()
# 示例1基本搜索
print("\n=== 示例1搜索机器学习相关论文 ===")
papers = await elsevier.search("machine learning", limit=3)
for i, paper in enumerate(papers, 1):
print(f"\n--- 论文 {i} ---")
print(f"标题: {paper.title}")
print(f"作者: {', '.join(paper.authors)}")
print(f"发表年份: {paper.year}")
print(f"DOI: {paper.doi}")
print(f"URL: {paper.url}")
print(f"引用次数: {paper.citations}")
print(f"期刊/会议: {paper.venue}")
print("期刊信息:")
for key, value in paper.venue_info.items():
if value: # 只打印非空值
print(f" - {key}: {value}")
# 示例2获取引用信息
if papers and papers[0].doi:
print("\n=== 示例2获取引用该论文的文献 ===")
citations = await elsevier.get_citations(papers[0].doi, limit=3)
for i, paper in enumerate(citations, 1):
print(f"\n--- 引用论文 {i} ---")
print(f"标题: {paper.title}")
print(f"作者: {', '.join(paper.authors)}")
print(f"发表年份: {paper.year}")
print(f"DOI: {paper.doi}")
print(f"引用次数: {paper.citations}")
print(f"期刊/会议: {paper.venue}")
# 示例3获取参考文献
if papers and papers[0].doi:
print("\n=== 示例3获取论文的参考文献 ===")
references = await elsevier.get_references(papers[0].doi)
for i, paper in enumerate(references[:3], 1):
print(f"\n--- 参考文献 {i} ---")
print(f"标题: {paper.title}")
print(f"作者: {', '.join(paper.authors)}")
print(f"发表年份: {paper.year}")
print(f"DOI: {paper.doi}")
print(f"期刊/会议: {paper.venue}")
# 示例4按作者搜索
print("\n=== 示例4按作者搜索 ===")
author_papers = await elsevier.search_by_author("Hinton G", limit=3)
for i, paper in enumerate(author_papers, 1):
print(f"\n--- 论文 {i} ---")
print(f"标题: {paper.title}")
print(f"作者: {', '.join(paper.authors)}")
print(f"发表年份: {paper.year}")
print(f"DOI: {paper.doi}")
print(f"引用次数: {paper.citations}")
print(f"期刊/会议: {paper.venue}")
# 示例5按机构搜索
print("\n=== 示例5按机构搜索 ===")
affiliation_papers = await elsevier.search_by_affiliation("60027950", limit=3) # MIT的机构ID
for i, paper in enumerate(affiliation_papers, 1):
print(f"\n--- 论文 {i} ---")
print(f"标题: {paper.title}")
print(f"作者: {', '.join(paper.authors)}")
print(f"发表年份: {paper.year}")
print(f"DOI: {paper.doi}")
print(f"引用次数: {paper.citations}")
print(f"期刊/会议: {paper.venue}")
# 示例6获取论文摘要
print("\n=== 示例6获取论文摘要 ===")
test_doi = "10.1016/j.artint.2021.103535"
abstract = await elsevier.fetch_abstract(test_doi)
if abstract:
print(f"摘要: {abstract[:200]}...") # 只显示前200个字符
else:
print("无法获取摘要")
# 在搜索结果中显示摘要
print("\n=== 示例7搜索结果中的摘要 ===")
papers = await elsevier.search("machine learning", limit=1)
for paper in papers:
print(f"标题: {paper.title}")
print(f"摘要: {paper.abstract[:200]}..." if paper.abstract else "摘要: 无")
except Exception as e:
print(f"发生错误: {str(e)}")
import traceback
print(traceback.format_exc())
if __name__ == "__main__":
asyncio.run(example_usage())