Files
gpt_academic/crazy_functions/review_fns/data_sources/scopus_source.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

400 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, Optional, Dict, Union
from datetime import datetime
import aiohttp
import random
from .base_source import DataSource, PaperMetadata
from tqdm import tqdm
class ScopusSource(DataSource):
    """Scopus (Elsevier) API implementation of the DataSource interface."""

    # Predefined fallback API keys (placeholders); one is chosen at random
    # when the caller does not supply a key.
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    ]

    def __init__(self, api_key: str = None):
        """Initialize the data source.

        Args:
            api_key: Scopus API key; if not provided, one is picked at
                random from the predefined API_KEYS list.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set up the base URL and default request headers."""
        self.base_url = "https://api.elsevier.com/content"
        self.headers = {
            "X-ELS-APIKey": self.api_key,
            "Accept": "application/json"
        }

    async def _make_request(self, url: str, params: Dict = None) -> Optional[Dict]:
        """Send an HTTP GET request to the Scopus API.

        Args:
            url: Request URL.
            params: Query parameters.

        Returns:
            Parsed JSON body on HTTP 200, otherwise None. Network and
            decoding errors are reported to stdout and swallowed so that
            callers can treat None uniformly as "no data".
        """
        try:
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        return await response.json()
                    else:
                        print(f"请求失败: {response.status}")
                        return None
        except Exception as e:
            print(f"请求发生错误: {str(e)}")
            return None

    @staticmethod
    def _as_item_list(value) -> list:
        """Normalize a Scopus field that may be a list, a dict, or absent
        into a list of dicts (Scopus serializes single-element arrays as a
        bare object)."""
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            return [value]
        return []

    def _parse_paper_data(self, data: Dict) -> PaperMetadata:
        """Parse one paper record returned by the Scopus API.

        Args:
            data: Raw paper record from the Scopus API.

        Returns:
            Parsed PaperMetadata, or None if the record cannot be parsed.
        """
        try:
            # Basic bibliographic fields.
            title = data.get("dc:title", "")

            # Author names: prefer "given-name surname", fall back to the
            # pre-formatted indexed name.
            authors = []
            for author in self._as_item_list(data.get("author")):
                if "given-name" in author and "surname" in author:
                    authors.append(f"{author['given-name']} {author['surname']}")
                elif "indexed-name" in author:
                    authors.append(author["indexed-name"])

            abstract = data.get("dc:description", "")

            # Publication year from the ISO cover date (YYYY-MM-DD).
            year = None
            if "prism:coverDate" in data:
                try:
                    year = int(data["prism:coverDate"][:4])
                except (ValueError, TypeError):  # was a bare except
                    pass

            doi = data.get("prism:doi")

            # Citation count arrives as a string; keep None when absent or
            # unparseable.
            citations = data.get("citedby-count")
            if citations:
                try:
                    citations = int(citations)
                except (ValueError, TypeError):  # was a bare except
                    citations = None

            venue = data.get("prism:publicationName")

            # Affiliations follow the same list/dict duality as authors.
            institutions = []
            for aff in self._as_item_list(data.get("affiliation")):
                if "affilname" in aff:
                    institutions.append(aff["affilname"])

            # Journal/issue details bundled for downstream formatting.
            venue_info = {
                "issn": data.get("prism:issn"),
                "eissn": data.get("prism:eIssn"),
                "volume": data.get("prism:volume"),
                "issue": data.get("prism:issueIdentifier"),
                "page_range": data.get("prism:pageRange"),
                "article_number": data.get("article-number"),
                "publication_date": data.get("prism:coverDate")
            }

            # Guard against "link" being present but an empty list, which
            # previously raised IndexError and silently dropped the record.
            links = data.get("link") or [{}]
            first_link = links[0] if links else {}
            url = first_link.get("@href") if isinstance(first_link, dict) else None

            return PaperMetadata(
                title=title,
                authors=authors,
                abstract=abstract,
                year=year,
                doi=doi,
                url=url,
                citations=citations,
                venue=venue,
                institutions=institutions,
                venue_type="journal",
                venue_name=venue,
                venue_info=venue_info
            )
        except Exception as e:
            print(f"解析论文数据时发生错误: {str(e)}")
            return None

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = None,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search for papers.

        Args:
            query: Search query string (Scopus query syntax).
            limit: Maximum number of results to return.
            sort_by: Sort order ('relevance', 'date', 'citations').
            start_year: Restrict results to this year or later.

        Returns:
            List of parsed papers; empty list on any failure.
        """
        try:
            params = {
                "query": query,
                "count": min(limit, 100),  # Scopus caps a single request at 100
                "start": 0
            }

            # Optional year filter (Scopus date-range syntax).
            if start_year:
                params["date"] = f"{start_year}-present"

            # Map the generic sort keys onto Scopus sort fields
            # (leading '-' means descending).
            if sort_by:
                sort_map = {
                    "relevance": "-score",
                    "date": "-coverDate",
                    "citations": "-citedby-count"
                }
                if sort_by in sort_map:
                    params["sort"] = sort_map[sort_by]

            url = f"{self.base_url}/search/scopus"
            response = await self._make_request(url, params)
            if not response or "search-results" not in response:
                return []

            results = response["search-results"].get("entry", [])
            papers = []
            for result in results:
                paper = self._parse_paper_data(result)
                if paper:
                    papers.append(paper)
            return papers
        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
        """Fetch details for a single paper.

        Args:
            paper_id: Scopus ID or DOI (a DOI is detected by the '/' it contains).

        Returns:
            Parsed paper details, or None on failure.
        """
        try:
            if "/" in paper_id:
                url = f"{self.base_url}/article/doi/{paper_id}"
            else:
                url = f"{self.base_url}/abstract/scopus_id/{paper_id}"

            response = await self._make_request(url)
            if not response or "abstracts-retrieval-response" not in response:
                return None

            data = response["abstracts-retrieval-response"]
            return self._parse_paper_data(data)
        except Exception as e:
            print(f"获取论文详情时发生错误: {str(e)}")
            return None

    async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
        """Fetch papers that cite the given paper.

        Args:
            paper_id: Scopus ID.

        Returns:
            List of citing papers; empty list on any failure.
        """
        try:
            url = f"{self.base_url}/abstract/citations/{paper_id}"
            response = await self._make_request(url)
            if not response or "citing-papers" not in response:
                return []

            results = response["citing-papers"].get("papers", [])
            papers = []
            for result in results:
                paper = self._parse_paper_data(result)
                if paper:
                    papers.append(paper)
            return papers
        except Exception as e:
            print(f"获取引用信息时发生错误: {str(e)}")
            return []

    async def get_references(self, paper_id: str) -> List[PaperMetadata]:
        """Fetch the papers referenced by the given paper.

        Args:
            paper_id: Scopus ID.

        Returns:
            List of referenced papers; empty list on any failure.
        """
        try:
            url = f"{self.base_url}/abstract/references/{paper_id}"
            response = await self._make_request(url)
            if not response or "references" not in response:
                return []

            results = response["references"].get("reference", [])
            papers = []
            for result in results:
                paper = self._parse_paper_data(result)
                if paper:
                    papers.append(paper)
            return papers
        except Exception as e:
            print(f"获取参考文献时发生错误: {str(e)}")
            return []

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by author name (Scopus AUTHOR-NAME query)."""
        query = f"AUTHOR-NAME({author})"
        if start_year:
            query += f" AND PUBYEAR > {start_year}"
        return await self.search(query, limit=limit)

    async def search_by_journal(
        self,
        journal: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by journal title (Scopus SRCTITLE query)."""
        query = f"SRCTITLE({journal})"
        if start_year:
            query += f" AND PUBYEAR > {start_year}"
        return await self.search(query, limit=limit)

    async def get_latest_papers(
        self,
        days: int = 7,
        limit: int = 100
    ) -> List[PaperMetadata]:
        """Fetch papers loaded into Scopus within the last `days` days,
        newest first."""
        query = f"LOAD-DATE > NOW() - {days}d"
        return await self.search(query, limit=limit, sort_by="date")
async def example_usage():
    """Usage examples for ScopusSource."""
    scopus = ScopusSource()

    def _show(found, header):
        # Shared pretty-printer: header line, then one formatted entry
        # per paper with a divider.
        print(header)
        for idx, paper in enumerate(found, 1):
            print(f"\n论文 {idx}:")
            print(f"标题: {paper.title}")
            print(f"作者: {', '.join(paper.authors)}")
            print(f"发表年份: {paper.year}")
            print(f"发表期刊: {paper.venue}")
            print(f"引用次数: {paper.citations}")
            print(f"DOI: {paper.doi}")
            if paper.abstract:
                print(f"摘要:\n{paper.abstract}")
            print("-" * 80)

    try:
        # Example 1: plain keyword search.
        print("\n=== 示例1搜索机器学习相关论文 ===")
        papers = await scopus.search("machine learning", limit=3)
        _show(papers, f"\n找到 {len(papers)} 篇相关论文:")

        # Example 2: search restricted to a specific author.
        print("\n=== 示例2搜索特定作者的论文 ===")
        author_papers = await scopus.search_by_author("Hinton G.", limit=3)
        _show(author_papers, f"\n找到 {len(author_papers)} 篇 Hinton 的论文:")

        # Example 3: keyword search with sorting and a year filter.
        print("\n=== 示例3搜索人工智能相关论文 ===")
        papers = await scopus.search(
            query="artificial intelligence AND deep learning",
            limit=5,
            sort_by="citations",  # order by citation count
            start_year=2020       # papers from 2020 onward only
        )
        _show(papers, f"\n找到 {len(papers)} 篇相关论文:")
    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
if __name__ == "__main__":
    # Manual smoke-test entry point: drive the async demo to completion.
    import asyncio

    asyncio.run(example_usage())