* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
400 lines
13 KiB
Python
400 lines
13 KiB
Python
from typing import List, Optional, Dict, Union
|
||
from datetime import datetime
|
||
import aiohttp
|
||
import random
|
||
from .base_source import DataSource, PaperMetadata
|
||
from tqdm import tqdm
|
||
|
||
class ScopusSource(DataSource):
|
||
"""Scopus API实现"""
|
||
|
||
# 定义API密钥列表
|
||
API_KEYS = [
|
||
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||
]
|
||
|
||
def __init__(self, api_key: str = None):
|
||
"""初始化
|
||
|
||
Args:
|
||
api_key: Scopus API密钥,如果不提供则从预定义列表中随机选择
|
||
"""
|
||
self.api_key = api_key or random.choice(self.API_KEYS)
|
||
self._initialize()
|
||
|
||
def _initialize(self) -> None:
|
||
"""初始化基础URL和请求头"""
|
||
self.base_url = "https://api.elsevier.com/content"
|
||
self.headers = {
|
||
"X-ELS-APIKey": self.api_key,
|
||
"Accept": "application/json"
|
||
}
|
||
|
||
async def _make_request(self, url: str, params: Dict = None) -> Optional[Dict]:
|
||
"""发送HTTP请求
|
||
|
||
Args:
|
||
url: 请求URL
|
||
params: 查询参数
|
||
|
||
Returns:
|
||
响应JSON数据
|
||
"""
|
||
try:
|
||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||
async with session.get(url, params=params) as response:
|
||
if response.status == 200:
|
||
return await response.json()
|
||
else:
|
||
print(f"请求失败: {response.status}")
|
||
return None
|
||
except Exception as e:
|
||
print(f"请求发生错误: {str(e)}")
|
||
return None
|
||
|
||
def _parse_paper_data(self, data: Dict) -> PaperMetadata:
|
||
"""解析Scopus API返回的数据
|
||
|
||
Args:
|
||
data: Scopus API返回的论文数据
|
||
|
||
Returns:
|
||
解析后的论文元数据
|
||
"""
|
||
try:
|
||
# 提取基本信息
|
||
title = data.get("dc:title", "")
|
||
|
||
# 提取作者信息
|
||
authors = []
|
||
if "author" in data:
|
||
if isinstance(data["author"], list):
|
||
for author in data["author"]:
|
||
if "given-name" in author and "surname" in author:
|
||
authors.append(f"{author['given-name']} {author['surname']}")
|
||
elif "indexed-name" in author:
|
||
authors.append(author["indexed-name"])
|
||
elif isinstance(data["author"], dict):
|
||
if "given-name" in data["author"] and "surname" in data["author"]:
|
||
authors.append(f"{data['author']['given-name']} {data['author']['surname']}")
|
||
elif "indexed-name" in data["author"]:
|
||
authors.append(data["author"]["indexed-name"])
|
||
|
||
# 提取摘要
|
||
abstract = data.get("dc:description", "")
|
||
|
||
# 提取年份
|
||
year = None
|
||
if "prism:coverDate" in data:
|
||
try:
|
||
year = int(data["prism:coverDate"][:4])
|
||
except:
|
||
pass
|
||
|
||
# 提取DOI
|
||
doi = data.get("prism:doi")
|
||
|
||
# 提取引用次数
|
||
citations = data.get("citedby-count")
|
||
if citations:
|
||
try:
|
||
citations = int(citations)
|
||
except:
|
||
citations = None
|
||
|
||
# 提取期刊信息
|
||
venue = data.get("prism:publicationName")
|
||
|
||
# 提取机构信息
|
||
institutions = []
|
||
if "affiliation" in data:
|
||
if isinstance(data["affiliation"], list):
|
||
for aff in data["affiliation"]:
|
||
if "affilname" in aff:
|
||
institutions.append(aff["affilname"])
|
||
elif isinstance(data["affiliation"], dict):
|
||
if "affilname" in data["affiliation"]:
|
||
institutions.append(data["affiliation"]["affilname"])
|
||
|
||
# 构建venue信息
|
||
venue_info = {
|
||
"issn": data.get("prism:issn"),
|
||
"eissn": data.get("prism:eIssn"),
|
||
"volume": data.get("prism:volume"),
|
||
"issue": data.get("prism:issueIdentifier"),
|
||
"page_range": data.get("prism:pageRange"),
|
||
"article_number": data.get("article-number"),
|
||
"publication_date": data.get("prism:coverDate")
|
||
}
|
||
|
||
return PaperMetadata(
|
||
title=title,
|
||
authors=authors,
|
||
abstract=abstract,
|
||
year=year,
|
||
doi=doi,
|
||
url=data.get("link", [{}])[0].get("@href"),
|
||
citations=citations,
|
||
venue=venue,
|
||
institutions=institutions,
|
||
venue_type="journal",
|
||
venue_name=venue,
|
||
venue_info=venue_info
|
||
)
|
||
|
||
except Exception as e:
|
||
print(f"解析论文数据时发生错误: {str(e)}")
|
||
return None
|
||
|
||
async def search(
|
||
self,
|
||
query: str,
|
||
limit: int = 100,
|
||
sort_by: str = None,
|
||
start_year: int = None
|
||
) -> List[PaperMetadata]:
|
||
"""搜索论文
|
||
|
||
Args:
|
||
query: 搜索关键词
|
||
limit: 返回结果数量限制
|
||
sort_by: 排序方式 ('relevance', 'date', 'citations')
|
||
start_year: 起始年份
|
||
|
||
Returns:
|
||
论文列表
|
||
"""
|
||
try:
|
||
# 构建查询参数
|
||
params = {
|
||
"query": query,
|
||
"count": min(limit, 100), # Scopus API单次请求限制
|
||
"start": 0
|
||
}
|
||
|
||
# 添加年份过滤
|
||
if start_year:
|
||
params["date"] = f"{start_year}-present"
|
||
|
||
# 添加排序
|
||
if sort_by:
|
||
sort_map = {
|
||
"relevance": "-score",
|
||
"date": "-coverDate",
|
||
"citations": "-citedby-count"
|
||
}
|
||
if sort_by in sort_map:
|
||
params["sort"] = sort_map[sort_by]
|
||
|
||
# 发送请求
|
||
url = f"{self.base_url}/search/scopus"
|
||
response = await self._make_request(url, params)
|
||
|
||
if not response or "search-results" not in response:
|
||
return []
|
||
|
||
# 解析结果
|
||
results = response["search-results"].get("entry", [])
|
||
papers = []
|
||
|
||
for result in results:
|
||
paper = self._parse_paper_data(result)
|
||
if paper:
|
||
papers.append(paper)
|
||
|
||
return papers
|
||
|
||
except Exception as e:
|
||
print(f"搜索论文时发生错误: {str(e)}")
|
||
return []
|
||
|
||
async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
|
||
"""获取论文详情
|
||
|
||
Args:
|
||
paper_id: Scopus ID或DOI
|
||
|
||
Returns:
|
||
论文详情
|
||
"""
|
||
try:
|
||
# 判断是否为DOI
|
||
if "/" in paper_id:
|
||
url = f"{self.base_url}/article/doi/{paper_id}"
|
||
else:
|
||
url = f"{self.base_url}/abstract/scopus_id/{paper_id}"
|
||
|
||
response = await self._make_request(url)
|
||
|
||
if not response or "abstracts-retrieval-response" not in response:
|
||
return None
|
||
|
||
data = response["abstracts-retrieval-response"]
|
||
return self._parse_paper_data(data)
|
||
|
||
except Exception as e:
|
||
print(f"获取论文详情时发生错误: {str(e)}")
|
||
return None
|
||
|
||
async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
|
||
"""获取引用该论文的文献
|
||
|
||
Args:
|
||
paper_id: Scopus ID
|
||
|
||
Returns:
|
||
引用论文列表
|
||
"""
|
||
try:
|
||
url = f"{self.base_url}/abstract/citations/{paper_id}"
|
||
response = await self._make_request(url)
|
||
|
||
if not response or "citing-papers" not in response:
|
||
return []
|
||
|
||
results = response["citing-papers"].get("papers", [])
|
||
papers = []
|
||
|
||
for result in results:
|
||
paper = self._parse_paper_data(result)
|
||
if paper:
|
||
papers.append(paper)
|
||
|
||
return papers
|
||
|
||
except Exception as e:
|
||
print(f"获取引用信息时发生错误: {str(e)}")
|
||
return []
|
||
|
||
async def get_references(self, paper_id: str) -> List[PaperMetadata]:
|
||
"""获取该论文引用的文献
|
||
|
||
Args:
|
||
paper_id: Scopus ID
|
||
|
||
Returns:
|
||
参考文献列表
|
||
"""
|
||
try:
|
||
url = f"{self.base_url}/abstract/references/{paper_id}"
|
||
response = await self._make_request(url)
|
||
|
||
if not response or "references" not in response:
|
||
return []
|
||
|
||
results = response["references"].get("reference", [])
|
||
papers = []
|
||
|
||
for result in results:
|
||
paper = self._parse_paper_data(result)
|
||
if paper:
|
||
papers.append(paper)
|
||
|
||
return papers
|
||
|
||
except Exception as e:
|
||
print(f"获取参考文献时发生错误: {str(e)}")
|
||
return []
|
||
|
||
async def search_by_author(
|
||
self,
|
||
author: str,
|
||
limit: int = 100,
|
||
start_year: int = None
|
||
) -> List[PaperMetadata]:
|
||
"""按作者搜索论文"""
|
||
query = f"AUTHOR-NAME({author})"
|
||
if start_year:
|
||
query += f" AND PUBYEAR > {start_year}"
|
||
return await self.search(query, limit=limit)
|
||
|
||
async def search_by_journal(
|
||
self,
|
||
journal: str,
|
||
limit: int = 100,
|
||
start_year: int = None
|
||
) -> List[PaperMetadata]:
|
||
"""按期刊搜索论文"""
|
||
query = f"SRCTITLE({journal})"
|
||
if start_year:
|
||
query += f" AND PUBYEAR > {start_year}"
|
||
return await self.search(query, limit=limit)
|
||
|
||
async def get_latest_papers(
|
||
self,
|
||
days: int = 7,
|
||
limit: int = 100
|
||
) -> List[PaperMetadata]:
|
||
"""获取最新论文"""
|
||
query = f"LOAD-DATE > NOW() - {days}d"
|
||
return await self.search(query, limit=limit, sort_by="date")
|
||
|
||
async def example_usage():
|
||
"""ScopusSource使用示例"""
|
||
scopus = ScopusSource()
|
||
|
||
try:
|
||
# 示例1:基本搜索
|
||
print("\n=== 示例1:搜索机器学习相关论文 ===")
|
||
papers = await scopus.search("machine learning", limit=3)
|
||
print(f"\n找到 {len(papers)} 篇相关论文:")
|
||
for i, paper in enumerate(papers, 1):
|
||
print(f"\n论文 {i}:")
|
||
print(f"标题: {paper.title}")
|
||
print(f"作者: {', '.join(paper.authors)}")
|
||
print(f"发表年份: {paper.year}")
|
||
print(f"发表期刊: {paper.venue}")
|
||
print(f"引用次数: {paper.citations}")
|
||
print(f"DOI: {paper.doi}")
|
||
if paper.abstract:
|
||
print(f"摘要:\n{paper.abstract}")
|
||
print("-" * 80)
|
||
|
||
# 示例2:按作者搜索
|
||
print("\n=== 示例2:搜索特定作者的论文 ===")
|
||
author_papers = await scopus.search_by_author("Hinton G.", limit=3)
|
||
print(f"\n找到 {len(author_papers)} 篇 Hinton 的论文:")
|
||
for i, paper in enumerate(author_papers, 1):
|
||
print(f"\n论文 {i}:")
|
||
print(f"标题: {paper.title}")
|
||
print(f"作者: {', '.join(paper.authors)}")
|
||
print(f"发表年份: {paper.year}")
|
||
print(f"发表期刊: {paper.venue}")
|
||
print(f"引用次数: {paper.citations}")
|
||
print(f"DOI: {paper.doi}")
|
||
if paper.abstract:
|
||
print(f"摘要:\n{paper.abstract}")
|
||
print("-" * 80)
|
||
|
||
# 示例3:根据关键词搜索相关论文
|
||
print("\n=== 示例3:搜索人工智能相关论文 ===")
|
||
keywords = "artificial intelligence AND deep learning"
|
||
papers = await scopus.search(
|
||
query=keywords,
|
||
limit=5,
|
||
sort_by="citations", # 按引用次数排序
|
||
start_year=2020 # 只搜索2020年之后的论文
|
||
)
|
||
|
||
print(f"\n找到 {len(papers)} 篇相关论文:")
|
||
for i, paper in enumerate(papers, 1):
|
||
print(f"\n论文 {i}:")
|
||
print(f"标题: {paper.title}")
|
||
print(f"作者: {', '.join(paper.authors)}")
|
||
print(f"发表年份: {paper.year}")
|
||
print(f"发表期刊: {paper.venue}")
|
||
print(f"引用次数: {paper.citations}")
|
||
print(f"DOI: {paper.doi}")
|
||
if paper.abstract:
|
||
print(f"摘要:\n{paper.abstract}")
|
||
print("-" * 80)
|
||
|
||
except Exception as e:
|
||
print(f"发生错误: {str(e)}")
|
||
import traceback
|
||
print(traceback.format_exc())
|
||
|
||
if __name__ == "__main__":
|
||
import asyncio
|
||
asyncio.run(example_usage()) |