Files
gpt_academic/crazy_functions/review_fns/data_sources/openalex_source.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

163 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import aiohttp
from typing import List, Dict, Optional
from datetime import datetime
from .base_source import DataSource, PaperMetadata
import os
from urllib.parse import quote
class OpenAlexSource(DataSource):
    """OpenAlex API implementation of ``DataSource``."""

    def _initialize(self) -> None:
        """Set the API base URL and the contact e-mail sent with every request.

        NOTE(review): presumably invoked by ``DataSource.__init__`` -- confirm
        against the base class.
        """
        self.base_url = "https://api.openalex.org"
        # The contact address is hard-coded here; it is passed to OpenAlex as
        # the ``mailto`` query parameter by every request method below.
        self.mailto = "xxxxxxxxxxxxxxxxxxxxxxxx@163.com"

    async def search(self, query: str, limit: int = 100) -> List[PaperMetadata]:
        """Search works whose title matches ``query``.

        Args:
            query: Text inserted into the ``title.search`` filter.
            limit: Value sent as ``per-page`` (number of results requested).

        Returns:
            Parsed papers. An empty list is returned (after printing the
            error) when the HTTP status is an error or when decoding/parsing
            the response fails. Errors raised while opening the request
            itself are not caught here and propagate to the caller.
        """
        params = {"mailto": self.mailto} if self.mailto else {}
        params.update({
            "filter": f"title.search:{query}",
            "per-page": limit,
        })
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/works",
                params=params,
            ) as response:
                try:
                    response.raise_for_status()
                    data = await response.json()
                    results = data.get("results", [])
                    return [self._parse_work(work) for work in results]
                except Exception as e:
                    print(f"搜索出错: {str(e)}")
                    return []

    def _parse_work(self, work: Dict) -> PaperMetadata:
        """Convert one OpenAlex work record into ``PaperMetadata``.

        Fields that are missing or JSON ``null`` are tolerated: list-valued
        fields fall back to empty lists and text fields to ``""``.
        """
        # ``or []`` also covers keys that are present but null; falsy
        # authorship entries are skipped for both authors and institutions.
        authorships = [entry for entry in (work.get("authorships") or []) if entry]

        # Prefer the raw author string; fall back to the nested author
        # display name so a null raw name does not produce a crash.
        authors = [
            self._reformat_name(
                entry.get("raw_author_name")
                or (entry.get("author") or {}).get("display_name")
                or ""
            )
            for entry in authorships
        ]

        institutions = [
            inst.get("display_name") or ""
            for entry in authorships
            for inst in (entry.get("institutions") or [])
            if inst
        ]

        # Venue comes from the primary location's source, when available.
        primary_location = work.get("primary_location") or {}
        source = primary_location.get("source") or {}
        venue = source.get("display_name")

        # OpenAlex ships abstracts as an inverted index rather than plain
        # text; a plain "abstract" field is still honoured if one is present.
        abstract = work.get("abstract") or self._abstract_from_inverted_index(
            work.get("abstract_inverted_index")
        )

        return PaperMetadata(
            title=work.get("title") or "",
            authors=authors,
            institutions=institutions,
            abstract=abstract,
            year=work.get("publication_year"),
            doi=work.get("doi"),
            url=work.get("doi"),  # OpenAlex uses the DOI as the URL
            citations=work.get("cited_by_count"),
            venue=venue,
        )

    @staticmethod
    def _abstract_from_inverted_index(
        inverted_index: Optional[Dict[str, List[int]]],
    ) -> str:
        """Rebuild abstract text from a ``{word: [positions]}`` mapping.

        Returns ``""`` when the index is missing or empty.
        """
        if not inverted_index:
            return ""
        placed = [
            (position, word)
            for word, positions in inverted_index.items()
            for position in (positions or [])
        ]
        placed.sort()
        return " ".join(word for _, word in placed)

    @staticmethod
    def _normalize_doi(doi: str) -> str:
        """Strip a leading ``https://doi.org/``, ``http://doi.org/`` or ``doi:``.

        ``_parse_work`` stores the DOI exactly as OpenAlex returns it (used as
        a URL), while the lookup methods prepend the resolver themselves;
        without this step a DOI taken from a search result would be prefixed
        twice.
        """
        cleaned = doi.strip()
        lowered = cleaned.lower()
        for prefix in ("https://doi.org/", "http://doi.org/", "doi:"):
            if lowered.startswith(prefix):
                return cleaned[len(prefix):]
        return cleaned

    def _reformat_name(self, name: str) -> str:
        """Turn ``"Family, Given"`` into ``"Given Family"``.

        Names without a comma are returned unchanged; empty or ``None`` input
        yields ``""``.
        """
        if not name:
            return ""
        if "," not in name:
            return name
        family, given_names = (x.strip() for x in name.split(",", maxsplit=1))
        return f"{given_names} {family}"

    async def get_paper_details(self, doi: str) -> PaperMetadata:
        """Fetch and parse the work identified by ``doi``.

        ``doi`` may be a bare DOI or carry a resolver / ``doi:`` prefix.
        The HTTP status is not checked; whatever JSON comes back is parsed.
        """
        params = {"mailto": self.mailto} if self.mailto else {}
        bare_doi = self._normalize_doi(doi)
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/works/https://doi.org/{quote(bare_doi, safe='')}",
                params=params,
            ) as response:
                data = await response.json()
                return self._parse_work(data)

    async def get_references(self, doi: str) -> List[PaperMetadata]:
        """Fetch the reference list of the work identified by ``doi``.

        ``doi`` may be a bare DOI or carry a resolver / ``doi:`` prefix.
        NOTE(review): this relies on a ``/works/<id>/references`` endpoint --
        confirm that OpenAlex actually serves it.
        """
        params = {"mailto": self.mailto} if self.mailto else {}
        bare_doi = self._normalize_doi(doi)
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/works/https://doi.org/{quote(bare_doi, safe='')}/references",
                params=params,
            ) as response:
                data = await response.json()
                return [self._parse_work(work) for work in data.get("results", [])]

    async def get_citations(self, doi: str) -> List[PaperMetadata]:
        """Fetch works that cite the work identified by ``doi``.

        Requests up to 100 results in a single page.
        NOTE(review): the ``cites:doi:<doi>`` filter form is passed through
        unchanged -- confirm it is accepted by the API.
        """
        params = {"mailto": self.mailto} if self.mailto else {}
        params.update({
            "filter": f"cites:doi:{doi}",
            "per-page": 100,
        })
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/works",
                params=params,
            ) as response:
                data = await response.json()
                return [self._parse_work(work) for work in data.get("results", [])]
async def example_usage():
    """Demonstrate OpenAlexSource: run a small title search and print each hit."""
    unknown = '未知'
    source = OpenAlexSource()
    try:
        print("正在搜索论文...")
        # Search for papers about "artificial intelligence", capped at 5 results.
        found = await source.search(query="artificial intelligence", limit=5)
        if not found:
            print("未获取到任何论文信息")
            return

        print(f"找到 {len(found)} 篇论文")
        # Print every field of every result, substituting a placeholder for
        # values that are missing.
        for index, item in enumerate(found, 1):
            print(f"\n--- 论文 {index} ---")
            print(f"标题: {item.title}")

            author_text = ', '.join(item.authors) if item.authors else unknown
            print(f"作者: {author_text}")

            if item.institutions:
                institution_text = ', '.join(item.institutions)
                print(f"机构: {institution_text}")

            print(f"发表年份: {item.year or unknown}")
            print(f"DOI: {item.doi or unknown}")
            print(f"URL: {item.url or unknown}")

            if item.abstract:
                # Only the first 200 characters of the abstract are shown.
                preview = item.abstract[:200]
                print(f"摘要: {preview}...")

            # A citation count of 0 is a real value, so only None is replaced.
            citation_text = unknown if item.citations is None else item.citations
            print(f"引用次数: {citation_text}")
            print(f"发表venue: {item.venue or unknown}")
    except Exception as exc:
        print(f"发生错误: {exc!s}")
        import traceback
        print(traceback.format_exc())
# When this file is executed directly, run the example code.
if __name__ == "__main__":
    import asyncio

    # Create the example coroutine, then drive it to completion.
    demo_coroutine = example_usage()
    asyncio.run(demo_coroutine)