Files
gpt_academic/crazy_functions/review_fns/data_sources/pubmed_source.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

459 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, Optional, Dict, Union
from datetime import datetime
import aiohttp
import asyncio
from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
import xml.etree.ElementTree as ET
from urllib.parse import quote
import json
from tqdm import tqdm
import random
class PubMedSource(DataSource):
    """PubMed data source built on the NCBI E-utilities HTTP endpoints.

    Searches go through ``esearch.fcgi``, record details through
    ``efetch.fcgi`` and article-to-article links through ``elink.fcgi``.
    Every public coroutine is best-effort: request or parsing failures are
    printed and reported as an empty result instead of being raised.
    """

    # Pool of API keys; one is chosen at random when the caller passes none.
    # NOTE(review): the values below look like redacted placeholders - real
    # keys should be supplied via ``api_key`` or configuration rather than
    # being committed to source control.
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    ]

    # Maximum number of PMIDs sent in a single efetch request, so the
    # request URL stays reasonably short.
    _FETCH_BATCH_SIZE = 50

    def __init__(self, api_key: Optional[str] = None):
        """Create the source.

        Args:
            api_key: NCBI API key. When omitted (or empty), one entry of
                ``API_KEYS`` is picked at random.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set the E-utilities base URL and the default request headers."""
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.headers = {
            "User-Agent": "Mozilla/5.0 PubMedDataSource/1.0",
            "Accept": "application/json"
        }

    async def _make_request(self, url: str) -> Optional[str]:
        """Send a GET request and return the response body.

        Args:
            url: Fully built request URL.

        Returns:
            The response text on HTTP 200; ``None`` on any other status or
            when the request raises (the error is printed, not propagated).
        """
        try:
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        return await response.text()
                    print(f"请求失败: {response.status}")
                    return None
        except Exception as e:
            print(f"请求发生错误: {str(e)}")
            return None

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = "relevance",
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search PubMed and return metadata for the matching papers.

        Args:
            query: PubMed search expression.
            limit: Maximum number of results requested (``retmax``).
            sort_by: Only ``"date"`` changes the request (adds
                ``sort=date``); any other value leaves the server default.
            start_year: When truthy, restricts results to publication dates
                from this year onwards.

        Returns:
            Parsed papers, or an empty list on failure or when nothing
            matches.
        """
        try:
            if start_year:
                # An open-ended upper bound of 3000 means "from start_year on".
                query = f"{query} AND {start_year}:3000[dp]"

            search_url = (
                f"{self.base_url}/esearch.fcgi?"
                f"db=pubmed&term={quote(query)}&retmax={limit}"
                f"&usehistory=y&api_key={self.api_key}"
            )
            if sort_by == "date":
                search_url += "&sort=date"

            response = await self._make_request(search_url)
            if not response:
                return []

            root = ET.fromstring(response)
            pmids = [elem.text for elem in root.findall(".//Id") if elem.text]
            return await self._fetch_papers(pmids)
        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    async def _fetch_papers(self, pmids: List[str]) -> List[PaperMetadata]:
        """Fetch details for any number of PMIDs, one batch at a time.

        Args:
            pmids: PubMed IDs; may be empty.

        Returns:
            Parsed papers from all batches, in request order.
        """
        papers: List[PaperMetadata] = []
        step = self._FETCH_BATCH_SIZE
        for start in range(0, len(pmids), step):
            papers.extend(await self._fetch_papers_batch(pmids[start:start + step]))
        return papers

    async def _fetch_papers_batch(self, pmids: List[str]) -> List[PaperMetadata]:
        """Fetch and parse the records for one batch of PMIDs.

        Args:
            pmids: PubMed IDs to request in a single efetch call.

        Returns:
            Successfully parsed papers; records that fail to parse are
            skipped rather than returned as ``None``. Empty on failure.
        """
        try:
            fetch_url = (
                f"{self.base_url}/efetch.fcgi?"
                f"db=pubmed&id={','.join(pmids)}"
                f"&retmode=xml&api_key={self.api_key}"
            )
            response = await self._make_request(fetch_url)
            if not response:
                return []

            root = ET.fromstring(response)
            parsed = (
                self._parse_article(article)
                for article in root.findall(".//PubmedArticle")
            )
            # _parse_article reports failure as None; never hand that to callers.
            return [paper for paper in parsed if paper is not None]
        except Exception as e:
            print(f"获取论文批次时发生错误: {str(e)}")
            return []

    @staticmethod
    def _node_text(node: Optional[ET.Element]) -> str:
        """Return all text inside *node*, including text of nested markup."""
        if node is None:
            return ""
        # .text alone stops at the first child element (e.g. <i>, <sub>).
        return "".join(node.itertext()).strip()

    @staticmethod
    def _extract_year(article_meta: ET.Element) -> Optional[int]:
        """Return the publication year from Year, else from MedlineDate."""
        for path in (".//PubDate/Year", ".//PubDate/MedlineDate"):
            raw = (article_meta.findtext(path) or "").strip()
            prefix = raw[:4]
            # MedlineDate is free text such as "2020 Jan-Feb"; use a leading year.
            if len(prefix) == 4 and prefix.isdecimal():
                return int(prefix)
        return None

    @staticmethod
    def _extract_link_ids(xml_text: str) -> List[str]:
        """Return the linked IDs (``Link/Id`` elements) of an elink response."""
        root = ET.fromstring(xml_text)
        return [elem.text for elem in root.findall(".//Link/Id") if elem.text]

    def _parse_article(self, article: ET.Element) -> Optional[PaperMetadata]:
        """Convert one ``PubmedArticle`` element into ``PaperMetadata``.

        Args:
            article: The ``PubmedArticle`` XML element.

        Returns:
            The parsed paper, or ``None`` when parsing fails (the error is
            printed).
        """
        try:
            pmid = article.findtext(".//PMID")
            article_meta = article.find(".//Article")
            if article_meta is None:
                raise ValueError("missing <Article> element")

            title = self._node_text(article_meta.find(".//ArticleTitle"))

            # Authors: personal names first, collective (group) names as fallback.
            authors = []
            for author in article_meta.findall(".//Author"):
                last_name = author.findtext("LastName")
                fore_name = author.findtext("ForeName")
                collective = author.findtext("CollectiveName")
                if last_name and fore_name:
                    authors.append(f"{fore_name} {last_name}")
                elif last_name:
                    authors.append(last_name)
                elif collective:
                    authors.append(collective)

            # Structured abstracts have several AbstractText sections; keep all.
            sections = []
            for node in article_meta.findall(".//Abstract/AbstractText"):
                text = self._node_text(node)
                if not text:
                    continue
                label = node.get("Label")
                sections.append(f"{label}: {text}" if label else text)
            abstract = "\n".join(sections)

            year = self._extract_year(article_meta)

            # DOI: prefer ELocationID, fall back to the article's own ID list
            # (path anchored at PubmedData so reference-list DOIs are ignored).
            doi = article.findtext(".//ELocationID[@EIdType='doi']")
            if not doi:
                doi = article.findtext(
                    "PubmedData/ArticleIdList/ArticleId[@IdType='doi']"
                )
            doi = doi or None

            journal = article_meta.find(".//Journal")
            if journal is not None:
                venue = journal.findtext(".//Title") or None
                date_parts = (
                    journal.findtext(".//PubDate/Year", ""),
                    journal.findtext(".//PubDate/Month", ""),
                )
                venue_info = {
                    'issn': journal.findtext(".//ISSN"),
                    'volume': journal.findtext(".//Volume"),
                    'issue': journal.findtext(".//Issue"),
                    # Join only the parts present to avoid "2020-" or "-".
                    'pub_date': journal.findtext(".//PubDate/MedlineDate")
                    or "-".join(part for part in date_parts if part),
                }
            else:
                venue = None
                venue_info = {}

            institutions = [
                affiliation.text
                for affiliation in article_meta.findall(".//Affiliation")
                if affiliation.text
            ]

            return PaperMetadata(
                title=title,
                authors=authors,
                abstract=abstract,
                year=year,
                doi=doi,
                url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else None,
                citations=None,  # citation counts are not part of this record
                venue=venue,
                institutions=institutions,
                venue_type="journal",
                venue_name=venue,
                venue_info=venue_info,
                source='pubmed'  # marks which data source produced the record
            )
        except Exception as e:
            print(f"解析文章时发生错误: {str(e)}")
            return None

    async def get_paper_details(self, pmid: str) -> Optional[PaperMetadata]:
        """Return the paper with the given PMID, or ``None`` if unavailable."""
        papers = await self._fetch_papers_batch([pmid])
        return papers[0] if papers else None

    async def get_related_papers(self, pmid: str, limit: int = 100) -> List[PaperMetadata]:
        """Return papers PubMed lists as similar to the given one.

        Args:
            pmid: PubMed ID of the source paper.
            limit: Maximum number of related papers to return.

        Returns:
            Related papers, or an empty list on failure.
        """
        try:
            # linkname restricts the answer to the "similar articles" set
            # instead of every pubmed-to-pubmed link type mixed together.
            link_url = (
                f"{self.base_url}/elink.fcgi?"
                f"dbfrom=pubmed&db=pubmed&id={pmid}&cmd=neighbor"
                f"&linkname=pubmed_pubmed&api_key={self.api_key}"
            )
            response = await self._make_request(link_url)
            if not response:
                return []

            source_id = str(pmid).strip()
            # Drop the source article in case the link set lists it as well.
            pmids = [
                linked for linked in self._extract_link_ids(response)
                if linked != source_id
            ][:limit]
            return await self._fetch_papers(pmids)
        except Exception as e:
            print(f"获取相关论文时发生错误: {str(e)}")
            return []

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by author name, optionally from ``start_year`` on."""
        # search() appends the same year filter this method used to build by hand.
        return await self.search(f"{author}[Author]", limit=limit, start_year=start_year)

    async def search_by_journal(
        self,
        journal: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by journal name, optionally from ``start_year`` on."""
        # search() appends the same year filter this method used to build by hand.
        return await self.search(f"{journal}[Journal]", limit=limit, start_year=start_year)

    async def get_latest_papers(
        self,
        days: int = 7,
        limit: int = 100
    ) -> List[PaperMetadata]:
        """Return papers published within the last ``days`` days.

        Args:
            days: Size of the look-back window in days.
            limit: Maximum number of results requested.

        Returns:
            Matching papers, requested in date order.
        """
        # PubMed's relative-date syntax requires the phrase to be quoted.
        query = f'"last {days} days"[dp]'
        return await self.search(query, limit=limit, sort_by="date")

    async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
        """Return the papers citing ``paper_id``; currently always empty.

        Citation lookup is not implemented for this source; another source
        (e.g. CrossRef) could be integrated later.

        Args:
            paper_id: PubMed ID.

        Returns:
            An empty list.
        """
        return []

    async def get_references(self, paper_id: str) -> List[PaperMetadata]:
        """Return the papers that ``paper_id`` cites.

        Args:
            paper_id: PubMed ID.

        Returns:
            Referenced papers, or an empty list on failure or when PubMed
            has no reference links for the paper.
        """
        try:
            # cmd=neighbor returns the linked IDs inline; neighbor_history
            # would only return a History-server handle with no Link/Id data.
            refs_url = (
                f"{self.base_url}/elink.fcgi?"
                f"dbfrom=pubmed&db=pubmed&id={paper_id}"
                f"&cmd=neighbor&linkname=pubmed_pubmed_refs"
                f"&api_key={self.api_key}"
            )
            response = await self._make_request(refs_url)
            if not response:
                return []

            return await self._fetch_papers(self._extract_link_ids(response))
        except Exception as e:
            print(f"获取参考文献时发生错误: {str(e)}")
            return []
async def example_usage():
    """Walk through the PubMedSource API, printing results for each call."""
    pubmed = PubMedSource()

    try:
        # Example 1: plain keyword search.
        print("\n=== 示例1搜索COVID-19相关论文 ===")
        papers = await pubmed.search("COVID-19", limit=3)
        for i, paper in enumerate(papers, start=1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"作者: {', '.join(paper.authors)}")
            print(f"发表年份: {paper.year}")
            print(f"DOI: {paper.doi}")
            if paper.abstract:
                print(f"摘要: {paper.abstract[:200]}...")

        if papers:
            # Example 2: details for the first hit; the PMID is the last
            # path segment of its PubMed URL.
            print("\n=== 示例2获取论文详情 ===")
            paper_id = papers[0].url.split("/")[-2]
            paper = await pubmed.get_paper_details(paper_id)
            if paper:
                print(f"标题: {paper.title}")
                print(f"期刊: {paper.venue}")
                print(f"机构: {', '.join(paper.institutions)}")

            # Example 3: papers related to that first hit.
            print("\n=== 示例3获取相关论文 ===")
            related = await pubmed.get_related_papers(paper_id, limit=3)
            for i, paper in enumerate(related, start=1):
                print(f"\n--- 相关论文 {i} ---")
                print(f"标题: {paper.title}")
                print(f"作者: {', '.join(paper.authors)}")

        # Example 4: search by author.
        print("\n=== 示例4按作者搜索 ===")
        author_papers = await pubmed.search_by_author("Fauci AS", limit=3)
        for i, paper in enumerate(author_papers, start=1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表年份: {paper.year}")

        # Example 5: search by journal.
        print("\n=== 示例5按期刊搜索 ===")
        journal_papers = await pubmed.search_by_journal("Nature", limit=3)
        for i, paper in enumerate(journal_papers, start=1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表年份: {paper.year}")

        # Example 6: most recent papers.
        print("\n=== 示例6获取最新论文 ===")
        latest = await pubmed.get_latest_papers(days=7, limit=3)
        for i, paper in enumerate(latest, start=1):
            print(f"\n--- 最新论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表日期: {paper.venue_info.get('pub_date')}")

        if papers:
            # Example 7: references of the first hit (at most three shown).
            print("\n=== 示例7获取论文的参考文献 ===")
            paper_id = papers[0].url.split("/")[-2]
            references = await pubmed.get_references(paper_id)
            for i, paper in enumerate(references[:3], start=1):
                print(f"\n--- 参考文献 {i} ---")
                print(f"标题: {paper.title}")
                print(f"作者: {', '.join(paper.authors)}")
                print(f"发表年份: {paper.year}")

            # Example 8: citation lookup for the same paper.
            print("\n=== 示例8获取论文的引用信息 ===")
            citations = await pubmed.get_citations(paper_id)
            print(f"引用数据:{len(citations)} (PubMed API不提供引用信息)")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(example_usage())