Files
gpt_academic/crazy_functions/paper_fns/wiki/wikipedia_api.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

387 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import aiohttp
import asyncio
from typing import List, Dict, Optional
import re
import random
import time
class WikipediaAPI:
    """Asynchronous client for the MediaWiki Action API of one Wikipedia edition.

    Read-only requests (search, page content, summary, random articles) are
    all issued through ``_make_request``, which applies a concurrency limit,
    a minimum delay between requests and bounded retries on HTTP 429.
    """

    # Maximum number of retries after an HTTP 429 before giving up.
    MAX_RETRIES = 3
    # Seconds to wait after an HTTP 429 when Retry-After is absent or unusable.
    DEFAULT_RETRY_AFTER = 5

    def __init__(self, language: str = "zh", user_agent: Optional[str] = None,
                 max_concurrent: int = 5, request_delay: float = 0.5):
        """Initialise the Wikipedia API client.

        Args:
            language: Language code of the Wikipedia edition
                (zh: Chinese, en: English, ja: Japanese, ...).
            user_agent: User-Agent header value; a default is used when None.
            max_concurrent: Maximum number of concurrent requests.
            request_delay: Minimum interval between requests, in seconds.
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)"
        self.headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json"
        }
        # Concurrency control.
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.request_delay = request_delay
        self.last_request_time = 0

    @staticmethod
    def _parse_retry_after(value, default=DEFAULT_RETRY_AFTER):
        """Convert a Retry-After header value into a non-negative number of seconds.

        Args:
            value: Raw header value (may be None).
            default: Value returned when ``value`` is missing or is not an
                integer number of seconds (for example an HTTP-date).

        Returns:
            Seconds to wait before retrying, never negative.
        """
        if value is None:
            return default
        try:
            seconds = int(str(value).strip())
        except (TypeError, ValueError):
            # The header may also be an HTTP-date; fall back to the default
            # instead of crashing the request.
            return default
        return max(seconds, 0)

    async def _throttle(self):
        """Sleep so that requests respect ``request_delay``, plus a small random jitter."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.request_delay:
            await asyncio.sleep(self.request_delay - elapsed)
        # Random jitter to avoid a perfectly regular request pattern.
        await asyncio.sleep(random.uniform(0, 0.2))
        # Record the time of this request.
        self.last_request_time = time.time()

    async def _make_request(self, url, params=None):
        """Send a GET request with concurrency control and request delay.

        On HTTP 429 the request is retried up to ``MAX_RETRIES`` times. The
        wait before each retry happens after the semaphore slot has been
        released, so a throttled request never blocks the other slots and
        cannot deadlock when ``max_concurrent`` is 1.

        Args:
            url: Request URL.
            params: Query parameters.

        Returns:
            The decoded JSON response, or None when the request failed.
        """
        for attempt in range(self.MAX_RETRIES + 1):
            # The semaphore bounds the number of in-flight requests.
            async with self.semaphore:
                await self._throttle()
                try:
                    async with aiohttp.ClientSession(headers=self.headers) as session:
                        async with session.get(url, params=params) as response:
                            if response.status == 200:
                                return await response.json()
                            if response.status != 429:
                                print(f"API请求失败: HTTP {response.status}")
                                print(f"响应内容: {await response.text()}")
                                return None
                            # 429 Too Many Requests: remember how long to wait.
                            retry_after = self._parse_retry_after(
                                response.headers.get('Retry-After'),
                                self.DEFAULT_RETRY_AFTER,
                            )
                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                    # asyncio.TimeoutError is not an aiohttp.ClientError, so it
                    # has to be listed explicitly.
                    print(f"请求错误: {str(e)}")
                    return None
            # Only the 429 path reaches this point; the slot is free again.
            if attempt < self.MAX_RETRIES:
                print(f"达到请求限制,等待 {retry_after} 秒后重试...")
                await asyncio.sleep(retry_after)
        print(f"达到请求限制,已重试 {self.MAX_RETRIES} 次,放弃请求")
        return None

    async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]:
        """Search Wikipedia articles.

        Args:
            query: Search keywords.
            limit: Number of results to return.
            namespace: Namespace (0 for articles, 14 for categories, ...).

        Returns:
            List of search results; empty when the request failed.
        """
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": limit,
            "srnamespace": namespace,
            "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle"
        }
        data = await self._make_request(self.base_url, params)
        if not data:
            return []
        return data.get("query", {}).get("search", [])

    async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict:
        """Fetch the parsed content of a Wikipedia page.

        Args:
            title: Page title.
            section: Section index (optional); when given only that section
                is requested.

        Returns:
            The ``parse`` object of the API response, or an empty dict when
            the request failed or the API reported an error.
        """
        params = {
            "action": "parse",
            "page": title,
            "format": "json",
            "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties"
        }
        # Restrict the request to one section when asked to.
        if section is not None:
            params["section"] = section
        data = await self._make_request(self.base_url, params)
        if not data:
            return {}
        if "error" in data:
            print(f"API错误: {data['error'].get('info', '未知错误')}")
            return {}
        return data.get("parse", {})

    async def get_summary(self, title: str, sentences: int = 3) -> str:
        """Fetch the plain-text introduction of a page.

        Args:
            title: Page title.
            sentences: Number of sentences to return.

        Returns:
            The summary text, or an empty string when unavailable.
        """
        params = {
            "action": "query",
            "prop": "extracts",
            "exintro": "1",
            "exsentences": sentences,
            "explaintext": "1",
            "titles": title,
            "format": "json"
        }
        data = await self._make_request(self.base_url, params)
        if not data:
            return ""
        pages = data.get("query", {}).get("pages", {})
        # Only the first page of the response is used.
        first_page = next(iter(pages.values()), None)
        if first_page is None:
            return ""
        return first_page.get("extract", "")

    async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]:
        """Fetch random articles.

        Args:
            count: Number of random articles wanted.
            namespace: Namespace.

        Returns:
            List of random articles; empty when the request failed.
        """
        params = {
            "action": "query",
            "list": "random",
            "rnlimit": count,
            "rnnamespace": namespace,
            "format": "json"
        }
        data = await self._make_request(self.base_url, params)
        if not data:
            return []
        return data.get("query", {}).get("random", [])

    async def login(self, username: str, password: str) -> bool:
        """Log in with a Wikipedia account.

        NOTE(review): the session used here is closed when this method
        returns, and ``_make_request`` opens a new session for every request,
        so cookies obtained by logging in are not reused by the other
        methods. Confirm whether a shared session is wanted.

        Args:
            username: Wikipedia user name.
            password: Wikipedia password.

        Returns:
            True when the API reports a successful login, False otherwise
            (including network errors).
        """
        try:
            # Token request and login must share one session so that the
            # cookies set by the first call are sent with the second.
            async with aiohttp.ClientSession(headers=self.headers) as session:
                # Fetch a login token.
                token_params = {
                    "action": "query",
                    "meta": "tokens",
                    "type": "login",
                    "format": "json"
                }
                async with session.get(self.base_url, params=token_params) as response:
                    if response.status != 200:
                        print(f"获取登录令牌失败: HTTP {response.status}")
                        return False
                    data = await response.json()
                login_token = data.get("query", {}).get("tokens", {}).get("logintoken")
                if not login_token:
                    print("获取登录令牌失败")
                    return False
                # Log in with the token.
                login_params = {
                    "action": "login",
                    "lgname": username,
                    "lgpassword": password,
                    "lgtoken": login_token,
                    "format": "json"
                }
                async with session.post(self.base_url, data=login_params) as login_response:
                    login_data = await login_response.json()
                if login_data.get("login", {}).get("result") == "Success":
                    print(f"登录成功: {username}")
                    return True
                print(f"登录失败: {login_data.get('login', {}).get('reason', '未知原因')}")
                return False
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # A network failure is reported as a failed login rather than an
            # exception, matching the bool contract of this method.
            print(f"请求错误: {str(e)}")
            return False

    async def setup_oauth(self, consumer_token: str, consumer_secret: str,
                          access_token: str = None, access_secret: str = None) -> bool:
        """Configure OAuth authentication.

        When both access credentials are given, an ``OAuth1`` object is
        stored on ``self.auth``. Otherwise the handshake is initiated and the
        authorisation URL is printed; the flow is not completed here.

        Args:
            consumer_token: Consumer token.
            consumer_secret: Consumer secret.
            access_token: Access token (optional).
            access_secret: Access secret (optional).

        Returns:
            True only when access credentials were supplied and stored.
        """
        try:
            # Requires the mwoauth package: pip install mwoauth
            import mwoauth
            import requests_oauthlib
            self.consumer_token = consumer_token
            self.consumer_secret = consumer_secret
            if access_token and access_secret:
                # Access credentials are already available.
                self.auth = requests_oauthlib.OAuth1(
                    consumer_token,
                    consumer_secret,
                    access_token,
                    access_secret
                )
                print("OAuth设置成功")
                return True
            # An access token must be obtained first (this normally needs the
            # user to authorise the application in a browser).
            print("请在开发环境中完成以下OAuth授权流程:")
            consumer = mwoauth.Consumer(
                consumer_token, consumer_secret
            )
            # Start the handshake. The request token is not used further
            # because the flow stops after printing the URL.
            redirect, _request_token = mwoauth.initiate(
                f"https://{self.language}.wikipedia.org/w/index.php",
                consumer
            )
            print(f"请访问此URL授权应用: {redirect}")
            # A real application would now ask the user for the verification
            # code and complete the authorisation flow.
            return False
        except ImportError:
            print("请安装 mwoauth 库: pip install mwoauth")
            return False
        except Exception as e:
            print(f"设置OAuth时发生错误: {str(e)}")
            return False
async def example_usage():
    """Run a short tour of the WikipediaAPI client and print what it returns."""
    # Client for the Chinese-language Wikipedia.
    client = WikipediaAPI(language="zh")
    tag_pattern = re.compile(r'<.*?>')

    def strip_tags(markup):
        # Drop HTML tags so the text is readable on a console.
        return tag_pattern.sub('', markup)

    try:
        # Example 1: basic search.
        print("\n=== 示例1: 搜索维基百科 ===")
        hits = await client.search("人工智能", limit=3)
        for rank, hit in enumerate(hits, 1):
            print(f"\n--- 结果 {rank} ---")
            print(f"标题: {hit.get('title')}")
            plain_snippet = strip_tags(hit.get('snippet', ''))
            print(f"摘要: {plain_snippet}")
            print(f"字数: {hit.get('wordcount')}")
            print(f"大小: {hit.get('size')} 字节")

        # Example 2: page summary.
        print("\n=== 示例2: 获取页面摘要 ===")
        dl_summary = await client.get_summary("深度学习", sentences=2)
        print(f"深度学习摘要: {dl_summary}")

        # Example 3: full page content.
        print("\n=== 示例3: 获取页面内容 ===")
        page = await client.get_page_content("机器学习")
        if page and "text" in page:
            page_text = strip_tags(page["text"].get("*", ""))
            print(f"机器学习页面内容片段: {page_text[:200]}...")
            # Number of categories on the page.
            category_list = page.get("categories", [])
            print(f"分类数量: {len(category_list)}")
            # Number of links on the page.
            link_list = page.get("links", [])
            print(f"链接数量: {len(link_list)}")

        # Example 4: a single section (section 0 is requested as the lead).
        print("\n=== 示例4: 获取特定章节内容 ===")
        lead = await client.get_page_content("人工智能", section=0)
        if lead and "text" in lead:
            lead_text = strip_tags(lead["text"].get("*", ""))
            print(f"人工智能引言内容片段: {lead_text[:200]}...")

        # Example 5: random articles, each with a one-sentence summary.
        print("\n=== 示例5: 获取随机文章 ===")
        picks = await client.get_random_articles(count=2)
        print("随机文章:")
        for position, pick in enumerate(picks, 1):
            print(f"{position}. {pick.get('title')}")
            pick_summary = await client.get_summary(pick.get('title'), sentences=1)
            print(f" 摘要: {pick_summary[:100]}...")
    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
if __name__ == "__main__":
    # Script entry point: run the demo coroutine to completion.
    asyncio.run(example_usage())