* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
387 lines
14 KiB
Python
387 lines
14 KiB
Python
import aiohttp
|
||
import asyncio
|
||
from typing import List, Dict, Optional
|
||
import re
|
||
import random
|
||
import time
|
||
|
||
class WikipediaAPI:
    """Asynchronous client for the MediaWiki action API of one Wikipedia edition.

    All read-only calls (`search`, `get_page_content`, `get_summary`,
    `get_random_articles`) are funnelled through `_make_request`, so they share
    the same concurrency limit, request spacing, 429 back-off and error
    handling. Failures are reported with `print` and surface to callers as an
    empty result rather than as an exception.
    """

    # Seconds to wait after HTTP 429 when Retry-After is absent or unparsable.
    DEFAULT_RETRY_AFTER = 5

    def __init__(self, language: str = "zh", user_agent: Optional[str] = None,
                 max_concurrent: int = 5, request_delay: float = 0.5):
        """
        Initialise the client.

        Args:
            language: Language code used as the wikipedia.org sub-domain
                (e.g. "zh", "en", "ja").
            user_agent: Value for the User-Agent header; a built-in default is
                used when None or empty.
            max_concurrent: Maximum number of requests in flight at once.
            request_delay: Minimum spacing between requests, in seconds.
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)"
        self.headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json"
        }
        # Concurrency control shared by every call that uses _make_request.
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.request_delay = request_delay
        self.last_request_time = 0

    @staticmethod
    def _parse_retry_after(value, default: int = 5) -> int:
        """Convert a Retry-After header value into a non-negative second count.

        The header may be missing or may hold an HTTP-date instead of a number;
        in both cases `default` is returned instead of raising.
        """
        if value is None:
            return default
        try:
            seconds = int(value.strip())
        except (AttributeError, ValueError):
            return default
        return max(seconds, 0)

    async def _wait_for_slot(self):
        """Sleep until `request_delay` has elapsed since the last request, plus jitter."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.request_delay:
            await asyncio.sleep(self.request_delay - elapsed)

        # Small random delay so requests do not follow a regular pattern.
        await asyncio.sleep(random.uniform(0, 0.2))

        # Record when this request is about to be sent.
        self.last_request_time = time.time()

    async def _make_request(self, url, params=None, max_retries: int = 3):
        """
        Send a GET request with concurrency control, spacing and 429 back-off.

        Args:
            url: Request URL.
            params: Query parameters.
            max_retries: How many times to retry after HTTP 429 before giving up.

        Returns:
            The decoded JSON response, or None on a non-200 status, a client
            error, a timeout, or when the 429 retries are exhausted.
        """
        attempts = max(max_retries, 0) + 1
        for attempt in range(attempts):
            async with self.semaphore:
                await self._wait_for_slot()
                try:
                    async with aiohttp.ClientSession(headers=self.headers) as session:
                        async with session.get(url, params=params) as response:
                            if response.status == 200:
                                return await response.json()

                            if response.status != 429:
                                print(f"API请求失败: HTTP {response.status}")
                                print(f"响应内容: {await response.text()}")
                                return None

                            # 429 Too Many Requests: remember how long to wait.
                            retry_after = self._parse_retry_after(
                                response.headers.get('Retry-After'),
                                self.DEFAULT_RETRY_AFTER,
                            )
                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                    print(f"请求错误: {str(e)}")
                    return None

            # The semaphore and the session are released before backing off, so
            # a waiting retry never holds a permit that another request (or its
            # own next attempt) needs.
            if attempt + 1 >= attempts:
                break
            print(f"达到请求限制,等待 {retry_after} 秒后重试...")
            await asyncio.sleep(retry_after)

        print(f"API请求失败: HTTP 429 (已重试 {max(max_retries, 0)} 次)")
        return None

    async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]:
        """
        Search Wikipedia articles.

        Args:
            query: Search keywords.
            limit: Number of results to request.
            namespace: Namespace to search (0 for articles, 14 for categories, ...).

        Returns:
            List of search result dicts; empty when the request fails.
        """
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": limit,
            "srnamespace": namespace,
            "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle"
        }

        data = await self._make_request(self.base_url, params)
        if not data:
            return []

        return data.get("query", {}).get("search", [])

    async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict:
        """
        Fetch the parsed content of a page.

        Args:
            title: Page title.
            section: Section number to restrict the result to (optional).

        Returns:
            The "parse" object of the API response; an empty dict when the
            request fails or the API reports an error.
        """
        params = {
            "action": "parse",
            "page": title,
            "format": "json",
            "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties"
        }

        # When a section is given, only that section is requested.
        if section is not None:
            params["section"] = section

        data = await self._make_request(self.base_url, params)
        if not data:
            return {}

        if "error" in data:
            print(f"API错误: {data['error'].get('info', '未知错误')}")
            return {}

        return data.get("parse", {})

    async def get_summary(self, title: str, sentences: int = 3) -> str:
        """
        Fetch the plain-text introduction of a page.

        Args:
            title: Page title.
            sentences: Number of sentences to request.

        Returns:
            The extract of the first page in the response; an empty string when
            the request fails or no page is returned.
        """
        params = {
            "action": "query",
            "prop": "extracts",
            "exintro": "1",
            "exsentences": sentences,
            "explaintext": "1",
            "titles": title,
            "format": "json"
        }

        data = await self._make_request(self.base_url, params)
        if not data:
            return ""

        pages = data.get("query", {}).get("pages", {})
        # Only the first page entry is used.
        first_page = next(iter(pages.values()), None)
        if not first_page:
            return ""
        return first_page.get("extract", "")

    async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]:
        """
        Fetch random articles.

        Args:
            count: Number of random articles to request.
            namespace: Namespace to draw from.

        Returns:
            List of random article dicts; empty when the request fails.
        """
        params = {
            "action": "query",
            "list": "random",
            "rnlimit": count,
            "rnnamespace": namespace,
            "format": "json"
        }

        data = await self._make_request(self.base_url, params)
        if not data:
            return []

        return data.get("query", {}).get("random", [])

    async def login(self, username: str, password: str) -> bool:
        """
        Log in with a Wikipedia account.

        The login token request and the login POST share one session because
        the token is tied to that session's cookies.

        NOTE(review): the session (and its cookies) is closed when this method
        returns, and every other method opens its own session, so a successful
        login here does not authenticate later requests made by this client.

        Args:
            username: Wikipedia user name.
            password: Wikipedia password.

        Returns:
            True when the API reports "Success", otherwise False.
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            # Step 1: obtain a login token.
            token_params = {
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json"
            }

            async with session.get(self.base_url, params=token_params) as response:
                if response.status != 200:
                    print(f"获取登录令牌失败: HTTP {response.status}")
                    return False

                data = await response.json()
                login_token = data.get("query", {}).get("tokens", {}).get("logintoken")

            if not login_token:
                print("获取登录令牌失败")
                return False

            # Step 2: log in with the token.
            login_params = {
                "action": "login",
                "lgname": username,
                "lgpassword": password,
                "lgtoken": login_token,
                "format": "json"
            }

            async with session.post(self.base_url, data=login_params) as login_response:
                # Mirror the status check of the token request instead of
                # trying to parse an error page as JSON.
                if login_response.status != 200:
                    print(f"登录失败: HTTP {login_response.status}")
                    return False
                login_data = await login_response.json()

            login_result = login_data.get("login", {})
            if login_result.get("result") == "Success":
                print(f"登录成功: {username}")
                return True

            print(f"登录失败: {login_result.get('reason', '未知原因')}")
            return False

    async def setup_oauth(self, consumer_token: str, consumer_secret: str,
                          access_token: Optional[str] = None,
                          access_secret: Optional[str] = None) -> bool:
        """
        Configure OAuth credentials.

        With both access credentials, an OAuth1 signer is stored on
        `self.auth`. Without them, the handshake is only started and the
        authorisation URL printed; completing it is left to the caller.

        NOTE(review): no request method in this class reads `self.auth`.

        Args:
            consumer_token: Consumer token.
            consumer_secret: Consumer secret.
            access_token: Access token (optional).
            access_secret: Access secret (optional).

        Returns:
            True only when access credentials were supplied and stored;
            False otherwise, including when the OAuth libraries are missing.
        """
        try:
            # Requires the mwoauth library: pip install mwoauth
            import mwoauth
            import requests_oauthlib

            self.consumer_token = consumer_token
            self.consumer_secret = consumer_secret

            if access_token and access_secret:
                # Access credentials already available: build the signer directly.
                self.auth = requests_oauthlib.OAuth1(
                    consumer_token,
                    consumer_secret,
                    access_token,
                    access_secret
                )
                print("OAuth设置成功")
                return True

            # No access credentials yet: the user has to authorise in a browser.
            print("请在开发环境中完成以下OAuth授权流程:")

            consumer = mwoauth.Consumer(
                consumer_token, consumer_secret
            )

            # Start the handshake to obtain the authorisation URL.
            redirect, request_token = mwoauth.initiate(
                f"https://{self.language}.wikipedia.org/w/index.php",
                consumer
            )

            print(f"请访问此URL授权应用: {redirect}")
            # A real application would now collect the verifier from the user
            # and complete the handshake; that flow is not implemented here.
            return False
        except ImportError:
            print("请安装 mwoauth 库: pip install mwoauth")
            return False
        except Exception as e:
            print(f"设置OAuth时发生错误: {str(e)}")
            return False
|
||
|
||
async def example_usage():
    """Walk through the main WikipediaAPI calls and print what each returns."""
    # Client bound to the Chinese-language Wikipedia (the default).
    client = WikipediaAPI(language="zh")

    def without_tags(markup):
        # Remove anything that looks like an HTML tag so the text prints cleanly.
        return re.sub(r'<.*?>', '', markup)

    try:
        # Example 1: basic search
        print("\n=== 示例1: 搜索维基百科 ===")
        hits = await client.search("人工智能", limit=3)

        for position, hit in enumerate(hits, 1):
            print(f"\n--- 结果 {position} ---")
            print(f"标题: {hit.get('title')}")
            plain_snippet = without_tags(hit.get('snippet', ''))
            print(f"摘要: {plain_snippet}")
            print(f"字数: {hit.get('wordcount')}")
            print(f"大小: {hit.get('size')} 字节")

        # Example 2: page summary
        print("\n=== 示例2: 获取页面摘要 ===")
        dl_summary = await client.get_summary("深度学习", sentences=2)
        print(f"深度学习摘要: {dl_summary}")

        # Example 3: full page content
        print("\n=== 示例3: 获取页面内容 ===")
        page = await client.get_page_content("机器学习")
        if page and "text" in page:
            # Strip the HTML so the excerpt is readable on a console.
            page_plain = without_tags(page["text"].get("*", ""))
            print(f"机器学习页面内容片段: {page_plain[:200]}...")

            # How many categories the page belongs to.
            page_categories = page.get("categories", [])
            print(f"分类数量: {len(page_categories)}")

            # How many links the page contains.
            page_links = page.get("links", [])
            print(f"链接数量: {len(page_links)}")

        # Example 4: a single section
        print("\n=== 示例4: 获取特定章节内容 ===")
        # Section 0 is normally the lead/introduction.
        lead = await client.get_page_content("人工智能", section=0)
        if lead and "text" in lead:
            lead_plain = without_tags(lead["text"].get("*", ""))
            print(f"人工智能引言内容片段: {lead_plain[:200]}...")

        # Example 5: random articles
        print("\n=== 示例5: 获取随机文章 ===")
        picks = await client.get_random_articles(count=2)
        print("随机文章:")
        for position, pick in enumerate(picks, 1):
            print(f"{position}. {pick.get('title')}")

            # Short one-sentence summary of each random article.
            article_summary = await client.get_summary(pick.get('title'), sentences=1)
            print(f" 摘要: {article_summary[:100]}...")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
|
||
|
||
if __name__ == "__main__":
    # Run the demo only when the file is executed as a script. asyncio is
    # already imported at module level, so it is not re-imported here.
    asyncio.run(example_usage())