Master 4.0 (#2210)

* stage academic conversation
* stage document conversation
* fix buggy gradio version
* dynamic file loading
* merge more academic plugins
* accelerate nltk
* feat: add file and URL reading to the predict function
  - add URL detection and web-page content extraction, with automatic extraction of page text
  - add file-path recognition and file-content reading, supporting the private_upload path format
  - integrate WebTextExtractor for web-page content extraction
  - integrate TextContentLoader for local file reading
  - handle combined file-path-plus-question inputs intelligently
* back
* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
0
crazy_functions/paper_fns/__init__.py
Normal file
386
crazy_functions/paper_fns/auto_git/handlers/base_handler.py
Normal file
@@ -0,0 +1,386 @@
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from ..query_analyzer import SearchCriteria
from ..sources.github_source import GitHubSource
import asyncio
import re
from datetime import datetime


class BaseHandler(ABC):
    """Base class for search handlers."""

    def __init__(self, github: GitHubSource, llm_kwargs: Dict = None):
        self.github = github
        self.llm_kwargs = llm_kwargs or {}
        self.ranked_repos = []  # Stores the ranked repository list

    def _get_search_params(self, plugin_kwargs: Dict) -> Dict:
        """Collect search parameters."""
        return {
            'max_repos': plugin_kwargs.get('max_repos', 150),  # Maximum number of repositories (raised from 30 to 150)
            'max_details': plugin_kwargs.get('max_details', 80),  # Maximum number of repositories shown with details (new parameter)
            'search_multiplier': plugin_kwargs.get('search_multiplier', 3),  # Search multiplier
            'min_stars': plugin_kwargs.get('min_stars', 0),  # Minimum star count
        }

    @abstractmethod
    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle the query."""
        pass

    async def _search_repositories(self, query: str, language: str = None, min_stars: int = 0,
                                   sort: str = "stars", per_page: int = 30) -> List[Dict]:
        """Search repositories."""
        try:
            # Build the query string
            if min_stars > 0 and "stars:>" not in query:
                query += f" stars:>{min_stars}"

            if language and "language:" not in query:
                query += f" language:{language}"

            # Execute the search
            result = await self.github.search_repositories(
                query=query,
                sort=sort,
                per_page=per_page
            )

            if result and "items" in result:
                return result["items"]
            return []
        except Exception as e:
            print(f"仓库搜索出错: {str(e)}")
            return []

    async def _search_bilingual_repositories(self, english_query: str, chinese_query: str, language: str = None, min_stars: int = 0,
                                             sort: str = "stars", per_page: int = 30) -> List[Dict]:
        """Search English and Chinese repositories in one pass and merge the results."""
        try:
            # Search English repositories
            english_results = await self._search_repositories(
                query=english_query,
                language=language,
                min_stars=min_stars,
                sort=sort,
                per_page=per_page
            )

            # Search Chinese repositories
            chinese_results = await self._search_repositories(
                query=chinese_query,
                language=language,
                min_stars=min_stars,
                sort=sort,
                per_page=per_page
            )

            # Merge the results, removing duplicates
            merged_results = []
            seen_repos = set()

            # English results take priority
            for repo in english_results:
                repo_id = repo.get('id')
                if repo_id and repo_id not in seen_repos:
                    seen_repos.add(repo_id)
                    merged_results.append(repo)

            # Append Chinese results (skipping duplicates)
            for repo in chinese_results:
                repo_id = repo.get('id')
                if repo_id and repo_id not in seen_repos:
                    seen_repos.add(repo_id)
                    merged_results.append(repo)

            # Re-sort by star count
            merged_results.sort(key=lambda x: x.get('stargazers_count', 0), reverse=True)

            return merged_results[:per_page]  # Return the top per_page merged results
        except Exception as e:
            print(f"双语仓库搜索出错: {str(e)}")
            return []

    async def _search_code(self, query: str, language: str = None, per_page: int = 30) -> List[Dict]:
        """Search code."""
        try:
            # Build the query string
            if language and "language:" not in query:
                query += f" language:{language}"

            # Execute the search
            result = await self.github.search_code(
                query=query,
                per_page=per_page
            )

            if result and "items" in result:
                return result["items"]
            return []
        except Exception as e:
            print(f"代码搜索出错: {str(e)}")
            return []

    async def _search_bilingual_code(self, english_query: str, chinese_query: str, language: str = None, per_page: int = 30) -> List[Dict]:
        """Search English and Chinese code in one pass and merge the results."""
        try:
            # Search English code
            english_results = await self._search_code(
                query=english_query,
                language=language,
                per_page=per_page
            )

            # Search Chinese code
            chinese_results = await self._search_code(
                query=chinese_query,
                language=language,
                per_page=per_page
            )

            # Merge the results, removing duplicates
            merged_results = []
            seen_files = set()

            # English results take priority
            for item in english_results:
                # Use the file URL as the unique key
                file_url = item.get('html_url', '')
                if file_url and file_url not in seen_files:
                    seen_files.add(file_url)
                    merged_results.append(item)

            # Append Chinese results (skipping duplicates)
            for item in chinese_results:
                file_url = item.get('html_url', '')
                if file_url and file_url not in seen_files:
                    seen_files.add(file_url)
                    merged_results.append(item)

            # Sort so that the best matches come first.
            # Relevance scores are not available, so the repository star count is used as a proxy.
            merged_results.sort(key=lambda x: x.get('repository', {}).get('stargazers_count', 0), reverse=True)

            return merged_results[:per_page]  # Return the top per_page merged results
        except Exception as e:
            print(f"双语代码搜索出错: {str(e)}")
            return []

    async def _search_users(self, query: str, per_page: int = 30) -> List[Dict]:
        """Search users."""
        try:
            result = await self.github.search_users(
                query=query,
                per_page=per_page
            )

            if result and "items" in result:
                return result["items"]
            return []
        except Exception as e:
            print(f"用户搜索出错: {str(e)}")
            return []

    async def _search_bilingual_users(self, english_query: str, chinese_query: str, per_page: int = 30) -> List[Dict]:
        """Search English and Chinese users in one pass and merge the results."""
        try:
            # Search English users
            english_results = await self._search_users(
                query=english_query,
                per_page=per_page
            )

            # Search Chinese users
            chinese_results = await self._search_users(
                query=chinese_query,
                per_page=per_page
            )

            # Merge the results, removing duplicates
            merged_results = []
            seen_users = set()

            # English results take priority
            for user in english_results:
                user_id = user.get('id')
                if user_id and user_id not in seen_users:
                    seen_users.add(user_id)
                    merged_results.append(user)

            # Append Chinese results (skipping duplicates)
            for user in chinese_results:
                user_id = user.get('id')
                if user_id and user_id not in seen_users:
                    seen_users.add(user_id)
                    merged_results.append(user)

            # Sort by follower count
            merged_results.sort(key=lambda x: x.get('followers', 0), reverse=True)

            return merged_results[:per_page]  # Return the top per_page merged results
        except Exception as e:
            print(f"双语用户搜索出错: {str(e)}")
            return []

    async def _search_topics(self, query: str, per_page: int = 30) -> List[Dict]:
        """Search topics."""
        try:
            result = await self.github.search_topics(
                query=query,
                per_page=per_page
            )

            if result and "items" in result:
                return result["items"]
            return []
        except Exception as e:
            print(f"主题搜索出错: {str(e)}")
            return []

    async def _search_bilingual_topics(self, english_query: str, chinese_query: str, per_page: int = 30) -> List[Dict]:
        """Search English and Chinese topics in one pass and merge the results."""
        try:
            # Search English topics
            english_results = await self._search_topics(
                query=english_query,
                per_page=per_page
            )

            # Search Chinese topics
            chinese_results = await self._search_topics(
                query=chinese_query,
                per_page=per_page
            )

            # Merge the results, removing duplicates
            merged_results = []
            seen_topics = set()

            # English results take priority
            for topic in english_results:
                topic_name = topic.get('name')
                if topic_name and topic_name not in seen_topics:
                    seen_topics.add(topic_name)
                    merged_results.append(topic)

            # Append Chinese results (skipping duplicates)
            for topic in chinese_results:
                topic_name = topic.get('name')
                if topic_name and topic_name not in seen_topics:
                    seen_topics.add(topic_name)
                    merged_results.append(topic)

            # Sort by popularity when the field is available
            if merged_results and 'featured' in merged_results[0]:
                merged_results.sort(key=lambda x: x.get('featured', False), reverse=True)

            return merged_results[:per_page]  # Return the top per_page merged results
        except Exception as e:
            print(f"双语主题搜索出错: {str(e)}")
            return []

    async def _get_repo_details(self, repos: List[Dict]) -> List[Dict]:
        """Fetch detailed information for each repository."""
        enhanced_repos = []

        for repo in repos:
            try:
                # Fetch README information
                owner = repo.get('owner', {}).get('login') if repo.get('owner') is not None else None
                repo_name = repo.get('name')

                if owner and repo_name:
                    readme = await self.github.get_repo_readme(owner, repo_name)
                    if readme and "decoded_content" in readme:
                        # Take the first 1000 characters of the README as an excerpt
                        repo['readme_excerpt'] = readme["decoded_content"][:1000] + "..."

                    # Fetch language usage
                    languages = await self.github.get_repository_languages(owner, repo_name)
                    if languages:
                        repo['languages_detail'] = languages

                    # Fetch the latest release
                    releases = await self.github.get_repo_releases(owner, repo_name, per_page=1)
                    if releases and len(releases) > 0:
                        repo['latest_release'] = releases[0]

                    # Fetch topic tags
                    topics = await self.github.get_repo_topics(owner, repo_name)
                    if topics and "names" in topics:
                        repo['topics'] = topics["names"]

                enhanced_repos.append(repo)
            except Exception as e:
                print(f"获取仓库 {repo.get('full_name')} 详情时出错: {str(e)}")
                enhanced_repos.append(repo)  # Keep the original repository info

        return enhanced_repos

    def _format_repos(self, repos: List[Dict]) -> str:
        """Format the repository list."""
        formatted = []

        for i, repo in enumerate(repos, 1):
            # Build the repository URL
            repo_url = repo.get('html_url', '')

            # Build the full reference
            reference = (
                f"{i}. **{repo.get('full_name', '')}**\n"
                f" - 描述: {repo.get('description', 'N/A')}\n"
                f" - 语言: {repo.get('language', 'N/A')}\n"
                f" - 星标: {repo.get('stargazers_count', 0)}\n"
                f" - Fork数: {repo.get('forks_count', 0)}\n"
                f" - 更新时间: {repo.get('updated_at', 'N/A')[:10]}\n"
                f" - 创建时间: {repo.get('created_at', 'N/A')[:10]}\n"
                f" - URL: <a href='{repo_url}' target='_blank'>{repo_url}</a>\n"
            )

            # Append topic tags (if any)
            if repo.get('topics'):
                topics_str = ", ".join(repo.get('topics'))
                reference += f" - 主题标签: {topics_str}\n"

            # Append the latest release (if any)
            if repo.get('latest_release'):
                release = repo.get('latest_release')
                reference += f" - 最新版本: {release.get('tag_name', 'N/A')} ({release.get('published_at', 'N/A')[:10]})\n"

            # Append a README excerpt (if any)
            if repo.get('readme_excerpt'):
                # Truncate the README to the first 300 characters
                readme_short = repo.get('readme_excerpt')[:300].replace('\n', ' ')
                reference += f" - README摘要: {readme_short}...\n"

            formatted.append(reference)

        return "\n".join(formatted)

    def _generate_apology_prompt(self, criteria: SearchCriteria) -> str:
        """Generate an apology prompt."""
        return f"""很抱歉,我们未能找到与"{criteria.main_topic}"相关的GitHub项目。

可能的原因:
1. 搜索词过于具体或冷门
2. 星标数要求过高
3. 编程语言限制过于严格

建议解决方案:
1. 尝试使用更通用的关键词
2. 降低最低星标数要求
3. 移除或更改编程语言限制
请根据以上建议调整后重试。"""

    def _get_current_time(self) -> str:
        """Return the current date as a formatted string."""
        now = datetime.now()
        return now.strftime("%Y年%m月%d日")
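The bilingual helpers above all follow the same search / merge / dedupe / sort pattern, and concrete handlers only implement `handle`. A minimal sketch of how a subclass might drive them (illustrative only; `MyHandler` is hypothetical and not part of this commit):

class MyHandler(BaseHandler):
    # Hypothetical subclass, assuming the BaseHandler defined above.
    async def handle(self, criteria, chatbot, history, system_prompt,
                     llm_kwargs, plugin_kwargs) -> str:
        params = self._get_search_params(plugin_kwargs)
        repos = await self._search_bilingual_repositories(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            min_stars=params['min_stars'],
            per_page=params['max_repos'],
        )
        # Fall back to the apology prompt when nothing is found, as the
        # shipped handlers do.
        return self._format_repos(repos) if repos else self._generate_apology_prompt(criteria)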
156
crazy_functions/paper_fns/auto_git/handlers/code_handler.py
Normal file
@@ -0,0 +1,156 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from ..query_analyzer import SearchCriteria
import asyncio


class CodeSearchHandler(BaseHandler):
    """Handler for code search."""

    def __init__(self, github, llm_kwargs=None):
        super().__init__(github, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a code search request and return the final prompt."""

        search_params = self._get_search_params(plugin_kwargs)

        # Search code
        code_results = await self._search_bilingual_code(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            language=criteria.language,
            per_page=search_params['max_repos']
        )

        if not code_results:
            return self._generate_apology_prompt(criteria)

        # Fetch the content of each code file
        enhanced_code_results = await self._get_code_details(code_results[:search_params['max_details']])
        self.ranked_repos = [item["repository"] for item in enhanced_code_results if "repository" in item]

        if not enhanced_code_results:
            return self._generate_apology_prompt(criteria)

        # Build the final prompt
        current_time = self._get_current_time()
        final_prompt = f"""当前时间: {current_time}

基于用户对{criteria.main_topic}的查询,我找到了以下代码示例。

代码搜索结果:
{self._format_code_results(enhanced_code_results)}

请提供:

1. 对于搜索的"{criteria.main_topic}"主题的综合解释:
- 概念和原理介绍
- 常见实现方法和技术
- 最佳实践和注意事项

2. 对每个代码示例:
- 解释代码的主要功能和实现方式
- 分析代码质量、可读性和效率
- 指出代码中的亮点和潜在改进空间
- 说明代码的适用场景

3. 代码实现比较:
- 不同实现方法的优缺点
- 性能和可维护性分析
- 适用不同场景的实现建议

4. 学习建议:
- 理解和使用这些代码需要的背景知识
- 如何扩展或改进所展示的代码
- 进一步学习相关技术的资源

重要提示:
- 深入解释代码的核心逻辑和实现思路
- 提供专业、技术性的分析
- 优先关注代码的实现质量和技术价值
- 当代码实现有问题时,指出并提供改进建议
- 对于复杂代码,分解解释其组成部分
- 根据用户查询的具体问题提供针对性答案
- 所有链接请使用<a href='链接地址' target='_blank'>链接文本</a>格式,确保链接在新窗口打开

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    async def _get_code_details(self, code_results: List[Dict]) -> List[Dict]:
        """Fetch details for each code result."""
        enhanced_results = []

        for item in code_results:
            try:
                repo = item.get('repository', {})
                file_path = item.get('path', '')
                repo_name = repo.get('full_name', '')

                if repo_name and file_path:
                    owner, repo_name = repo_name.split('/')

                    # Fetch the file content
                    file_content = await self.github.get_file_content(owner, repo_name, file_path)
                    if file_content and "decoded_content" in file_content:
                        item['code_content'] = file_content["decoded_content"]

                    # Fetch basic repository information
                    repo_details = await self.github.get_repo(owner, repo_name)
                    if repo_details:
                        item['repository'] = repo_details

                enhanced_results.append(item)
            except Exception as e:
                print(f"获取代码详情时出错: {str(e)}")
                enhanced_results.append(item)  # Keep the original item

        return enhanced_results

    def _format_code_results(self, code_results: List[Dict]) -> str:
        """Format the code search results."""
        formatted = []

        for i, item in enumerate(code_results, 1):
            # Build the repository info
            repo = item.get('repository', {})
            repo_name = repo.get('full_name', 'N/A')
            repo_url = repo.get('html_url', '')
            stars = repo.get('stargazers_count', 0)
            # Guard against a null language field, which would break .lower() below
            language = repo.get('language') or 'N/A'

            # Build the file info
            file_path = item.get('path', 'N/A')
            file_url = item.get('html_url', '')

            # Build the code excerpt
            code_content = item.get('code_content', '')
            if code_content:
                # Show only the first 30 lines of code
                code_lines = code_content.split("\n")
                if len(code_lines) > 30:
                    displayed_code = "\n".join(code_lines[:30]) + "\n... (代码太长已截断) ..."
                else:
                    displayed_code = code_content
            else:
                displayed_code = "(代码内容获取失败)"

            reference = (
                f"### {i}. {file_path} (在 {repo_name} 中)\n\n"
                f"- **仓库**: <a href='{repo_url}' target='_blank'>{repo_name}</a> (⭐ {stars}, 语言: {language})\n"
                f"- **文件路径**: <a href='{file_url}' target='_blank'>{file_path}</a>\n\n"
                f"```{language.lower()}\n{displayed_code}\n```\n\n"
            )

            formatted.append(reference)

        return "\n".join(formatted)
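`_format_code_results` caps each excerpt at 30 lines before embedding it in the prompt. The same truncation rule in isolation, as a runnable sketch (the sample input is made up):

def truncate_code(code_content: str, max_lines: int = 30) -> str:
    # Mirrors the display rule in _format_code_results: keep at most
    # max_lines lines and mark the cut.
    lines = code_content.split("\n")
    if len(lines) > max_lines:
        return "\n".join(lines[:max_lines]) + "\n... (truncated) ..."
    return code_content

sample = "\n".join(f"line {i}" for i in range(100))  # hypothetical 100-line file
print(truncate_code(sample))  # prints lines 0-29 followed by the truncation marker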
192
crazy_functions/paper_fns/auto_git/handlers/repo_handler.py
Normal file
@@ -0,0 +1,192 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from ..query_analyzer import SearchCriteria
import asyncio


class RepositoryHandler(BaseHandler):
    """Handler for repository search."""

    def __init__(self, github, llm_kwargs=None):
        super().__init__(github, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a repository search request and return the final prompt."""

        search_params = self._get_search_params(plugin_kwargs)

        # Query for a specific repository
        if criteria.repo_id:
            try:
                owner, repo = criteria.repo_id.split('/')
                repo_details = await self.github.get_repo(owner, repo)
                if repo_details:
                    # Fetch recommended similar repositories
                    similar_repos = await self.github.get_repo_recommendations(criteria.repo_id, limit=5)

                    # Enrich with details
                    all_repos = [repo_details] + similar_repos
                    enhanced_repos = await self._get_repo_details(all_repos)

                    self.ranked_repos = enhanced_repos

                    # Build the final prompt
                    current_time = self._get_current_time()
                    final_prompt = self._build_repo_detail_prompt(enhanced_repos[0], enhanced_repos[1:], current_time)
                    return final_prompt
                else:
                    return self._generate_apology_prompt(criteria)
            except Exception as e:
                print(f"处理特定仓库时出错: {str(e)}")
                return self._generate_apology_prompt(criteria)

        # General repository search
        repos = await self._search_bilingual_repositories(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            language=criteria.language,
            min_stars=criteria.min_stars,
            per_page=search_params['max_repos']
        )

        if not repos:
            return self._generate_apology_prompt(criteria)

        # Fetch repository details
        enhanced_repos = await self._get_repo_details(repos[:search_params['max_details']])  # Honor the max_details parameter
        self.ranked_repos = enhanced_repos

        if not enhanced_repos:
            return self._generate_apology_prompt(criteria)

        # Build the final prompt
        current_time = self._get_current_time()
        final_prompt = f"""当前时间: {current_time}

基于用户对{criteria.main_topic}的兴趣,以下是相关的GitHub仓库。

可供推荐的GitHub仓库:
{self._format_repos(enhanced_repos)}

请提供:
1. 按功能、用途或成熟度对仓库进行分组

2. 对每个仓库:
- 简要描述其主要功能和用途
- 分析其技术特点和优势
- 说明其适用场景和使用难度
- 指出其与同类产品相比的独特优势
- 解释其星标数量和活跃度代表的意义

3. 使用建议:
- 新手最适合入门的仓库
- 生产环境中最稳定可靠的选择
- 最新技术栈或创新方案的代表
- 学习特定技术的最佳资源

4. 相关资源:
- 学习这些项目需要的前置知识
- 项目间的关联和技术栈兼容性
- 可能的使用组合方案

重要提示:
- 重点解释为什么每个仓库值得关注
- 突出项目间的关联性和差异性
- 考虑用户不同水平的需求(初学者vs专业人士)
- 在介绍项目时,使用<a href='链接' target='_blank'>文本</a>格式,确保链接在新窗口打开
- 根据仓库的活跃度、更新频率、维护状态提供使用建议
- 仅基于提供的信息,不要做无根据的猜测
- 在信息缺失或不明确时,坦诚说明

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    def _build_repo_detail_prompt(self, main_repo: Dict, similar_repos: List[Dict], current_time: str) -> str:
        """Build the detail prompt for a single repository."""

        # Extract the README excerpt
        readme_content = "未提供"
        if main_repo.get('readme_excerpt'):
            readme_content = main_repo.get('readme_excerpt')

        # Build the language distribution
        languages = main_repo.get('languages_detail', {})
        lang_distribution = []
        if languages:
            total = sum(languages.values())
            for lang, bytes_val in languages.items():
                percentage = (bytes_val / total) * 100
                lang_distribution.append(f"{lang}: {percentage:.1f}%")

        lang_str = "未知"
        if lang_distribution:
            lang_str = ", ".join(lang_distribution)

        # Build the final prompt
        prompt = f"""当前时间: {current_time}

## 主要仓库信息

### {main_repo.get('full_name')}

- **描述**: {main_repo.get('description', '未提供')}
- **星标数**: {main_repo.get('stargazers_count', 0)}
- **Fork数**: {main_repo.get('forks_count', 0)}
- **Watch数**: {main_repo.get('watchers_count', 0)}
- **Issues数**: {main_repo.get('open_issues_count', 0)}
- **语言分布**: {lang_str}
- **许可证**: {main_repo.get('license', {}).get('name', '未指定') if main_repo.get('license') is not None else '未指定'}
- **创建时间**: {main_repo.get('created_at', '')[:10]}
- **最近更新**: {main_repo.get('updated_at', '')[:10]}
- **主题标签**: {', '.join(main_repo.get('topics', ['无']))}
- **GitHub链接**: <a href='{main_repo.get('html_url')}' target='_blank'>链接</a>

### README摘要:
{readme_content}

## 类似仓库:
{self._format_repos(similar_repos)}

请提供以下内容:

1. **项目概述**
- 详细解释{main_repo.get('name', '')}项目的主要功能和用途
- 分析其技术特点、架构和实现原理
- 讨论其在所属领域的地位和影响力
- 评估项目成熟度和稳定性

2. **优势与特点**
- 与同类项目相比的独特优势
- 显著的技术创新或设计模式
- 值得学习或借鉴的代码实践

3. **使用场景**
- 最适合的应用场景
- 潜在的使用限制和注意事项
- 入门门槛和学习曲线评估
- 产品级应用的可行性分析

4. **资源与生态**
- 相关学习资源推荐
- 配套工具和库的建议
- 社区支持和活跃度评估

5. **类似项目对比**
- 与列出的类似项目的详细对比
- 不同场景下的最佳选择建议
- 潜在的互补使用方案

提示:所有链接请使用<a href='链接地址' target='_blank'>链接文本</a>格式,确保链接在新窗口打开。

请以专业、客观的技术分析角度回答,使用markdown格式提供结构化信息。
"""
        return prompt
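`_build_repo_detail_prompt` converts the byte counts returned by the repository languages endpoint into percentages. The same arithmetic in isolation (the byte counts below are made-up sample values):

languages = {"Python": 60000, "JavaScript": 30000, "Shell": 10000}  # sample byte counts
total = sum(languages.values())
lang_distribution = [f"{lang}: {bytes_val / total * 100:.1f}%"
                     for lang, bytes_val in languages.items()]
print(", ".join(lang_distribution))  # Python: 60.0%, JavaScript: 30.0%, Shell: 10.0%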
217
crazy_functions/paper_fns/auto_git/handlers/topic_handler.py
Normal file
@@ -0,0 +1,217 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from ..query_analyzer import SearchCriteria
import asyncio


class TopicHandler(BaseHandler):
    """Handler for topic search."""

    def __init__(self, github, llm_kwargs=None):
        super().__init__(github, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a topic search request and return the final prompt."""

        search_params = self._get_search_params(plugin_kwargs)

        # Search topics
        topics = await self._search_bilingual_topics(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            per_page=search_params['max_repos']
        )

        if not topics:
            # Fall back to searching repositories by topic
            search_query = criteria.github_params["query"]
            chinese_search_query = criteria.github_params["chinese_query"]
            if "topic:" not in search_query:
                search_query += " topic:" + criteria.main_topic.replace(" ", "-")
            if "topic:" not in chinese_search_query:
                chinese_search_query += " topic:" + criteria.main_topic.replace(" ", "-")

            repos = await self._search_bilingual_repositories(
                english_query=search_query,
                chinese_query=chinese_search_query,
                language=criteria.language,
                min_stars=criteria.min_stars,
                per_page=search_params['max_repos']
            )

            if not repos:
                return self._generate_apology_prompt(criteria)

            # Fetch repository details
            enhanced_repos = await self._get_repo_details(repos[:10])
            self.ranked_repos = enhanced_repos

            if not enhanced_repos:
                return self._generate_apology_prompt(criteria)

            # Build a topic-based repository list prompt
            current_time = self._get_current_time()
            final_prompt = f"""当前时间: {current_time}

基于用户对主题"{criteria.main_topic}"的查询,我找到了以下相关GitHub仓库。

主题相关仓库:
{self._format_repos(enhanced_repos)}

请提供:

1. 主题综述:
- "{criteria.main_topic}"主题的概述和重要性
- 该主题在技术领域中的应用和发展趋势
- 主题相关的主要技术栈和知识体系

2. 仓库分析:
- 按功能、技术栈或应用场景对仓库进行分类
- 每个仓库在该主题领域的定位和贡献
- 不同仓库间的技术路线对比

3. 学习路径建议:
- 初学者入门该主题的推荐仓库和学习顺序
- 进阶学习的关键仓库和技术要点
- 实际应用中的最佳实践选择

4. 技术生态分析:
- 该主题下的主流工具和库
- 社区活跃度和维护状况
- 与其他相关技术的集成方案

重要提示:
- 主题"{criteria.main_topic}"是用户查询的核心,请围绕此主题展开分析
- 注重仓库质量评估和使用建议
- 提供基于事实的客观技术分析
- 在介绍仓库时使用<a href='链接地址' target='_blank'>链接文本</a>格式,确保链接在新窗口打开
- 考虑不同技术水平用户的需求

使用markdown格式提供清晰的分节回复。
"""
            return final_prompt

        # If topics were found, fetch popular repositories under each topic
        topic_repos = []
        for topic in topics[:5]:  # Increased to 5 topics
            topic_name = topic.get('name', '')
            if topic_name:
                # Search repositories under this topic
                repos = await self._search_repositories(
                    query=f"topic:{topic_name}",
                    language=criteria.language,
                    min_stars=criteria.min_stars,
                    per_page=20  # At most 20 repositories per topic
                )

                if repos:
                    for repo in repos:
                        repo['topic_source'] = topic_name
                        topic_repos.append(repo)

        if not topic_repos:
            return self._generate_apology_prompt(criteria)

        # Fetch details for the top N repositories
        enhanced_repos = await self._get_repo_details(topic_repos[:search_params['max_details']])
        self.ranked_repos = enhanced_repos

        if not enhanced_repos:
            return self._generate_apology_prompt(criteria)

        # Build the final prompt
        current_time = self._get_current_time()
        final_prompt = f"""当前时间: {current_time}

基于用户对"{criteria.main_topic}"主题的查询,我找到了以下相关GitHub主题和仓库。

主题相关仓库:
{self._format_topic_repos(enhanced_repos)}

请提供:

1. 主题概述:
- 对"{criteria.main_topic}"相关主题的介绍和技术背景
- 这些主题在软件开发中的重要性和应用范围
- 主题间的关联性和技术演进路径

2. 精选仓库分析:
- 每个主题下最具代表性的仓库详解
- 仓库的技术亮点和创新点
- 使用场景和技术成熟度评估

3. 技术趋势分析:
- 基于主题和仓库活跃度的技术发展趋势
- 新兴解决方案和传统方案的对比
- 未来可能的技术方向预测

4. 实践建议:
- 不同应用场景下的最佳仓库选择
- 学习路径和资源推荐
- 实际项目中的应用策略

重要提示:
- 将分析重点放在主题的技术内涵和价值上
- 突出主题间的关联性和技术演进脉络
- 提供基于数据(星标数、更新频率等)的客观分析
- 考虑不同技术背景用户的需求
- 所有链接请使用<a href='链接地址' target='_blank'>链接文本</a>格式,确保链接在新窗口打开

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    def _format_topic_repos(self, repos: List[Dict]) -> str:
        """Format the repository list grouped by topic."""
        # Group by topic
        topics_dict = {}
        for repo in repos:
            topic = repo.get('topic_source', '其他')
            if topic not in topics_dict:
                topics_dict[topic] = []
            topics_dict[topic].append(repo)

        # Format the output
        formatted = []
        for topic, topic_repos in topics_dict.items():
            formatted.append(f"## 主题: {topic}\n")

            for i, repo in enumerate(topic_repos, 1):
                # Build the repository URL
                repo_url = repo.get('html_url', '')

                # Build the reference
                reference = (
                    f"{i}. **{repo.get('full_name', '')}**\n"
                    f" - 描述: {repo.get('description', 'N/A')}\n"
                    f" - 语言: {repo.get('language', 'N/A')}\n"
                    f" - 星标: {repo.get('stargazers_count', 0)}\n"
                    f" - Fork数: {repo.get('forks_count', 0)}\n"
                    f" - 更新时间: {repo.get('updated_at', 'N/A')[:10]}\n"
                    f" - URL: <a href='{repo_url}' target='_blank'>{repo_url}</a>\n"
                )

                # Append topic tags (if any)
                if repo.get('topics'):
                    topics_str = ", ".join(repo.get('topics'))
                    reference += f" - 主题标签: {topics_str}\n"

                # Append a README excerpt (if any)
                if repo.get('readme_excerpt'):
                    # Truncate the README to the first 200 characters
                    readme_short = repo.get('readme_excerpt')[:200].replace('\n', ' ')
                    reference += f" - README摘要: {readme_short}...\n"

                formatted.append(reference)

            formatted.append("\n")  # Blank line between topics

        return "\n".join(formatted)
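`_format_topic_repos` groups repositories by their `topic_source` tag with a plain dict. The same grouping written with `collections.defaultdict`, as a standalone sketch (the sample records are hypothetical):

from collections import defaultdict

repos = [{"full_name": "a/x", "topic_source": "nlp"},
         {"full_name": "b/y", "topic_source": "nlp"},
         {"full_name": "c/z"}]  # hypothetical sample records; the last has no topic_source

topics_dict = defaultdict(list)
for repo in repos:
    # Repos without a source topic fall into a catch-all bucket,
    # matching the '其他' default in the handler.
    topics_dict[repo.get("topic_source", "other")].append(repo)

print({k: [r["full_name"] for r in v] for k, v in topics_dict.items()})
# {'nlp': ['a/x', 'b/y'], 'other': ['c/z']}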
164
crazy_functions/paper_fns/auto_git/handlers/user_handler.py
Normal file
@@ -0,0 +1,164 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from ..query_analyzer import SearchCriteria
import asyncio


class UserSearchHandler(BaseHandler):
    """Handler for user search."""

    def __init__(self, github, llm_kwargs=None):
        super().__init__(github, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a user search request and return the final prompt."""

        search_params = self._get_search_params(plugin_kwargs)

        # Search users
        users = await self._search_bilingual_users(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            per_page=search_params['max_repos']
        )

        if not users:
            return self._generate_apology_prompt(criteria)

        # Fetch user details and repositories
        enhanced_users = await self._get_user_details(users[:search_params['max_details']])
        self.ranked_repos = []  # Collect each user's top repositories for display

        for user in enhanced_users:
            if user.get('top_repos'):
                self.ranked_repos.extend(user.get('top_repos'))

        if not enhanced_users:
            return self._generate_apology_prompt(criteria)

        # Build the final prompt
        current_time = self._get_current_time()
        final_prompt = f"""当前时间: {current_time}

基于用户对{criteria.main_topic}的查询,我找到了以下GitHub用户。

GitHub用户搜索结果:
{self._format_users(enhanced_users)}

请提供:

1. 用户综合分析:
- 各开发者的专业领域和技术专长
- 他们在GitHub开源社区的影响力
- 技术实力和项目质量评估

2. 对每位开发者:
- 其主要贡献领域和技术栈
- 代表性项目及其价值
- 编程风格和技术特点
- 在相关领域的影响力

3. 项目推荐:
- 针对用户查询的最有价值项目
- 值得学习和借鉴的代码实践
- 不同用户项目的相互补充关系

4. 如何学习和使用:
- 如何从这些开发者项目中学习
- 最适合入门学习的项目
- 进阶学习的路径建议

重要提示:
- 关注开发者的技术专长和核心贡献
- 分析其开源项目的技术价值
- 根据用户的原始查询提供相关建议
- 避免过度赞美或主观评价
- 基于事实数据(项目数、星标数等)进行客观分析
- 所有链接请使用<a href='链接地址' target='_blank'>链接文本</a>格式,确保链接在新窗口打开

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    async def _get_user_details(self, users: List[Dict]) -> List[Dict]:
        """Fetch details and repositories for each user."""
        enhanced_users = []

        for user in users:
            try:
                username = user.get('login')

                if username:
                    # Fetch user details
                    user_details = await self.github.get_user(username)
                    if user_details:
                        user.update(user_details)

                    # Fetch the user's repositories
                    repos = await self.github.get_user_repos(
                        username,
                        sort="stars",
                        per_page=10  # Increased to 10 repositories
                    )
                    if repos:
                        user['top_repos'] = repos

                enhanced_users.append(user)
            except Exception as e:
                print(f"获取用户 {user.get('login')} 详情时出错: {str(e)}")
                enhanced_users.append(user)  # Keep the original info

        return enhanced_users

    def _format_users(self, users: List[Dict]) -> str:
        """Format the user list."""
        formatted = []

        for i, user in enumerate(users, 1):
            # Build the user info
            username = user.get('login', 'N/A')
            name = user.get('name', username)
            profile_url = user.get('html_url', '')
            bio = user.get('bio', '无简介')
            followers = user.get('followers', 0)
            public_repos = user.get('public_repos', 0)
            company = user.get('company', '未指定')
            location = user.get('location', '未指定')
            blog = user.get('blog', '')

            user_info = (
                f"### {i}. {name} (@{username})\n\n"
                f"- **简介**: {bio}\n"
                f"- **关注者**: {followers} | **公开仓库**: {public_repos}\n"
                f"- **公司**: {company} | **地点**: {location}\n"
                f"- **个人网站**: {blog}\n"
                f"- **GitHub**: <a href='{profile_url}' target='_blank'>{username}</a>\n\n"
            )

            # Append the user's popular repositories
            top_repos = user.get('top_repos', [])
            if top_repos:
                user_info += "**热门仓库**:\n\n"
                for repo in top_repos:
                    repo_name = repo.get('name', '')
                    repo_url = repo.get('html_url', '')
                    repo_desc = repo.get('description', '无描述')
                    repo_stars = repo.get('stargazers_count', 0)
                    repo_language = repo.get('language', '未指定')

                    user_info += (
                        f"- <a href='{repo_url}' target='_blank'>{repo_name}</a> - ⭐ {repo_stars}, {repo_language}\n"
                        f"  {repo_desc}\n\n"
                    )

            formatted.append(user_info)

        return "\n".join(formatted)
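The handler flattens every user's `top_repos` into `self.ranked_repos` so downstream display code sees one list. The same aggregation in isolation (sample records assumed, not from the diff):

enhanced_users = [
    {"login": "alice", "top_repos": [{"name": "r1"}, {"name": "r2"}]},
    {"login": "bob"},  # no top_repos key -- the guard below skips this entry
]  # hypothetical sample data

ranked_repos = []
for user in enhanced_users:
    if user.get("top_repos"):
        ranked_repos.extend(user["top_repos"])

print([r["name"] for r in ranked_repos])  # ['r1', 'r2']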
356
crazy_functions/paper_fns/auto_git/query_analyzer.py
Normal file
@@ -0,0 +1,356 @@
from typing import Dict, List
from dataclasses import dataclass
import re


@dataclass
class SearchCriteria:
    """Search criteria."""
    query_type: str  # Query type: repo/code/user/topic
    main_topic: str  # Main topic
    sub_topics: List[str]  # List of sub-topics
    language: str  # Programming language
    min_stars: int  # Minimum star count
    github_params: Dict  # GitHub search parameters
    original_query: str = ""  # Original query string
    repo_id: str = ""  # Specific repository ID or name


class QueryAnalyzer:
    """Query analyzer."""

    # Response index constants
    BASIC_QUERY_INDEX = 0
    GITHUB_QUERY_INDEX = 1

    def __init__(self):
        self.valid_types = {
            "repo": ["repository", "project", "library", "framework", "tool"],
            "code": ["code", "snippet", "implementation", "function", "class", "algorithm"],
            "user": ["user", "developer", "organization", "contributor", "maintainer"],
            "topic": ["topic", "category", "tag", "field", "area", "domain"]
        }

    def analyze_query(self, query: str, chatbot: List, llm_kwargs: Dict):
        """Analyze the query intent."""
        from crazy_functions.crazy_utils import \
            request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt

        # 1. Basic query analysis
        type_prompt = f"""请分析这个与GitHub相关的查询,并严格按照以下XML格式回答:

查询: {query}

说明:
1. 你的回答必须使用下面显示的XML标签,不要有任何标签外的文本
2. 从以下选项中选择查询类型: repo/code/user/topic
- repo: 用于查找仓库、项目、框架或库
- code: 用于查找代码片段、函数实现或算法
- user: 用于查找用户、开发者或组织
- topic: 用于查找主题、类别或领域相关项目
3. 识别主题和子主题
4. 识别首选编程语言(如果有)
5. 确定最低星标数(如果适用)

必需格式:
<query_type>此处回答</query_type>
<main_topic>此处回答</main_topic>
<sub_topics>子主题1, 子主题2, ...</sub_topics>
<language>此处回答</language>
<min_stars>此处回答</min_stars>

示例回答:

1. 仓库查询:
查询: "查找有至少1000颗星的Python web框架"
<query_type>repo</query_type>
<main_topic>web框架</main_topic>
<sub_topics>后端开发, HTTP服务器, ORM</sub_topics>
<language>Python</language>
<min_stars>1000</min_stars>

2. 代码查询:
查询: "如何用JavaScript实现防抖函数"
<query_type>code</query_type>
<main_topic>防抖函数</main_topic>
<sub_topics>事件处理, 性能优化, 函数节流</sub_topics>
<language>JavaScript</language>
<min_stars>0</min_stars>"""

        # 2. Generate English search criteria
        github_prompt = f"""Optimize the following GitHub search query:

Query: {query}

Task: Convert the natural language query into an optimized GitHub search query.
Please use English, regardless of the language of the input query.

Available search fields and filters:
1. Basic fields:
- in:name - Search in repository names
- in:description - Search in repository descriptions
- in:readme - Search in README files
- in:topic - Search in topics
- language:X - Filter by programming language
- user:X - Repositories from a specific user
- org:X - Repositories from a specific organization

2. Code search fields:
- extension:X - Filter by file extension
- path:X - Filter by path
- filename:X - Filter by filename

3. Metric filters:
- stars:>X - Has more than X stars
- forks:>X - Has more than X forks
- size:>X - Size greater than X KB
- created:>YYYY-MM-DD - Created after a specific date
- pushed:>YYYY-MM-DD - Updated after a specific date

4. Other filters:
- is:public/private - Public or private repositories
- archived:true/false - Archived or not archived
- license:X - Specific license
- topic:X - Contains specific topic tag

Examples:

1. Query: "Find Python machine learning libraries with at least 1000 stars"
<query>machine learning in:description language:python stars:>1000</query>

2. Query: "Recently updated React UI component libraries"
<query>UI components library in:readme in:description language:javascript topic:react pushed:>2023-01-01</query>

3. Query: "Open source projects developed by Facebook"
<query>org:facebook is:public</query>

4. Query: "Depth-first search implementation in JavaScript"
<query>depth first search in:file language:javascript</query>

Please analyze the query and answer using only the XML tag:
<query>Provide the optimized GitHub search query, using appropriate fields and operators</query>"""

        # 3. Generate Chinese search criteria
        chinese_github_prompt = f"""优化以下GitHub搜索查询:

查询: {query}

任务: 将自然语言查询转换为优化的GitHub搜索查询语句。
为了搜索中文内容,请提取原始查询的关键词并使用中文形式,同时保留GitHub特定的搜索语法为英文。

可用的搜索字段和过滤器:
1. 基本字段:
- in:name - 在仓库名称中搜索
- in:description - 在仓库描述中搜索
- in:readme - 在README文件中搜索
- in:topic - 在主题中搜索
- language:X - 按编程语言筛选
- user:X - 特定用户的仓库
- org:X - 特定组织的仓库

2. 代码搜索字段:
- extension:X - 按文件扩展名筛选
- path:X - 按路径筛选
- filename:X - 按文件名筛选

3. 指标过滤器:
- stars:>X - 有超过X颗星
- forks:>X - 有超过X个分支
- size:>X - 大小超过X KB
- created:>YYYY-MM-DD - 在特定日期后创建
- pushed:>YYYY-MM-DD - 在特定日期后更新

4. 其他过滤器:
- is:public/private - 公开或私有仓库
- archived:true/false - 已归档或未归档
- license:X - 特定许可证
- topic:X - 含特定主题标签

示例:

1. 查询: "找有关机器学习的Python库,至少1000颗星"
<query>机器学习 in:description language:python stars:>1000</query>

2. 查询: "最近更新的React UI组件库"
<query>UI 组件库 in:readme in:description language:javascript topic:react pushed:>2023-01-01</query>

3. 查询: "微信小程序开发框架"
<query>微信小程序 开发框架 in:name in:description in:readme</query>

请分析查询并仅使用XML标签回答:
<query>提供优化的GitHub搜索查询,使用适当的字段和运算符,保留中文关键词</query>"""

        try:
            # Assemble the prompt array
            prompts = [
                type_prompt,
                github_prompt,
                chinese_github_prompt,
            ]

            show_messages = [
                "分析查询类型...",
                "优化英文GitHub搜索参数...",
                "优化中文GitHub搜索参数...",
            ]

            sys_prompts = [
                "你是一个精通GitHub生态系统的专家,擅长分析与GitHub相关的查询。",
                "You are a GitHub search expert, specialized in converting natural language queries into optimized GitHub search queries in English.",
                "你是一个GitHub搜索专家,擅长处理查询并保留中文关键词进行搜索。",
            ]

            # Call the LLM synchronously (as a generator)
            responses = yield from request_gpt(
                inputs_array=prompts,
                inputs_show_user_array=show_messages,
                llm_kwargs=llm_kwargs,
                chatbot=chatbot,
                history_array=[[] for _ in prompts],
                sys_prompt_array=sys_prompts,
                max_workers=3
            )

            # Extract the content we need from the collected responses
            extracted_responses = []
            for i in range(len(prompts)):
                if (i * 2 + 1) < len(responses):
                    response = responses[i * 2 + 1]
                    if response is None:
                        raise Exception(f"Response {i} is None")
                    if not isinstance(response, str):
                        try:
                            response = str(response)
                        except:
                            raise Exception(f"Cannot convert response {i} to string")
                    extracted_responses.append(response)
                else:
                    raise Exception(f"未收到第 {i + 1} 个响应")

            # Parse the basic information
            query_type = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "query_type")
            if not query_type:
                print(
                    f"Debug - Failed to extract query_type. Response was: {extracted_responses[self.BASIC_QUERY_INDEX]}")
                raise Exception("无法提取query_type标签内容")
            query_type = query_type.lower()

            main_topic = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "main_topic")
            if not main_topic:
                print(f"Debug - Failed to extract main_topic. Using query as fallback.")
                main_topic = query

            query_type = self._normalize_query_type(query_type, query)

            # Extract the sub-topics
            sub_topics = []
            sub_topics_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "sub_topics")
            if sub_topics_text:
                sub_topics = [topic.strip() for topic in sub_topics_text.split(",")]

            # Extract the language
            language = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "language")

            # Extract the minimum star count
            min_stars = 0
            min_stars_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "min_stars")
            if min_stars_text and min_stars_text.isdigit():
                min_stars = int(min_stars_text)

            # Parse the GitHub search parameters - English
            english_github_query = self._extract_tag(extracted_responses[self.GITHUB_QUERY_INDEX], "query")

            # Parse the GitHub search parameters - Chinese
            chinese_github_query = self._extract_tag(extracted_responses[2], "query")

            # Build the GitHub parameters
            github_params = {
                "query": english_github_query,
                "chinese_query": chinese_github_query,
                "sort": "stars",  # Sort by stars by default
                "order": "desc",  # Descending by default
                "per_page": 30,  # 30 results per page by default
                "page": 1  # First page by default
            }

            # Check whether this is a query for a specific repository
            repo_id = ""
            if "repo:" in english_github_query or "repository:" in english_github_query:
                repo_match = re.search(r'(repo|repository):([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)', english_github_query)
                if repo_match:
                    repo_id = repo_match.group(2)

            print(f"Debug - 提取的信息:")
            print(f"查询类型: {query_type}")
            print(f"主题: {main_topic}")
            print(f"子主题: {sub_topics}")
            print(f"语言: {language}")
            print(f"最低星标数: {min_stars}")
            print(f"英文GitHub参数: {english_github_query}")
            print(f"中文GitHub参数: {chinese_github_query}")
            print(f"特定仓库: {repo_id}")

            # Return the SearchCriteria, including both the English and Chinese queries
            return SearchCriteria(
                query_type=query_type,
                main_topic=main_topic,
                sub_topics=sub_topics,
                language=language,
                min_stars=min_stars,
                github_params=github_params,
                original_query=query,
                repo_id=repo_id
            )

        except Exception as e:
            raise Exception(f"分析查询失败: {str(e)}")

    def _normalize_query_type(self, query_type: str, query: str) -> str:
        """Normalize the query type."""
        if query_type in ["repo", "code", "user", "topic"]:
            return query_type

        query_lower = query.lower()
        for type_name, keywords in self.valid_types.items():
            for keyword in keywords:
                if keyword in query_lower:
                    return type_name

        query_type_lower = query_type.lower()
        for type_name, keywords in self.valid_types.items():
            for keyword in keywords:
                if keyword in query_type_lower:
                    return type_name

        return "repo"  # Default to the repo type

    def _extract_tag(self, text: str, tag: str) -> str:
        """Extract tagged content."""
        if not text:
            return ""

        # Standard XML format (handles multi-line content and special characters)
        pattern = f"<{tag}>(.*?)</{tag}>"
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            content = match.group(1).strip()
            if content:
                return content

        # Fallback patterns
        patterns = [
            rf"<{tag}>\s*([\s\S]*?)\s*</{tag}>",  # Standard XML format
            rf"<{tag}>([\s\S]*?)(?:</{tag}>|$)",  # Unclosed tag
            rf"\[{tag}\]([\s\S]*?)\[/{tag}\]",  # Square-bracket format (brackets escaped; unescaped they would form character classes)
            rf"{tag}:\s*(.*?)(?=\n\w|$)",  # Colon format
            rf"<{tag}>\s*(.*?)(?=<|$)"  # Partially closed tag
        ]

        # Try every pattern
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                content = match.group(1).strip()
                if content:  # Make sure the extracted content is non-empty
                    return content

        # If every pattern fails, return an empty string
        return ""
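The fallback chain in `_extract_tag` tolerates malformed LLM output; the primary pattern alone covers well-formed responses. A quick standalone check of that pattern (the sample response is made up):

import re

def extract_tag(text: str, tag: str) -> str:
    # Primary pattern from QueryAnalyzer._extract_tag above
    match = re.search(f"<{tag}>(.*?)</{tag}>", text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""

response = "<query_type>repo</query_type>\n<main_topic>web框架</main_topic>"
print(extract_tag(response, "query_type"))  # repo
print(extract_tag(response, "main_topic"))  # web框架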
701
crazy_functions/paper_fns/auto_git/sources/github_source.py
Normal file
@@ -0,0 +1,701 @@
import aiohttp
import asyncio
import base64
import json
import random
from datetime import datetime
from typing import List, Dict, Optional, Union, Any


class GitHubSource:
    """GitHub API client."""

    # Default API key list - multiple GitHub tokens may be configured
    API_KEYS = [
        "github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        # "your_github_token_1",
        # "your_github_token_2",
        # "your_github_token_3"
    ]

    def __init__(self, api_key: Optional[Union[str, List[str]]] = None):
        """Initialize the GitHub API client.

        Args:
            api_key: A GitHub personal access token, or a list of tokens
        """
        if api_key is None:
            self.api_keys = self.API_KEYS
        elif isinstance(api_key, str):
            self.api_keys = [api_key]
        else:
            self.api_keys = api_key

        self._initialize()

    def _initialize(self) -> None:
        """Initialize the client and set default parameters."""
        self.base_url = "https://api.github.com"
        self.headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
            "User-Agent": "GitHub-API-Python-Client"
        }

        # If API keys are available, pick one at random
        if self.api_keys:
            selected_key = random.choice(self.api_keys)
            self.headers["Authorization"] = f"Bearer {selected_key}"
            print(f"已随机选择API密钥进行认证")
        else:
            print("警告: 未提供API密钥,将受到GitHub API请求限制")

    async def _request(self, method: str, endpoint: str, params: Dict = None, data: Dict = None) -> Any:
        """Send an API request.

        Args:
            method: HTTP method (GET, POST, PUT, DELETE, etc.)
            endpoint: API endpoint
            params: URL parameters
            data: Request body data

        Returns:
            The parsed JSON response
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            url = f"{self.base_url}{endpoint}"

            # Print request info for debugging
            print(f"请求: {method} {url}")
            if params:
                print(f"参数: {params}")

            # Send the request
            request_kwargs = {}
            if params:
                request_kwargs["params"] = params
            if data:
                request_kwargs["json"] = data

            async with session.request(method, url, **request_kwargs) as response:
                response_text = await response.text()

                # Check the HTTP status code
                if response.status >= 400:
                    print(f"API请求失败: HTTP {response.status}")
                    print(f"响应内容: {response_text}")
                    return None

                # Parse the JSON response
                try:
                    return json.loads(response_text)
                except json.JSONDecodeError:
                    print(f"JSON解析错误: {response_text}")
                    return None
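    # Usage sketch (illustrative only, not part of this commit): every helper
    # below funnels through _request, so a caller only needs an event loop, e.g.
    #
    #     gh = GitHubSource(api_key="<your token>")   # hypothetical token value
    #     result = asyncio.run(gh.search_repositories(
    #         "web framework language:python stars:>1000"))
    #     items = result["items"] if result else []
    #
    # _request centralizes the auth header, debug logging, and JSON decoding
    # for all of the endpoint methods that follow.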
# ===== 用户相关方法 =====
|
||||
|
||||
async def get_user(self, username: Optional[str] = None) -> Dict:
|
||||
"""获取用户信息
|
||||
|
||||
Args:
|
||||
username: 指定用户名,不指定则获取当前授权用户
|
||||
|
||||
Returns:
|
||||
用户信息字典
|
||||
"""
|
||||
endpoint = "/user" if username is None else f"/users/{username}"
|
||||
return await self._request("GET", endpoint)
|
||||
|
||||
async def get_user_repos(self, username: Optional[str] = None, sort: str = "updated",
|
||||
direction: str = "desc", per_page: int = 30, page: int = 1) -> List[Dict]:
|
||||
"""获取用户的仓库列表
|
||||
|
||||
Args:
|
||||
username: 指定用户名,不指定则获取当前授权用户
|
||||
sort: 排序方式 (created, updated, pushed, full_name)
|
||||
direction: 排序方向 (asc, desc)
|
||||
per_page: 每页结果数量
|
||||
page: 页码
|
||||
|
||||
Returns:
|
||||
仓库列表
|
||||
"""
|
||||
endpoint = "/user/repos" if username is None else f"/users/{username}/repos"
|
||||
params = {
|
||||
"sort": sort,
|
||||
"direction": direction,
|
||||
"per_page": per_page,
|
||||
"page": page
|
||||
}
|
||||
return await self._request("GET", endpoint, params=params)
|
||||
|
||||
async def get_user_starred(self, username: Optional[str] = None,
|
||||
per_page: int = 30, page: int = 1) -> List[Dict]:
|
||||
"""获取用户星标的仓库
|
||||
|
||||
Args:
|
||||
username: 指定用户名,不指定则获取当前授权用户
|
||||
per_page: 每页结果数量
|
||||
page: 页码
|
||||
|
||||
Returns:
|
||||
星标仓库列表
|
||||
"""
|
||||
endpoint = "/user/starred" if username is None else f"/users/{username}/starred"
|
||||
params = {
|
||||
"per_page": per_page,
|
||||
"page": page
|
||||
}
|
||||
return await self._request("GET", endpoint, params=params)
|
||||
|
||||
# ===== 仓库相关方法 =====
|
||||
|
||||
async def get_repo(self, owner: str, repo: str) -> Dict:
|
||||
"""获取仓库信息
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
|
||||
Returns:
|
||||
仓库信息
|
||||
"""
|
||||
endpoint = f"/repos/{owner}/{repo}"
|
||||
return await self._request("GET", endpoint)
|
||||
|
||||
async def get_repo_branches(self, owner: str, repo: str, per_page: int = 30, page: int = 1) -> List[Dict]:
|
||||
"""获取仓库的分支列表
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
per_page: 每页结果数量
|
||||
page: 页码
|
||||
|
||||
Returns:
|
||||
分支列表
|
||||
"""
|
||||
endpoint = f"/repos/{owner}/{repo}/branches"
|
||||
params = {
|
||||
"per_page": per_page,
|
||||
"page": page
|
||||
}
|
||||
return await self._request("GET", endpoint, params=params)
|
||||
|
||||
async def get_repo_commits(self, owner: str, repo: str, sha: Optional[str] = None,
|
||||
path: Optional[str] = None, per_page: int = 30, page: int = 1) -> List[Dict]:
|
||||
"""获取仓库的提交历史
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
sha: 特定提交SHA或分支名
|
||||
path: 文件路径筛选
|
||||
per_page: 每页结果数量
|
||||
page: 页码
|
||||
|
||||
Returns:
|
||||
提交列表
|
||||
"""
|
||||
endpoint = f"/repos/{owner}/{repo}/commits"
|
||||
params = {
|
||||
"per_page": per_page,
|
||||
"page": page
|
||||
}
|
||||
if sha:
|
||||
params["sha"] = sha
|
||||
if path:
|
||||
params["path"] = path
|
||||
|
||||
return await self._request("GET", endpoint, params=params)
|
||||
|
||||
async def get_commit_details(self, owner: str, repo: str, commit_sha: str) -> Dict:
|
||||
"""获取特定提交的详情
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
commit_sha: 提交SHA
|
||||
|
||||
Returns:
|
||||
提交详情
|
||||
"""
|
||||
endpoint = f"/repos/{owner}/{repo}/commits/{commit_sha}"
|
||||
return await self._request("GET", endpoint)
|
||||
|
||||
# ===== 内容相关方法 =====
|
||||
|
||||
async def get_file_content(self, owner: str, repo: str, path: str, ref: Optional[str] = None) -> Dict:
|
||||
"""获取文件内容
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
path: 文件路径
|
||||
ref: 分支名、标签名或提交SHA
|
||||
|
||||
Returns:
|
||||
文件内容信息
|
||||
"""
|
||||
endpoint = f"/repos/{owner}/{repo}/contents/{path}"
|
||||
params = {}
|
||||
if ref:
|
||||
params["ref"] = ref
|
||||
|
||||
response = await self._request("GET", endpoint, params=params)
|
||||
if response and isinstance(response, dict) and "content" in response:
|
||||
try:
|
||||
# 解码Base64编码的文件内容
|
||||
content = base64.b64decode(response["content"].encode()).decode()
|
||||
response["decoded_content"] = content
|
||||
except Exception as e:
|
||||
print(f"解码文件内容时出错: {str(e)}")
|
||||
|
||||
return response
|
||||
|
||||
async def get_directory_content(self, owner: str, repo: str, path: str, ref: Optional[str] = None) -> List[Dict]:
|
||||
"""获取目录内容
|
||||
|
||||
Args:
|
||||
owner: 仓库所有者
|
||||
repo: 仓库名
|
||||
path: 目录路径
|
||||
ref: 分支名、标签名或提交SHA
|
||||
|
||||
Returns:
|
||||
目录内容列表
|
||||
"""
|
||||
# 注意:此方法与get_file_content使用相同的端点,但对于目录会返回列表
|
||||
endpoint = f"/repos/{owner}/{repo}/contents/{path}"
|
||||
params = {}
|
||||
if ref:
|
||||
params["ref"] = ref
|
||||
|
||||
return await self._request("GET", endpoint, params=params)

    # ===== Issue methods =====

    async def get_issues(self, owner: str, repo: str, state: str = "open",
                         sort: str = "created", direction: str = "desc",
                         per_page: int = 30, page: int = 1) -> List[Dict]:
        """List a repository's issues.

        Args:
            owner: Repository owner
            repo: Repository name
            state: Issue state (open, closed, all)
            sort: Sort field (created, updated, comments)
            direction: Sort direction (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            List of issues
        """
        endpoint = f"/repos/{owner}/{repo}/issues"
        params = {
            "state": state,
            "sort": sort,
            "direction": direction,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def get_issue(self, owner: str, repo: str, issue_number: int) -> Dict:
        """Get the details of a specific issue.

        Args:
            owner: Repository owner
            repo: Repository name
            issue_number: Issue number

        Returns:
            Issue details
        """
        endpoint = f"/repos/{owner}/{repo}/issues/{issue_number}"
        return await self._request("GET", endpoint)

    async def get_issue_comments(self, owner: str, repo: str, issue_number: int) -> List[Dict]:
        """Get the comments on an issue.

        Args:
            owner: Repository owner
            repo: Repository name
            issue_number: Issue number

        Returns:
            List of comments
        """
        endpoint = f"/repos/{owner}/{repo}/issues/{issue_number}/comments"
        return await self._request("GET", endpoint)

    # ===== Pull request methods =====

    async def get_pull_requests(self, owner: str, repo: str, state: str = "open",
                                sort: str = "created", direction: str = "desc",
                                per_page: int = 30, page: int = 1) -> List[Dict]:
        """List a repository's pull requests.

        Args:
            owner: Repository owner
            repo: Repository name
            state: PR state (open, closed, all)
            sort: Sort field (created, updated, popularity, long-running)
            direction: Sort direction (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            List of pull requests
        """
        endpoint = f"/repos/{owner}/{repo}/pulls"
        params = {
            "state": state,
            "sort": sort,
            "direction": direction,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def get_pull_request(self, owner: str, repo: str, pr_number: int) -> Dict:
        """Get the details of a specific pull request.

        Args:
            owner: Repository owner
            repo: Repository name
            pr_number: Pull request number

        Returns:
            Pull request details
        """
        endpoint = f"/repos/{owner}/{repo}/pulls/{pr_number}"
        return await self._request("GET", endpoint)

    async def get_pull_request_files(self, owner: str, repo: str, pr_number: int) -> List[Dict]:
        """Get the files changed in a pull request.

        Args:
            owner: Repository owner
            repo: Repository name
            pr_number: Pull request number

        Returns:
            List of changed files
        """
        endpoint = f"/repos/{owner}/{repo}/pulls/{pr_number}/files"
        return await self._request("GET", endpoint)

    # ===== Search methods =====

    async def search_repositories(self, query: str, sort: str = "stars",
                                  order: str = "desc", per_page: int = 30, page: int = 1) -> Dict:
        """Search repositories.

        Args:
            query: Search keywords
            sort: Sort field (stars, forks, updated)
            order: Sort order (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            Search results
        """
        endpoint = "/search/repositories"
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def search_code(self, query: str, sort: str = "indexed",
                          order: str = "desc", per_page: int = 30, page: int = 1) -> Dict:
        """Search code.

        Args:
            query: Search keywords
            sort: Sort field (indexed)
            order: Sort order (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            Search results
        """
        endpoint = "/search/code"
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def search_issues(self, query: str, sort: str = "created",
                            order: str = "desc", per_page: int = 30, page: int = 1) -> Dict:
        """Search issues and pull requests.

        Args:
            query: Search keywords
            sort: Sort field (created, updated, comments)
            order: Sort order (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            Search results
        """
        endpoint = "/search/issues"
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def search_users(self, query: str, sort: str = "followers",
                           order: str = "desc", per_page: int = 30, page: int = 1) -> Dict:
        """Search users.

        Args:
            query: Search keywords
            sort: Sort field (followers, repositories, joined)
            order: Sort order (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            Search results
        """
        endpoint = "/search/users"
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    # ===== Organization methods =====

    async def get_organization(self, org: str) -> Dict:
        """Get an organization's profile.

        Args:
            org: Organization name

        Returns:
            Organization info
        """
        endpoint = f"/orgs/{org}"
        return await self._request("GET", endpoint)

    async def get_organization_repos(self, org: str, type: str = "all",
                                     sort: str = "created", direction: str = "desc",
                                     per_page: int = 30, page: int = 1) -> List[Dict]:
        """List an organization's repositories.

        Args:
            org: Organization name
            type: Repository type (all, public, private, forks, sources, member, internal)
            sort: Sort field (created, updated, pushed, full_name)
            direction: Sort direction (asc, desc)
            per_page: Number of results per page
            page: Page number

        Returns:
            List of repositories
        """
        endpoint = f"/orgs/{org}/repos"
        params = {
            "type": type,
            "sort": sort,
            "direction": direction,
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    async def get_organization_members(self, org: str, per_page: int = 30, page: int = 1) -> List[Dict]:
        """List an organization's members.

        Args:
            org: Organization name
            per_page: Number of results per page
            page: Page number

        Returns:
            List of members
        """
        endpoint = f"/orgs/{org}/members"
        params = {
            "per_page": per_page,
            "page": page
        }
        return await self._request("GET", endpoint, params=params)

    # ===== Higher-level operations =====

    async def get_repository_languages(self, owner: str, repo: str) -> Dict:
        """Get the programming languages used in a repository and their byte counts.

        Args:
            owner: Repository owner
            repo: Repository name

        Returns:
            Language usage
        """
        endpoint = f"/repos/{owner}/{repo}/languages"
        return await self._request("GET", endpoint)

    async def get_repository_stats_contributors(self, owner: str, repo: str) -> List[Dict]:
        """Get a repository's contributor statistics.

        Args:
            owner: Repository owner
            repo: Repository name

        Returns:
            Contributor statistics
        """
        endpoint = f"/repos/{owner}/{repo}/stats/contributors"
        return await self._request("GET", endpoint)

    async def get_repository_stats_commit_activity(self, owner: str, repo: str) -> List[Dict]:
        """Get a repository's commit activity.

        Args:
            owner: Repository owner
            repo: Repository name

        Returns:
            Commit activity statistics
        """
        endpoint = f"/repos/{owner}/{repo}/stats/commit_activity"
        return await self._request("GET", endpoint)


async def example_usage():
    """GitHubSource usage examples."""
    # Create a client instance (optionally pass an API token)
    # github = GitHubSource(api_key="your_github_token")
    github = GitHubSource()

    try:
        # Example 1: search for popular Python repositories
        print("\n=== 示例1:搜索热门Python仓库 ===")
        repos = await github.search_repositories(
            query="language:python stars:>1000",
            sort="stars",
            order="desc",
            per_page=5
        )

        if repos and "items" in repos:
            for i, repo in enumerate(repos["items"], 1):
                print(f"\n--- 仓库 {i} ---")
                print(f"名称: {repo['full_name']}")
                print(f"描述: {repo['description']}")
                print(f"星标数: {repo['stargazers_count']}")
                print(f"Fork数: {repo['forks_count']}")
                print(f"最近更新: {repo['updated_at']}")
                print(f"URL: {repo['html_url']}")

        # Example 2: fetch details of a specific repository
        print("\n=== 示例2:获取特定仓库的详情 ===")
        repo_details = await github.get_repo("microsoft", "vscode")
        if repo_details:
            print(f"名称: {repo_details['full_name']}")
            print(f"描述: {repo_details['description']}")
            print(f"星标数: {repo_details['stargazers_count']}")
            print(f"Fork数: {repo_details['forks_count']}")
            print(f"默认分支: {repo_details['default_branch']}")
            print(f"开源许可: {repo_details.get('license', {}).get('name', '无')}")
            print(f"语言: {repo_details['language']}")
            print(f"Open Issues数: {repo_details['open_issues_count']}")

        # Example 3: fetch a repository's recent commits
        print("\n=== 示例3:获取仓库的最近提交 ===")
        commits = await github.get_repo_commits("tensorflow", "tensorflow", per_page=5)
        if commits:
            for i, commit in enumerate(commits, 1):
                print(f"\n--- 提交 {i} ---")
                print(f"SHA: {commit['sha'][:7]}")
                print(f"作者: {commit['commit']['author']['name']}")
                print(f"日期: {commit['commit']['author']['date']}")
                print(f"消息: {commit['commit']['message'].splitlines()[0]}")

        # Example 4: search code
        print("\n=== 示例4:搜索代码 ===")
        code_results = await github.search_code(
            query="filename:README.md language:markdown pytorch in:file",
            per_page=3
        )
        if code_results and "items" in code_results:
            print(f"共找到: {code_results['total_count']} 个结果")
            for i, item in enumerate(code_results["items"], 1):
                print(f"\n--- 代码 {i} ---")
                print(f"仓库: {item['repository']['full_name']}")
                print(f"文件: {item['path']}")
                print(f"URL: {item['html_url']}")

        # Example 5: fetch file content
        print("\n=== 示例5:获取文件内容 ===")
        file_content = await github.get_file_content("python", "cpython", "README.rst")
        if file_content and "decoded_content" in file_content:
            content = file_content["decoded_content"]
            print(f"文件名: {file_content['name']}")
            print(f"大小: {file_content['size']} 字节")
            print(f"内容预览: {content[:200]}...")

        # Example 6: fetch the languages used by a repository
        print("\n=== 示例6:获取仓库使用的编程语言 ===")
        languages = await github.get_repository_languages("facebook", "react")
        if languages:
            print("React仓库使用的编程语言:")
            for lang, bytes_of_code in languages.items():
                print(f"- {lang}: {bytes_of_code} 字节")

        # Example 7: fetch organization info
        print("\n=== 示例7:获取组织信息 ===")
        org_info = await github.get_organization("google")
        if org_info:
            print(f"名称: {org_info['name']}")
            print(f"描述: {org_info.get('description', '无')}")
            print(f"位置: {org_info.get('location', '未指定')}")
            print(f"公共仓库数: {org_info['public_repos']}")
            print(f"成员数: {org_info.get('public_members', 0)}")
            print(f"URL: {org_info['html_url']}")

        # Example 8: fetch user info
        print("\n=== 示例8:获取用户信息 ===")
        user_info = await github.get_user("torvalds")
        if user_info:
            print(f"名称: {user_info['name']}")
            print(f"公司: {user_info.get('company', '无')}")
            print(f"博客: {user_info.get('blog', '无')}")
            print(f"位置: {user_info.get('location', '未指定')}")
            print(f"公共仓库数: {user_info['public_repos']}")
            print(f"关注者数: {user_info['followers']}")
            print(f"URL: {user_info['html_url']}")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())


if __name__ == "__main__":
    import asyncio

    # Run the examples
    asyncio.run(example_usage())
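
# A minimal pagination sketch (not part of the original file), assuming only
# the get_issues method defined above and the GitHub API convention that a
# short page marks the end of the result set:
async def fetch_all_open_issues(github, owner: str, repo: str, per_page: int = 100):
    """Collect every open issue by advancing the `page` parameter."""
    issues, page = [], 1
    while True:
        batch = await github.get_issues(owner, repo, state="open",
                                        per_page=per_page, page=page)
        if not batch:
            break
        issues.extend(batch)
        if len(batch) < per_page:  # short page: this was the last one
            break
        page += 1
    return issues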
593
crazy_functions/paper_fns/document_structure_extractor.py
Normal file
@@ -0,0 +1,593 @@
from typing import List, Dict, Optional, Tuple, Union, Any
from dataclasses import dataclass, field
import os
import re
import logging

from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import (
    PaperStructureExtractor, PaperSection, StructuredPaper
)
from unstructured.partition.auto import partition
from unstructured.documents.elements import (
    Text, Title, NarrativeText, ListItem, Table,
    Footer, Header, PageBreak, Image, Address
)

@dataclass
class DocumentSection:
    """Generic document section."""
    title: str  # Section title; empty string if the section has no heading
    content: str  # Section body text
    level: int = 0  # Heading level: 0 is the document title, 1 a first-level heading, and so on
    section_type: str = "content"  # Section type
    is_heading_only: bool = False  # Whether the section contains only a heading
    subsections: List['DocumentSection'] = field(default_factory=list)  # Child sections


@dataclass
class StructuredDocument:
    """Structured document."""
    title: str = ""  # Document title
    metadata: Dict[str, Any] = field(default_factory=dict)  # Metadata
    sections: List[DocumentSection] = field(default_factory=list)  # Sections
    full_text: str = ""  # Full text
    is_paper: bool = False  # Whether the document is an academic paper


class GenericDocumentStructureExtractor:
    """Generic document structure extractor.

    Extracts structural information (headings and content) from a wide range
    of document formats. Supports papers, reports, articles, and general
    text documents.
    """

    # Supported file extensions
    SUPPORTED_EXTENSIONS = [
        '.pdf', '.docx', '.doc', '.pptx', '.ppt',
        '.txt', '.md', '.html', '.htm', '.xml',
        '.rtf', '.odt', '.epub', '.msg', '.eml'
    ]

    # Common heading-prefix patterns
    HEADING_PATTERNS = [
        # Numbered headings (1., 1.1., etc.)
        r'^\s*(\d+\.)+\s+',
        # Chinese-numeral headings (一、, 二、, etc.)
        r'^\s*[一二三四五六七八九十]+[、::]\s+',
        # Parenthesized numbers ((1), (2), etc.)
        r'^\s*\(\s*\d+\s*\)\s+',
        # Keyword-prefixed headings (Chapter 1, Section 1, etc.)
        r'^\s*(chapter|section|part|附录|章|节)\s+\d+[\.::]\s+',
    ]

    # Common section marker words
    SECTION_MARKERS = {
        'introduction': ['简介', '导言', '引言', 'introduction', '概述', 'overview'],
        'background': ['背景', '现状', 'background', '理论基础', '相关工作'],
        'main_content': ['主要内容', '正文', 'main content', '分析', '讨论'],
        'conclusion': ['结论', '总结', 'conclusion', '结语', '小结', 'summary'],
        'reference': ['参考', '参考文献', 'references', '文献', 'bibliography'],
        'appendix': ['附录', 'appendix', '补充资料', 'supplementary']
    }

    def __init__(self):
        """Initialize the extractor."""
        self.paper_extractor = PaperStructureExtractor()  # Paper-specific extractor
        self._setup_logging()

    def _setup_logging(self):
        """Configure logging."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def extract_document_structure(self, file_path: str, strategy: str = "fast") -> StructuredDocument:
        """Extract the structure of a document.

        Args:
            file_path: Path to the file
            strategy: Extraction strategy ("fast" or "accurate")

        Returns:
            StructuredDocument: the structured document
        """
        try:
            self.logger.info(f"正在处理文档结构: {file_path}")

            # 1. Try the paper-specific extractor first
            try:
                paper_result = self.paper_extractor.extract_paper_structure(file_path)
                if paper_result and len(paper_result.sections) > 2:  # Recognized as a paper structure
                    self.logger.info(f"成功识别为学术论文: {file_path}")
                    # Convert the paper structure into the generic document structure
                    return self._convert_paper_to_document(paper_result)
            except Exception as e:
                self.logger.debug(f"论文结构提取失败,将尝试通用提取: {str(e)}")

            # 2. Fall back to generic structure extraction
            elements = partition(
                str(file_path),
                strategy=strategy,
                include_metadata=True,
                nlp=False
            )

            # 3. Process the elements with the generic extractor
            doc = self._extract_generic_structure(elements)
            return doc

        except Exception as e:
            self.logger.error(f"文档结构提取失败: {str(e)}")
            # Return an empty structured document
            return StructuredDocument(
                title="未能提取文档标题",
                sections=[DocumentSection(
                    title="",
                    content="",
                    level=0,
                    section_type="content"
                )]
            )

    def _convert_paper_to_document(self, paper: StructuredPaper) -> StructuredDocument:
        """Convert a paper structure into the generic document structure.

        Args:
            paper: Structured paper object

        Returns:
            StructuredDocument: the converted generic document structure
        """
        doc = StructuredDocument(
            title=paper.metadata.title,
            is_paper=True,
            full_text=paper.full_text
        )

        # Convert metadata
        doc.metadata = {
            'title': paper.metadata.title,
            'authors': paper.metadata.authors,
            'keywords': paper.keywords,
            'abstract': paper.metadata.abstract if hasattr(paper.metadata, 'abstract') else "",
            'is_paper': True
        }

        # Convert the section tree
        doc.sections = self._convert_paper_sections(paper.sections)

        return doc

    def _convert_paper_sections(self, paper_sections: List[PaperSection], level: int = 0) -> List[DocumentSection]:
        """Recursively convert paper sections into generic document sections.

        Args:
            paper_sections: List of paper sections
            level: Current section level

        Returns:
            List[DocumentSection]: generic document sections
        """
        doc_sections = []

        for section in paper_sections:
            doc_section = DocumentSection(
                title=section.title,
                content=section.content,
                level=section.level,
                section_type=section.section_type,
                is_heading_only=not section.content
            )

            # Recurse into subsections
            if section.subsections:
                doc_section.subsections = self._convert_paper_sections(
                    section.subsections, level + 1
                )

            doc_sections.append(doc_section)

        return doc_sections

    def _extract_generic_structure(self, elements) -> StructuredDocument:
        """Extract a generic document structure from a list of elements.

        Args:
            elements: List of document elements

        Returns:
            StructuredDocument: the structured document
        """
        # Create the structured document object
        doc = StructuredDocument(full_text="")

        # 1. Extract the document title
        title_candidates = []
        for i, element in enumerate(elements[:5]):  # Only inspect the first 5 elements
            if isinstance(element, Title):
                title_text = str(element).strip()
                title_candidates.append((i, title_text))

        if title_candidates:
            # Use the first title as the document title
            doc.title = title_candidates[0][1]

        # 2. Identify all heading elements and their content
        title_elements = []

        # 2.1 First pass: find every heading
        for i, element in enumerate(elements):
            is_heading = False
            title_text = ""
            level = 0

            # Check the element type
            if isinstance(element, Title):
                is_heading = True
                title_text = str(element).strip()

                # Double-check that this really is a heading
                if self._is_likely_heading(title_text, element, i, elements):
                    level = self._estimate_heading_level(title_text, element)
                else:
                    is_heading = False

            # Also check plain text that is formatted like a heading
            elif isinstance(element, (Text, NarrativeText)) and i > 0:
                text = str(element).strip()
                # Does it match a heading pattern?
                if any(re.match(pattern, text) for pattern in self.HEADING_PATTERNS):
                    # Confirm via length and the amount of following content
                    if len(text) < 100 and self._has_sufficient_following_content(i, elements):
                        is_heading = True
                        title_text = text
                        level = self._estimate_heading_level(title_text, element)

            if is_heading:
                section_type = self._identify_section_type(title_text)
                title_elements.append((i, title_text, level, section_type))

        # 2.2 Second pass: extract the content under each heading
        sections = []

        for i, (index, title_text, level, section_type) in enumerate(title_elements):
            # Determine the content range
            content_start = index + 1
            content_end = len(elements)  # By default, content runs to the end of the document

            # If another heading follows, content ends where that heading starts
            if i < len(title_elements) - 1:
                content_end = title_elements[i + 1][0]

            # Extract the content
            content = self._extract_content_between(elements, content_start, content_end)

            # Create the section
            section = DocumentSection(
                title=title_text,
                content=content,
                level=level,
                section_type=section_type,
                is_heading_only=not content.strip()
            )

            sections.append(section)

        # 3. If no sections were recognized, create a default one
        if not sections:
            all_content = self._extract_content_between(elements, 0, len(elements))

            # Try to pull a title out of the content
            first_line = all_content.split('\n')[0] if all_content else ""
            if first_line and len(first_line) < 100:
                doc.title = first_line
                all_content = '\n'.join(all_content.split('\n')[1:])

            default_section = DocumentSection(
                title="",
                content=all_content,
                level=0,
                section_type="content"
            )
            sections.append(default_section)

        # 4. Build the section hierarchy
        doc.sections = self._build_section_hierarchy(sections)

        # 5. Extract the full text
        doc.full_text = "\n\n".join([str(element) for element in elements if isinstance(element, (Text, NarrativeText, Title, ListItem))])

        return doc

    def _build_section_hierarchy(self, sections: List[DocumentSection]) -> List[DocumentSection]:
        """Build the section hierarchy.

        Args:
            sections: Flat list of sections

        Returns:
            List[DocumentSection]: sections arranged as a tree
        """
        if not sections:
            return []

        # Walk the sections in order, tracking the current parent at each level
        top_level_sections = []
        current_parents = {0: None}  # Current parent node for each level

        for section in sections:
            # Find this section's parent: the nearest shallower level seen so far
            parent_level = None
            for level in sorted([k for k in current_parents.keys() if k < section.level], reverse=True):
                parent_level = level
                break

            if parent_level is None:
                # Top-level section
                top_level_sections.append(section)
            else:
                # Child section
                parent = current_parents[parent_level]
                if parent:
                    parent.subsections.append(section)
                else:
                    top_level_sections.append(section)

            # Record this section as the current parent for its level
            current_parents[section.level] = section

            # Clear the cached parents at all deeper levels
            deeper_levels = [k for k in current_parents.keys() if k > section.level]
            for level in deeper_levels:
                current_parents.pop(level, None)

        return top_level_sections
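
    # Worked example (added for illustration, not in the original file):
    # feeding sections with levels [1, 2, 2, 1] through the loop above yields
    # two top-level sections; both level-2 sections land in the first one's
    # `subsections`, and the trailing level-1 section clears the cached
    # level-2 parent before starting a new top-level node.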

    def _is_likely_heading(self, text: str, element, index: int, elements) -> bool:
        """Decide whether a piece of text is likely a heading.

        Args:
            text: Text content
            element: Element object
            index: Element index
            elements: List of all elements

        Returns:
            bool: whether the text is likely a heading
        """
        # 1. Check the text length - headings are usually short
        if len(text) > 150:  # Headings rarely exceed 150 characters
            return False

        # 2. Check against the numbered-heading patterns
        if any(re.match(pattern, text) for pattern in self.HEADING_PATTERNS):
            return True

        # 3. Check for common section marker words
        lower_text = text.lower()
        for markers in self.SECTION_MARKERS.values():
            if any(marker.lower() in lower_text for marker in markers):
                return True

        # 4. Check the amount of following content - a heading is usually followed by enough text
        if not self._has_sufficient_following_content(index, elements, min_chars=100):
            # A very short text ending with a colon may still be a heading
            if len(text) < 50 and (text.endswith(':') or text.endswith(':')):
                return True
            return False

        # 5. Check formatting features
        # A heading starts an element; it does not sit in the middle of a paragraph
        if len(text.split('\n')) > 1:
            # Multi-line text is unlikely to be a heading
            return False

        # If metadata is available, check font features (size, weight)
        if hasattr(element, 'metadata') and element.metadata:
            try:
                font_size = getattr(element.metadata, 'font_size', None)
                is_bold = getattr(element.metadata, 'is_bold', False)

                # Larger or bold text is more likely a heading
                if font_size and font_size > 12:
                    return True
                if is_bold:
                    return True
            except (AttributeError, TypeError):
                pass

        # Default to True, since the element was already recognized as a Title
        return True

    def _estimate_heading_level(self, text: str, element) -> int:
        """Estimate a heading's level.

        Args:
            text: Heading text
            element: Element object

        Returns:
            int: heading level (0 is the document title, 1 a first-level heading, and so on)
        """
        # 1. Infer the level from the numbering pattern
        for pattern, level in [
            (r'^\s*\d+\.\s+', 1),                 # "1." prefix (level-1 heading)
            (r'^\s*\d+\.\d+\.\s+', 2),            # "1.1." prefix (level-2 heading)
            (r'^\s*\d+\.\d+\.\d+\.\s+', 3),       # "1.1.1." prefix (level-3 heading)
            (r'^\s*\d+\.\d+\.\d+\.\d+\.\s+', 4),  # "1.1.1.1." prefix (level-4 heading)
        ]:
            if re.match(pattern, text):
                return level

        # 2. Check for common top-level section titles
        lower_text = text.lower()
        main_sections = [
            'abstract', 'introduction', 'background', 'methodology',
            'results', 'discussion', 'conclusion', 'references'
        ]
        for section in main_sections:
            if section in lower_text:
                return 1  # Major sections are level-1 headings

        # 3. Infer from text features
        if text.isupper():  # All-caps text is likely a chapter heading
            return 1

        # 4. Infer the level from metadata
        if hasattr(element, 'metadata') and element.metadata:
            try:
                # Use the font size to infer the level
                font_size = getattr(element.metadata, 'font_size', None)
                if font_size is not None:
                    if font_size > 18:  # Assume the document title uses the largest font
                        return 0
                    elif font_size > 16:
                        return 1
                    elif font_size > 14:
                        return 2
                    else:
                        return 3
            except (AttributeError, TypeError):
                pass

        # Default to a level-2 heading
        return 2

    def _identify_section_type(self, title_text: str) -> str:
        """Identify the section type, including reference sections."""
        lower_text = title_text.lower()

        # Check specifically for a references section
        references_patterns = [
            r'references', r'参考文献', r'bibliography', r'引用文献',
            r'literature cited', r'^cited\s+literature', r'^文献$', r'^引用$'
        ]

        for pattern in references_patterns:
            if re.search(pattern, lower_text, re.IGNORECASE):
                return "references"

        # Check against the other common section types
        for section_type, markers in self.SECTION_MARKERS.items():
            if any(marker.lower() in lower_text for marker in markers):
                return section_type

        # Numbered sections default to content
        if re.match(r'^\d+\.', lower_text):
            return "content"

        # Fall back to a content section
        return "content"

    def _has_sufficient_following_content(self, index: int, elements, min_chars: int = 150) -> bool:
        """Check whether an element is followed by enough content.

        Args:
            index: Index of the current element
            elements: List of all elements
            min_chars: Minimum number of characters required

        Returns:
            bool: whether enough content follows
        """
        total_chars = 0
        for i in range(index + 1, min(index + 5, len(elements))):
            if isinstance(elements[i], Title):
                # Stop as soon as another heading appears
                break
            if isinstance(elements[i], (Text, NarrativeText, ListItem, Table)):
                total_chars += len(str(elements[i]))
                if total_chars >= min_chars:
                    return True

        return total_chars >= min_chars

    def _extract_content_between(self, elements, start_index: int, end_index: int) -> str:
        """Extract the text content within an index range.

        Args:
            elements: List of elements
            start_index: Start index
            end_index: End index

        Returns:
            str: the extracted content
        """
        content_parts = []

        for i in range(start_index, end_index):
            if isinstance(elements[i], (Text, NarrativeText, ListItem, Table)):
                content_parts.append(str(elements[i]).strip())

        return "\n\n".join([part for part in content_parts if part])

    def generate_markdown(self, doc: StructuredDocument) -> str:
        """Render a structured document as Markdown.

        Args:
            doc: Structured document object

        Returns:
            str: Markdown text
        """
        md_parts = []

        # Add the title
        if doc.title:
            md_parts.append(f"# {doc.title}\n")

        # Add metadata
        if doc.is_paper:
            # Authors
            if 'authors' in doc.metadata and doc.metadata['authors']:
                authors_str = ", ".join(doc.metadata['authors'])
                md_parts.append(f"**作者:** {authors_str}\n")

            # Keywords
            if 'keywords' in doc.metadata and doc.metadata['keywords']:
                keywords_str = ", ".join(doc.metadata['keywords'])
                md_parts.append(f"**关键词:** {keywords_str}\n")

            # Abstract
            if 'abstract' in doc.metadata and doc.metadata['abstract']:
                md_parts.append(f"## 摘要\n\n{doc.metadata['abstract']}\n")

        # Add the section contents
        md_parts.append(self._format_sections_markdown(doc.sections))

        return "\n".join(md_parts)

    def _format_sections_markdown(self, sections: List[DocumentSection], base_level: int = 0) -> str:
        """Recursively format sections as Markdown.

        Args:
            sections: List of sections
            base_level: Base heading level

        Returns:
            str: Markdown text
        """
        md_parts = []

        for section in sections:
            # Compute the heading level (capped at 6)
            header_level = min(section.level + base_level + 1, 6)

            # Add the heading and the content
            if section.title:
                md_parts.append(f"{'#' * header_level} {section.title}\n")

            if section.content:
                md_parts.append(f"{section.content}\n")

            # Recurse into subsections
            if section.subsections:
                md_parts.append(self._format_sections_markdown(
                    section.subsections, base_level
                ))

        return "\n".join(md_parts)
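

# A minimal usage sketch (not part of the original file; the input path is
# hypothetical, and it assumes the unstructured/paper-extractor dependencies
# are importable): extract a document's structure and render it as Markdown.
if __name__ == "__main__":
    extractor = GenericDocumentStructureExtractor()
    structured = extractor.extract_document_structure("sample_report.pdf", strategy="fast")
    print(f"Title: {structured.title} | paper: {structured.is_paper}")
    print(extractor.generate_markdown(structured))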
4
crazy_functions/paper_fns/file2file_doc/__init__.py
Normal file
@@ -0,0 +1,4 @@
from .txt_doc import TxtFormatter
from .markdown_doc import MarkdownFormatter
from .html_doc import HtmlFormatter
from .word_doc import WordFormatter
300
crazy_functions/paper_fns/file2file_doc/html_doc.py
Normal file
@@ -0,0 +1,300 @@
class HtmlFormatter:
    """HTML document generator - preserves the original document structure."""

    def __init__(self, processing_type="文本处理"):
        self.processing_type = processing_type
        self.css_styles = """
        :root {
            --primary-color: #2563eb;
            --primary-light: #eff6ff;
            --secondary-color: #1e293b;
            --background-color: #f8fafc;
            --text-color: #334155;
            --border-color: #e2e8f0;
            --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
        }

        body {
            font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.8;
            margin: 0;
            padding: 2rem;
            color: var(--text-color);
            background-color: var(--background-color);
        }

        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 2rem;
            border-radius: 16px;
            box-shadow: var(--card-shadow);
        }

        ::selection {
            background: var(--primary-light);
            color: var(--primary-color);
        }

        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(20px); }
            to { opacity: 1; transform: translateY(0); }
        }

        .container {
            animation: fadeIn 0.6s ease-out;
        }

        .document-title {
            color: var(--primary-color);
            font-size: 2em;
            text-align: center;
            margin: 1rem 0 2rem;
            padding-bottom: 1rem;
            border-bottom: 2px solid var(--primary-color);
        }

        .document-body {
            display: flex;
            flex-direction: column;
            gap: 1.5rem;
            margin: 2rem 0;
        }

        .document-header {
            display: flex;
            flex-direction: column;
            align-items: center;
            margin-bottom: 2rem;
        }

        .processing-type {
            color: var(--secondary-color);
            font-size: 1.2em;
            margin: 0.5rem 0;
        }

        .processing-date {
            color: var(--text-color);
            font-size: 0.9em;
            opacity: 0.8;
        }

        .document-content {
            background: white;
            padding: 1.5rem;
            border-radius: 8px;
            border-left: 4px solid var(--primary-color);
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }

        /* Styles that preserve the document structure */
        h1, h2, h3, h4, h5, h6 {
            color: var(--secondary-color);
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }

        h1 { font-size: 1.8em; }
        h2 { font-size: 1.5em; }
        h3 { font-size: 1.3em; }
        h4 { font-size: 1.1em; }

        p {
            margin: 0.8em 0;
        }

        ul, ol {
            margin: 1em 0;
            padding-left: 2em;
        }

        li {
            margin: 0.5em 0;
        }

        blockquote {
            margin: 1em 0;
            padding: 0.5em 1em;
            border-left: 4px solid var(--primary-light);
            background: rgba(0,0,0,0.02);
        }

        code {
            font-family: monospace;
            background: rgba(0,0,0,0.05);
            padding: 0.2em 0.4em;
            border-radius: 3px;
        }

        pre {
            background: rgba(0,0,0,0.05);
            padding: 1em;
            border-radius: 5px;
            overflow-x: auto;
        }

        pre code {
            background: transparent;
            padding: 0;
        }

        @media (prefers-color-scheme: dark) {
            :root {
                --background-color: #0f172a;
                --text-color: #e2e8f0;
                --border-color: #1e293b;
            }

            .container, .document-content {
                background: #1e293b;
            }

            blockquote {
                background: rgba(255,255,255,0.05);
            }

            code, pre {
                background: rgba(255,255,255,0.05);
            }
        }
        """

    def _escape_html(self, text):
        """Escape HTML special characters."""
        import html
        return html.escape(text)

    def _markdown_to_html(self, text):
        """Convert Markdown to HTML while preserving the document structure."""
        try:
            import markdown
            # Use the Python-Markdown library, with extra extensions enabled
            # so that nested lists are handled correctly
            return markdown.markdown(text, extensions=['tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', 'smarty', 'extra'])
        except ImportError:
            # Without the markdown library, fall back to regex-based
            # replacements that can still handle nested lists
            import re

            # Replace headings
            text = re.sub(r'^# (.+)$', r'<h1>\1</h1>', text, flags=re.MULTILINE)
            text = re.sub(r'^## (.+)$', r'<h2>\1</h2>', text, flags=re.MULTILINE)
            text = re.sub(r'^### (.+)$', r'<h3>\1</h3>', text, flags=re.MULTILINE)

            # Pre-process lists - add blank lines between items so they separate cleanly
            # Numbered lists
            text = re.sub(r'(\n\d+\.\s.+)(\n\d+\.\s)', r'\1\n\2', text)
            # Bulleted lists
            text = re.sub(r'(\n•\s.+)(\n•\s)', r'\1\n\2', text)
            text = re.sub(r'(\n\*\s.+)(\n\*\s)', r'\1\n\2', text)
            text = re.sub(r'(\n-\s.+)(\n-\s)', r'\1\n\2', text)

            # Handle nested lists - keep indentation and structure intact
            lines = text.split('\n')
            in_list = False
            list_type = None  # 'ol' or 'ul'
            list_html = []
            normal_lines = []

            i = 0
            while i < len(lines):
                line = lines[i]

                # Match a numbered list item
                numbered_match = re.match(r'^(\d+)\.\s+(.+)$', line)
                # Match a bulleted list item
                bullet_match = re.match(r'^[•\*-]\s+(.+)$', line)

                if numbered_match:
                    if not in_list or list_type != 'ol':
                        # Start a new ordered list
                        if in_list:
                            # Close the previous list
                            list_html.append(f'</{list_type}>')
                        list_html.append('<ol>')
                        in_list = True
                        list_type = 'ol'

                    num, content = numbered_match.groups()
                    list_html.append(f'<li>{content}</li>')

                elif bullet_match:
                    if not in_list or list_type != 'ul':
                        # Start a new unordered list
                        if in_list:
                            # Close the previous list
                            list_html.append(f'</{list_type}>')
                        list_html.append('<ul>')
                        in_list = True
                        list_type = 'ul'

                    content = bullet_match.group(1)
                    list_html.append(f'<li>{content}</li>')

                else:
                    if in_list:
                        # Close the current list
                        list_html.append(f'</{list_type}>')
                        in_list = False
                        # Flush the finished list into the normal lines
                        normal_lines.append(''.join(list_html))
                        list_html = []

                    normal_lines.append(line)

                i += 1

            # If a list is still open at the end, make sure it gets closed
            if in_list:
                list_html.append(f'</{list_type}>')
                normal_lines.append(''.join(list_html))

            # Rebuild the text
            text = '\n'.join(normal_lines)

            # Wrap paragraphs, skipping fragments that are already HTML tags
            paragraphs = text.split('\n\n')
            for i, p in enumerate(paragraphs):
                # If it does not start as an HTML fragment and is not empty
                if not (p.strip().startswith('<') and p.strip().endswith('>')) and p.strip() != '':
                    paragraphs[i] = f'<p>{p}</p>'

            return '\n'.join(paragraphs)

    def create_document(self, content: str) -> str:
        """Build the complete HTML document, preserving the original structure.

        Args:
            content: Processed document content

        Returns:
            str: the complete HTML document as a string
        """
        from datetime import datetime

        # Convert the Markdown content to HTML
        html_content = self._markdown_to_html(content)

        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>文档处理结果</title>
    <style>{self.css_styles}</style>
</head>
<body>
    <div class="container">
        <h1 class="document-title">文档处理结果</h1>

        <div class="document-header">
            <div class="processing-type">处理方式: {self._escape_html(self.processing_type)}</div>
            <div class="processing-date">处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
        </div>

        <div class="document-content">
            {html_content}
        </div>
    </div>
</body>
</html>
"""
40
crazy_functions/paper_fns/file2file_doc/markdown_doc.py
Normal file
@@ -0,0 +1,40 @@
class MarkdownFormatter:
    """Markdown document generator - preserves the original document structure."""

    def __init__(self):
        self.content = []

    def _add_content(self, text: str):
        """Append body text."""
        if text:
            self.content.append(f"\n{text}\n")

    def create_document(self, content: str, processing_type: str = "文本处理") -> str:
        """Build the complete Markdown document, preserving the original structure.

        Args:
            content: Processed document content
            processing_type: Processing type (polishing, translation, etc.)

        Returns:
            str: the generated Markdown text
        """
        self.content = []

        # Add the title and a short header
        self.content.append("# 文档处理结果\n")
        self.content.append(f"## 处理方式: {processing_type}\n")

        # Add the processing timestamp
        from datetime import datetime
        self.content.append(f"*处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n")

        # Add a divider
        self.content.append("---\n")

        # Add the original content, keeping its structure
        self.content.append(content)

        # Add a closing divider
        self.content.append("\n---\n")

        return "\n".join(self.content)
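

# A minimal usage sketch (not part of the original file): wrap processed text
# in the header and dividers built above.
if __name__ == "__main__":
    md = MarkdownFormatter().create_document("处理后的正文内容", processing_type="润色")
    print(md)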
69
crazy_functions/paper_fns/file2file_doc/txt_doc.py
Normal file
@@ -0,0 +1,69 @@
import re

def convert_markdown_to_txt(markdown_text):
    """Convert markdown text to plain text while preserving formatting"""
    # Standardize line endings
    markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Handle headers but keep their formatting instead of removing them
    markdown_text = re.sub(r'^#\s+(.+)$', r'# \1', markdown_text, flags=re.MULTILINE)
    markdown_text = re.sub(r'^##\s+(.+)$', r'## \1', markdown_text, flags=re.MULTILINE)
    markdown_text = re.sub(r'^###\s+(.+)$', r'### \1', markdown_text, flags=re.MULTILINE)

    # 2. Handle bold and italic - simply remove the markers
    markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text)
    markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text)

    # 3. Handle lists but preserve formatting
    markdown_text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', markdown_text, flags=re.MULTILINE)

    # 4. Handle links - keep the text and append the URL in parentheses
    markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 (\2)', markdown_text)

    # 5. Handle HTML links - convert to a user-friendly format
    markdown_text = re.sub(r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>', r'\2 (\1)', markdown_text)

    # 6. Preserve paragraph breaks
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)  # normalize runs of newlines to double newlines

    # 7. Clean up extra spaces but maintain indentation
    markdown_text = re.sub(r' +', ' ', markdown_text)

    return markdown_text.strip()


class TxtFormatter:
    """Plain-text formatter - preserves the original document structure."""

    def __init__(self):
        self.content = []
        self._setup_document()

    def _setup_document(self):
        """Initialize the document banner."""
        self.content.append("=" * 50)
        self.content.append("处理后文档".center(48))
        self.content.append("=" * 50)

    def _format_header(self):
        """Create the document header."""
        from datetime import datetime
        date_str = datetime.now().strftime('%Y年%m月%d日')
        return [
            date_str.center(48),
            "\n"  # blank line
        ]

    def create_document(self, content):
        """Generate a document that keeps the original structure."""
        # Add the header
        self.content.extend(self._format_header())

        # Process the content, keeping its structure
        processed_content = convert_markdown_to_txt(content)

        # Append the processed content
        self.content.append(processed_content)

        # Join everything together
        return "\n".join(self.content)
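

# A minimal usage sketch (not part of the original file) showing the
# conversion above: headings survive, bold markers are stripped, and dash
# bullets become "•".
if __name__ == "__main__":
    sample = "# Title\n\n**bold** text\n- item"
    print(convert_markdown_to_txt(sample))
    # -> "# Title\n\nbold text\n• item"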
125
crazy_functions/paper_fns/file2file_doc/word2pdf.py
Normal file
@@ -0,0 +1,125 @@
from docx2pdf import convert
import os
import platform
from typing import Union
from pathlib import Path
from datetime import datetime

class WordToPdfConverter:
    """Word-to-PDF converter."""

    @staticmethod
    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
        """Convert a Word document to PDF.

        Args:
            word_path: Path to the Word document
            pdf_path: Optional output path for the PDF. Defaults to the same
                name and location as the Word document.

        Returns:
            Path of the generated PDF file

        Raises:
            Exception: if the conversion fails
        """
        try:
            # Make sure the input path is a Path object
            word_path = Path(word_path)

            # If pdf_path was not given, reuse the Word document's name
            if pdf_path is None:
                pdf_path = word_path.with_suffix('.pdf')
            else:
                pdf_path = Path(pdf_path)

            # Check the operating system
            if platform.system() == 'Linux':
                # Linux requires LibreOffice
                if not os.system('which libreoffice') == 0:
                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")

                # Convert with LibreOffice
                os.system(f'libreoffice --headless --convert-to pdf "{word_path}" --outdir "{pdf_path.parent}"')

                # LibreOffice writes the PDF into the outdir using the Word
                # file's stem; rename it if a different output name was requested
                default_pdf = pdf_path.parent / word_path.with_suffix('.pdf').name
                if default_pdf != pdf_path:
                    os.rename(default_pdf, pdf_path)
            else:
                # Windows and macOS use docx2pdf
                convert(word_path, pdf_path)

            return str(pdf_path)

        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}")

    @staticmethod
    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
        """Batch-convert every Word document in a directory.

        Args:
            word_dir: Directory containing the Word documents
            pdf_dir: Optional output directory for the PDFs. Defaults to the
                directory of the Word documents.

        Returns:
            List of generated PDF file paths
        """
        word_dir = Path(word_dir)
        if pdf_dir:
            pdf_dir = Path(pdf_dir)
            pdf_dir.mkdir(parents=True, exist_ok=True)

        converted_files = []

        for word_file in word_dir.glob("*.docx"):
            try:
                if pdf_dir:
                    pdf_path = pdf_dir / word_file.with_suffix('.pdf').name
                else:
                    pdf_path = word_file.with_suffix('.pdf')

                pdf_file = WordToPdfConverter.convert_to_pdf(word_file, pdf_path)
                converted_files.append(pdf_file)

            except Exception as e:
                print(f"转换 {word_file} 失败: {str(e)}")

        return converted_files

    @staticmethod
    def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
        """Convert a python-docx Document object directly to PDF.

        Args:
            doc: python-docx Document object
            output_dir: Optional output directory. Defaults to the current directory.

        Returns:
            Path of the generated PDF file
        """
        temp_docx = None
        try:
            # Set up the temporary file path and the output path
            output_dir = Path(output_dir) if output_dir else Path.cwd()
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save a temporary Word file
            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
            doc.save(temp_docx)

            # Convert it to PDF
            pdf_path = temp_docx.with_suffix('.pdf')
            WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path)

            # Remove the temporary Word file
            temp_docx.unlink()

            return str(pdf_path)

        except Exception as e:
            # Guard against failures that occur before temp_docx is created
            if temp_docx is not None and temp_docx.exists():
                temp_docx.unlink()
            raise Exception(f"转换PDF失败: {str(e)}")
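

# A minimal usage sketch (not part of the original file; the directory names
# are hypothetical): convert every .docx under ./reports into ./pdf_out.
if __name__ == "__main__":
    pdfs = WordToPdfConverter.batch_convert("./reports", "./pdf_out")
    print(f"converted {len(pdfs)} file(s)")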
236
crazy_functions/paper_fns/file2file_doc/word_doc.py
Normal file
@@ -0,0 +1,236 @@
import re
from docx import Document
from docx.shared import Cm, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from datetime import datetime

def convert_markdown_to_word(markdown_text):
    # 0. First standardize all line endings to \n
    markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Handle headings - support more heading levels with a tighter regex;
    # keep the heading markers so the level can still be recognized later
    markdown_text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', markdown_text, flags=re.MULTILINE)

    # 2. Handle bold, italic, and bold-italic
    markdown_text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', markdown_text)  # bold italic
    markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text)      # bold
    markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text)          # italic
    markdown_text = re.sub(r'__(.+?)__', r'\1', markdown_text)          # underscore bold (must run before the single-underscore rule)
    markdown_text = re.sub(r'_(.+?)_', r'\1', markdown_text)            # underscore italic

    # 3. Handle code blocks - keep them, but simplify the markup
    # Fenced code blocks
    markdown_text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```', r'[代码块]\n\1[/代码块]', markdown_text)
    # Inline code
    markdown_text = re.sub(r'`([^`]+)`', r'[代码]\1[/代码]', markdown_text)

    # 4. Handle lists - preserve the list structure
    # Unordered list items
    markdown_text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', markdown_text, flags=re.MULTILINE)

    # 5. Handle Markdown links
    markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', markdown_text)

    # 6. Handle HTML links
    markdown_text = re.sub(r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>', r'\2 (\1)', markdown_text)

    # 7. Handle images
    markdown_text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', markdown_text)

    return markdown_text


class WordFormatter:
    """Word document formatter - preserves the original document structure."""

    def __init__(self):
        self.doc = Document()
        self._setup_document()
        self._create_styles()

    def _setup_document(self):
        """Set up the base document format: page geometry and the header."""
        sections = self.doc.sections
        for section in sections:
            # A4 page size
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)
            # Page margins
            section.top_margin = Cm(3.7)     # top margin 37 mm
            section.bottom_margin = Cm(3.5)  # bottom margin 35 mm
            section.left_margin = Cm(2.8)    # left margin 28 mm
            section.right_margin = Cm(2.6)   # right margin 26 mm
            # Header and footer distances
            section.header_distance = Cm(2.0)
            section.footer_distance = Cm(2.0)

            # Add the page header
            header = section.header
            header_para = header.paragraphs[0]
            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            header_run = header_para.add_run("文档处理结果")
            header_run.font.name = '仿宋'
            header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
            header_run.font.size = Pt(9)

    def _create_styles(self):
        """Create the document styles."""
        # Body style
        style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
        style.font.name = '仿宋'
        style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
        style.font.size = Pt(12)  # 12 pt
        style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        style.paragraph_format.space_after = Pt(0)

        # Document title style
        title_style = self.doc.styles.add_style('Title_Custom', WD_STYLE_TYPE.PARAGRAPH)
        title_style.font.name = '黑体'
        title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
        title_style.font.size = Pt(22)  # 22 pt
        title_style.font.bold = True
        title_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        title_style.paragraph_format.space_before = Pt(0)
        title_style.paragraph_format.space_after = Pt(24)
        title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE

        # Heading 1 style
        h1_style = self.doc.styles.add_style('Heading1_Custom', WD_STYLE_TYPE.PARAGRAPH)
        h1_style.font.name = '黑体'
        h1_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
        h1_style.font.size = Pt(18)
        h1_style.font.bold = True
        h1_style.paragraph_format.space_before = Pt(12)
        h1_style.paragraph_format.space_after = Pt(6)

        # Heading 2 style
        h2_style = self.doc.styles.add_style('Heading2_Custom', WD_STYLE_TYPE.PARAGRAPH)
        h2_style.font.name = '黑体'
        h2_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
        h2_style.font.size = Pt(16)
        h2_style.font.bold = True
        h2_style.paragraph_format.space_before = Pt(10)
        h2_style.paragraph_format.space_after = Pt(6)

        # Heading 3 style
        h3_style = self.doc.styles.add_style('Heading3_Custom', WD_STYLE_TYPE.PARAGRAPH)
        h3_style.font.name = '黑体'
        h3_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
        h3_style.font.size = Pt(14)
        h3_style.font.bold = True
        h3_style.paragraph_format.space_before = Pt(8)
        h3_style.paragraph_format.space_after = Pt(4)

        # Code block style
        code_style = self.doc.styles.add_style('Code_Custom', WD_STYLE_TYPE.PARAGRAPH)
        code_style.font.name = 'Courier New'
        code_style.font.size = Pt(11)
        code_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
        code_style.paragraph_format.space_before = Pt(6)
        code_style.paragraph_format.space_after = Pt(6)
        code_style.paragraph_format.left_indent = Pt(36)
        code_style.paragraph_format.right_indent = Pt(36)

        # List style
        list_style = self.doc.styles.add_style('List_Custom', WD_STYLE_TYPE.PARAGRAPH)
        list_style.font.name = '仿宋'
        list_style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
        list_style.font.size = Pt(12)
        list_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
        list_style.paragraph_format.left_indent = Pt(21)
        list_style.paragraph_format.first_line_indent = Pt(-21)

    def create_document(self, content: str, processing_type: str = "文本处理"):
        """Build the document while preserving the original structure."""
        # Add the document title
        title_para = self.doc.add_paragraph(style='Title_Custom')
        title_run = title_para.add_run('文档处理结果')

        # Add the processing type
        processing_para = self.doc.add_paragraph()
        processing_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        processing_run = processing_para.add_run(f"处理方式: {processing_type}")
        processing_run.font.name = '仿宋'
        processing_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
        processing_run.font.size = Pt(14)

        # Add the date
        date_para = self.doc.add_paragraph()
        date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        date_run = date_para.add_run(f"处理时间: {datetime.now().strftime('%Y年%m月%d日')}")
        date_run.font.name = '仿宋'
        date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
        date_run.font.size = Pt(14)

        self.doc.add_paragraph()  # blank line

        # Pre-process the content: convert Markdown markup into a Word-friendly form
        processed_content = convert_markdown_to_word(content)

        # Process the text line by line, keeping the structure
        lines = processed_content.split('\n')
        in_code_block = False
        current_paragraph = None

        for line in lines:
            # Is this line a heading?
            header_match = re.match(r'^(#{1,6})\s+(.+)$', line)

            if header_match:
                # The number of '#' characters gives the heading level
                level = len(header_match.group(1))
                title_text = header_match.group(2)

                if level == 1:
                    style = 'Heading1_Custom'
                elif level == 2:
                    style = 'Heading2_Custom'
                else:
                    style = 'Heading3_Custom'

                self.doc.add_paragraph(title_text, style=style)
                current_paragraph = None

            # Code block start marker
            elif '[代码块]' in line:
                in_code_block = True
                current_paragraph = self.doc.add_paragraph(style='Code_Custom')
                code_line = line.replace('[代码块]', '').strip()
                if code_line:
                    current_paragraph.add_run(code_line)

            elif '[/代码块]' in line:
                in_code_block = False
                code_line = line.replace('[/代码块]', '').strip()
                if code_line and current_paragraph:
                    current_paragraph.add_run(code_line)
                current_paragraph = None

            # List item
            elif line.strip().startswith('•'):
                p = self.doc.add_paragraph(style='List_Custom')
                p.add_run(line.strip())
                current_paragraph = None

            # Plain text line
            elif line.strip():
                if in_code_block:
                    if current_paragraph:
                        current_paragraph.add_run('\n' + line)
                    else:
                        current_paragraph = self.doc.add_paragraph(line, style='Code_Custom')
                else:
                    if current_paragraph is None or not current_paragraph.text:
                        current_paragraph = self.doc.add_paragraph(line, style='Normal_Custom')
                    else:
                        current_paragraph.add_run('\n' + line)

            # Blank line: start a new paragraph
            elif not in_code_block:
                current_paragraph = None

        return self.doc
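

# A minimal usage sketch (not part of the original file; the output name is
# hypothetical): format processed Markdown text into the styled .docx.
if __name__ == "__main__":
    formatter = WordFormatter()
    doc = formatter.create_document("# 标题\n\n正文第一段。\n\n• 列表项", processing_type="润色")
    doc.save("processed_result.docx")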
278
crazy_functions/paper_fns/github_search.py
Normal file
@@ -0,0 +1,278 @@
from typing import List, Dict, Tuple
import asyncio
from dataclasses import dataclass
from toolbox import CatchException, update_ui, promote_file_to_downloadzone, get_log_folder, get_user
from toolbox import report_exception, write_history_to_file
from crazy_functions.paper_fns.auto_git.query_analyzer import QueryAnalyzer, SearchCriteria
from crazy_functions.paper_fns.auto_git.handlers.repo_handler import RepositoryHandler
from crazy_functions.paper_fns.auto_git.handlers.code_handler import CodeSearchHandler
from crazy_functions.paper_fns.auto_git.handlers.user_handler import UserSearchHandler
from crazy_functions.paper_fns.auto_git.handlers.topic_handler import TopicHandler
from crazy_functions.paper_fns.auto_git.sources.github_source import GitHubSource
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
import re
from datetime import datetime
import os
import json
from pathlib import Path
import time

# Import the document formatters
from crazy_functions.paper_fns.file2file_doc import (
    TxtFormatter,
    MarkdownFormatter,
    HtmlFormatter,
    WordFormatter
)
from crazy_functions.paper_fns.file2file_doc.word2pdf import WordToPdfConverter

@CatchException
def GitHub项目智能检索(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
                 history: List, system_prompt: str, user_request: str):
    """Main entry point for the intelligent GitHub project search."""

    # Initialize the GitHub API source
    github_source = GitHubSource(api_key=plugin_kwargs.get("github_api_key"))

    # Initialize the handlers
    handlers = {
        "repo": RepositoryHandler(github_source, llm_kwargs),
        "code": CodeSearchHandler(github_source, llm_kwargs),
        "user": UserSearchHandler(github_source, llm_kwargs),
        "topic": TopicHandler(github_source, llm_kwargs),
    }

    # Analyze the query intent
    chatbot.append(["分析查询意图", "正在分析您的查询需求..."])
    yield from update_ui(chatbot=chatbot, history=history)

    query_analyzer = QueryAnalyzer()
    search_criteria = yield from query_analyzer.analyze_query(
        txt, chatbot, llm_kwargs
    )

    # Pick a handler based on the query type
    handler = handlers.get(search_criteria.query_type)
    if not handler:
        handler = handlers["repo"]  # Fall back to the repository handler

    # Run the query
    chatbot.append(["开始搜索", f"使用{handler.__class__.__name__}处理您的请求,正在搜索GitHub..."])
    yield from update_ui(chatbot=chatbot, history=history)

    final_prompt = asyncio.run(handler.handle(
        criteria=search_criteria,
        chatbot=chatbot,
        history=history,
        system_prompt=system_prompt,
        llm_kwargs=llm_kwargs,
        plugin_kwargs=plugin_kwargs
    ))

    if final_prompt:
        # Check whether the handler returned an apology message
        if "很抱歉,我们未能找到" in final_prompt:
            chatbot.append([txt, final_prompt])
            yield from update_ui(chatbot=chatbot, history=history)
            return

        # Append the user's original query to the end of final_prompt
        final_prompt += f"""

原始用户查询: "{txt}"

重要提示:
- 你的回答必须直接满足用户的原始查询要求
- 在遵循之前指南的同时,优先回答用户明确提出的问题
- 确保回答格式和内容与用户期望一致
- 对于GitHub仓库需要提供链接地址, 回复中请采用以下格式的HTML链接:
  * 对于GitHub仓库: <a href='Github_URL' target='_blank'>仓库名</a>
- 不要生成参考列表,引用信息将另行处理
"""

        # Generate the answer from the final prompt
        response = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=final_prompt,
            inputs_show_user=txt,
            llm_kwargs=llm_kwargs,
            chatbot=chatbot,
            history=[],
            sys_prompt="你是一个熟悉GitHub生态系统的专业助手,能帮助用户找到合适的项目、代码和开发者。除非用户指定,否则请使用中文回复。"
        )
|
||||
|
||||
# 1. 获取项目列表
|
||||
repos_list = handler.ranked_repos # 直接使用原始仓库数据
|
||||
|
||||
# 在新的对话中添加格式化的仓库参考列表
|
||||
if repos_list:
|
||||
references = ""
|
||||
for idx, repo in enumerate(repos_list, 1):
|
||||
# 构建仓库引用
|
||||
stars_str = f"⭐ {repo.get('stargazers_count', 'N/A')}" if repo.get('stargazers_count') else ""
|
||||
forks_str = f"🍴 {repo.get('forks_count', 'N/A')}" if repo.get('forks_count') else ""
|
||||
stats = f"{stars_str} {forks_str}".strip()
|
||||
stats = f" ({stats})" if stats else ""
|
||||
|
||||
language = f" [{repo.get('language', '')}]" if repo.get('language') else ""
|
||||
|
||||
reference = f"[{idx}] **{repo.get('name', '')}**{language}{stats} \n"
|
||||
reference += f"👤 {repo.get('owner', {}).get('login', 'N/A') if repo.get('owner') is not None else 'N/A'} | "
|
||||
reference += f"📅 {repo.get('updated_at', 'N/A')[:10]} | "
|
||||
reference += f"<a href='{repo.get('html_url', '')}' target='_blank'>GitHub</a> \n"
|
||||
|
||||
if repo.get('description'):
|
||||
reference += f"{repo.get('description')} \n"
|
||||
reference += " \n"
|
||||
|
||||
references += reference
|
||||
|
||||
# 添加新的对话显示参考仓库
|
||||
chatbot.append(["推荐项目如下:", references])
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
|
||||
# 2. 保存结果到文件
|
||||
# 创建保存目录
|
||||
save_dir = get_log_folder(get_user(chatbot), plugin_name='github_search')
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
|
||||
# 生成文件名
|
||||
def get_safe_filename(txt, max_length=10):
|
||||
# 获取文本前max_length个字符作为文件名
|
||||
filename = txt[:max_length].strip()
|
||||
# 移除不安全的文件名字符
|
||||
filename = re.sub(r'[\\/:*?"<>|]', '', filename)
|
||||
# 如果文件名为空,使用时间戳
|
||||
if not filename:
|
||||
filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
return filename
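        # 行为示意(假设输入,仅作说明):截取前10个字符并移除 \ / : * ? " < > | 字符;
        #   get_safe_filename('如何实现RAG检索?')  ->  '如何实现RAG检索'
        #   get_safe_filename('\\/:*?')             ->  回退为当前时间戳,如 '2024-01-01_12-00-00'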
        base_filename = get_safe_filename(txt)

        # 准备保存的内容 - 优化文档结构
        md_content = f"# GitHub搜索结果: {txt}\n\n"
        md_content += f"搜索时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        # 添加模型回复
        md_content += "## 搜索分析与总结\n\n"
        md_content += response + "\n\n"

        # 添加所有搜索到的仓库详细信息
        md_content += "## 推荐项目详情\n\n"

        if not repos_list:
            md_content += "未找到匹配的项目\n\n"
        else:
            md_content += f"共找到 {len(repos_list)} 个相关项目\n\n"

            # 添加项目简表
            md_content += "### 项目一览表\n\n"
            md_content += "| 序号 | 项目名称 | 作者 | 语言 | 星标数 | 更新时间 |\n"
            md_content += "| ---- | -------- | ---- | ---- | ------ | -------- |\n"

            for idx, repo in enumerate(repos_list, 1):
                md_content += f"| {idx} | [{repo.get('name', '')}]({repo.get('html_url', '')}) | {repo.get('owner', {}).get('login', 'N/A') if repo.get('owner') is not None else 'N/A'} | {repo.get('language', 'N/A')} | {repo.get('stargazers_count', 'N/A')} | {repo.get('updated_at', 'N/A')[:10]} |\n"

            md_content += "\n"

            # 添加详细项目信息
            md_content += "### 项目详细信息\n\n"
            for idx, repo in enumerate(repos_list, 1):
                md_content += f"#### {idx}. {repo.get('name', '')}\n\n"
                md_content += f"- **仓库**: [{repo.get('full_name', '')}]({repo.get('html_url', '')})\n"
                md_content += f"- **作者**: [{repo.get('owner', {}).get('login', '') if repo.get('owner') is not None else 'N/A'}]({repo.get('owner', {}).get('html_url', '') if repo.get('owner') is not None else '#'})\n"
                md_content += f"- **描述**: {repo.get('description', 'N/A')}\n"
                md_content += f"- **语言**: {repo.get('language', 'N/A')}\n"
                md_content += f"- **星标**: {repo.get('stargazers_count', 'N/A')}\n"
                md_content += f"- **Fork数**: {repo.get('forks_count', 'N/A')}\n"
                md_content += f"- **最近更新**: {repo.get('updated_at', 'N/A')[:10]}\n"
                md_content += f"- **创建时间**: {repo.get('created_at', 'N/A')[:10]}\n"
                md_content += f"- **开源许可**: {repo.get('license', {}).get('name', 'N/A') if repo.get('license') is not None else 'N/A'}\n"
                if repo.get('topics'):
                    md_content += f"- **主题标签**: {', '.join(repo.get('topics', []))}\n"
                if repo.get('homepage'):
                    md_content += f"- **项目主页**: [{repo.get('homepage')}]({repo.get('homepage')})\n"
                md_content += "\n"

        # 添加查询信息和元数据
        md_content += "## 查询元数据\n\n"
        md_content += f"- **原始查询**: {txt}\n"
        md_content += f"- **查询类型**: {search_criteria.query_type}\n"
        md_content += f"- **关键词**: {', '.join(search_criteria.keywords) if hasattr(search_criteria, 'keywords') and search_criteria.keywords else 'N/A'}\n"
        md_content += f"- **搜索日期**: {datetime.now().strftime('%Y-%m-%d')}\n\n"

        # 保存为多种格式
        saved_files = []
        failed_files = []

        # 1. 保存为TXT
        try:
            txt_formatter = TxtFormatter()
            txt_content = txt_formatter.create_document(md_content)
            txt_file = os.path.join(save_dir, f"github_results_{base_filename}.txt")
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(txt_content)
            promote_file_to_downloadzone(txt_file, chatbot=chatbot)
            saved_files.append("TXT")
        except Exception as e:
            failed_files.append(f"TXT (错误: {str(e)})")

        # 2. 保存为Markdown
        try:
            md_formatter = MarkdownFormatter()
            formatted_md_content = md_formatter.create_document(md_content, "GitHub项目搜索")
            md_file = os.path.join(save_dir, f"github_results_{base_filename}.md")
            with open(md_file, 'w', encoding='utf-8') as f:
                f.write(formatted_md_content)
            promote_file_to_downloadzone(md_file, chatbot=chatbot)
            saved_files.append("Markdown")
        except Exception as e:
            failed_files.append(f"Markdown (错误: {str(e)})")

        # 3. 保存为HTML
        try:
            html_formatter = HtmlFormatter(processing_type="GitHub项目搜索")
            html_content = html_formatter.create_document(md_content)
            html_file = os.path.join(save_dir, f"github_results_{base_filename}.html")
            with open(html_file, 'w', encoding='utf-8') as f:
                f.write(html_content)
            promote_file_to_downloadzone(html_file, chatbot=chatbot)
            saved_files.append("HTML")
        except Exception as e:
            failed_files.append(f"HTML (错误: {str(e)})")

        # 4. 保存为Word
        word_file = None
        try:
            word_formatter = WordFormatter()
            doc = word_formatter.create_document(md_content, "GitHub项目搜索")
            word_file = os.path.join(save_dir, f"github_results_{base_filename}.docx")
            doc.save(word_file)
            promote_file_to_downloadzone(word_file, chatbot=chatbot)
            saved_files.append("Word")
        except Exception as e:
            failed_files.append(f"Word (错误: {str(e)})")
            word_file = None

        # 5. 保存为PDF (仅当Word保存成功时)
        if word_file and os.path.exists(word_file):
            try:
                pdf_file = WordToPdfConverter.convert_to_pdf(word_file)
                promote_file_to_downloadzone(pdf_file, chatbot=chatbot)
                saved_files.append("PDF")
            except Exception as e:
                failed_files.append(f"PDF (错误: {str(e)})")

        # 报告保存结果
        if saved_files:
            success_message = f"成功保存以下格式: {', '.join(saved_files)}"
            if failed_files:
                failure_message = f"以下格式保存失败: {', '.join(failed_files)}"
                chatbot.append(["部分格式保存成功", f"{success_message}。{failure_message}"])
            else:
                chatbot.append(["所有格式保存成功", success_message])
        else:
            chatbot.append(["保存失败", f"所有格式均保存失败: {', '.join(failed_files)}"])
    else:
        report_exception(chatbot, history, a="处理失败", b="请尝试其他查询")

    yield from update_ui(chatbot=chatbot, history=history)
635
crazy_functions/paper_fns/journal_paper_recom.py
Normal file
@@ -0,0 +1,635 @@
import os
import time
import glob
from typing import Dict, List, Generator, Tuple
from dataclasses import dataclass

from crazy_functions.pdf_fns.text_content_loader import TextContentLoader
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import update_ui, promote_file_to_downloadzone, write_history_to_file, CatchException, report_exception
from shared_utils.fastapi_server import validate_path_safety
# 导入论文下载相关函数
from crazy_functions.论文下载 import extract_paper_id, extract_paper_ids, get_arxiv_paper, format_arxiv_id, SciHub
from pathlib import Path
from datetime import datetime, timedelta
import calendar


@dataclass
class RecommendationQuestion:
    """期刊会议推荐分析问题类"""
    id: str  # 问题ID
    question: str  # 问题内容
    importance: int  # 重要性 (1-5,5最高)
    description: str  # 问题描述


class JournalConferenceRecommender:
    """论文期刊会议推荐器"""

    def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str):
        """初始化推荐器"""
        self.llm_kwargs = llm_kwargs
        self.plugin_kwargs = plugin_kwargs
        self.chatbot = chatbot
        self.history = history
        self.system_prompt = system_prompt
        self.paper_content = ""
        self.analysis_results = {}

        # 定义论文分析问题库(针对期刊会议推荐)
        self.questions = [
            RecommendationQuestion(
                id="research_field_and_topic",
                question="请分析这篇论文的研究领域、主题和关键词。具体包括:1)论文属于哪个主要学科领域(如自然科学、工程技术、医学、社会科学、人文学科等);2)具体的研究子领域或方向;3)论文的核心主题和关键概念;4)重要的学术关键词和专业术语;5)研究的跨学科特征(如果有);6)研究的地域性特征(国际性研究还是特定地区研究)。",
                importance=5,
                description="研究领域与主题分析"
            ),
            RecommendationQuestion(
                id="methodology_and_approach",
                question="请分析论文的研究方法和技术路线。包括:1)采用的主要研究方法(定量研究、定性研究、理论分析、实验研究、田野调查、文献综述、案例研究等);2)使用的技术手段、工具或分析方法;3)研究设计的严谨性和创新性;4)数据收集和分析方法的适当性;5)研究方法在该学科中的先进性或传统性;6)方法学上的贡献或局限性。",
                importance=4,
                description="研究方法与技术路线"
            ),
            RecommendationQuestion(
                id="novelty_and_contribution",
                question="请评估论文的创新性和学术贡献。包括:1)研究的新颖性程度(理论创新、方法创新、应用创新等);2)对现有知识体系的贡献或突破;3)解决问题的重要性和学术价值;4)研究成果的理论意义和实践价值;5)在该学科领域的地位和影响潜力;6)与国际前沿研究的关系;7)对后续研究的启发意义。",
                importance=4,
                description="创新性与学术贡献"
            ),
            RecommendationQuestion(
                id="target_audience_and_scope",
                question="请分析论文的目标受众和应用范围。包括:1)主要面向的学术群体(研究者、从业者、政策制定者等);2)研究成果的潜在应用领域和受益群体;3)对学术界和实践界的价值;4)研究的国际化程度和跨文化适用性;5)是否适合国际期刊还是区域性期刊;6)语言发表偏好(英文、中文或其他语言);7)开放获取的必要性和可行性。",
                importance=3,
                description="目标受众与应用范围"
            ),
        ]

        # 按重要性排序
        self.questions.sort(key=lambda q: q.importance, reverse=True)

    def _load_paper(self, paper_path: str) -> Generator:
        """加载论文内容"""
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 使用TextContentLoader读取文件
        loader = TextContentLoader(self.chatbot, self.history)

        yield from loader.execute_single_file(paper_path)

        # 获取加载的内容
        if len(self.history) >= 2 and self.history[-2]:
            self.paper_content = self.history[-2]
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return True
        else:
            self.chatbot.append(["错误", "无法读取论文内容,请检查文件是否有效"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return False

    def _analyze_question(self, question: RecommendationQuestion) -> Generator:
        """分析单个问题"""
        try:
            # 创建分析提示
            prompt = f"请基于以下论文内容回答问题:\n\n{self.paper_content}\n\n问题:{question.question}"

            # 使用单线程版本的请求函数
            response = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=prompt,
                inputs_show_user=question.question,  # 显示问题本身
                llm_kwargs=self.llm_kwargs,
                chatbot=self.chatbot,
                history=[],  # 空历史,确保每个问题独立分析
                sys_prompt="你是一个专业的学术期刊会议推荐专家,需要仔细分析论文内容并提供准确的分析。请保持客观、专业,并基于论文内容提供深入分析。"
            )

            if response:
                self.analysis_results[question.id] = response
                return True
            return False

        except Exception as e:
            self.chatbot.append(["错误", f"分析问题时出错: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return False

    def _generate_journal_recommendations(self) -> Generator:
        """生成期刊推荐"""
        self.chatbot.append(["生成期刊推荐", "正在基于论文分析结果生成期刊推荐..."])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 构建期刊推荐提示
        journal_prompt = """请基于以下论文分析结果,为这篇论文推荐合适的学术期刊。

推荐要求:
1. 根据论文的创新性和工作质量,分别推荐不同级别的期刊:
- 顶级期刊(影响因子>8或该领域顶级期刊):2-3个
- 高质量期刊(影响因子4-8或该领域知名期刊):3-4个
- 中等期刊(影响因子1.5-4或该领域认可期刊):3-4个
- 入门期刊(影响因子<1.5但声誉良好的期刊):2-3个

注意:不同学科的影响因子标准差异很大,请根据论文所属学科的实际情况调整标准。
特别是医学领域,需要考虑:
- 临床医学期刊通常影响因子较高(顶级期刊IF>20,高质量期刊IF>10)
- 基础医学期刊影响因子相对较低但学术价值很高
- 专科医学期刊在各自领域内具有权威性
- 医学期刊的临床实用性和循证医学价值

2. 对每个期刊提供详细信息:
- 期刊全名和缩写
- 最新影响因子(如果知道)
- 期刊级别分类(Q1/Q2/Q3/Q4或该学科的分类标准)
- 主要研究领域和范围
- 与论文内容的匹配度评分(1-10分)
- 发表难度评估(容易/中等/困难/极难)
- 平均审稿周期
- 开放获取政策
- 期刊的学科分类(如SCI、SSCI、A&HCI等)
- 医学期刊特殊信息(如适用):
* PubMed收录情况
* 是否为核心临床期刊
* 专科领域权威性
* 循证医学等级要求
* 临床试验注册要求
* 伦理委员会批准要求

3. 按推荐优先级排序,并说明推荐理由
4. 提供针对性的投稿建议,考虑该学科的特点

论文分析结果:"""

        for q in self.questions:
            if q.id in self.analysis_results:
                journal_prompt += f"\n\n{q.description}:\n{self.analysis_results[q.id]}"

        journal_prompt += "\n\n请提供详细的期刊推荐报告,重点关注期刊的层次性和适配性。请根据论文的具体学科领域,采用该领域通用的期刊评价标准和分类体系。"

        try:
            response = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=journal_prompt,
                inputs_show_user="生成期刊推荐报告",
                llm_kwargs=self.llm_kwargs,
                chatbot=self.chatbot,
                history=[],
                sys_prompt="你是一个资深的跨学科学术期刊推荐专家,熟悉各个学科领域不同层次的期刊。请根据论文的具体学科和创新性,推荐从顶级到入门级的各层次期刊。不同学科有不同的期刊评价标准:理工科重视影响因子和SCI收录,社会科学重视SSCI和学科声誉,人文学科重视A&HCI和同行评议,医学领域重视PubMed收录、临床实用性、循证医学价值和伦理规范。请根据论文所属学科采用相应的评价标准。"
            )

            if response:
                return response
            return "期刊推荐生成失败"

        except Exception as e:
            self.chatbot.append(["错误", f"生成期刊推荐时出错: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return "期刊推荐生成失败: " + str(e)

    def _generate_conference_recommendations(self) -> Generator:
        """生成会议推荐"""
        self.chatbot.append(["生成会议推荐", "正在基于论文分析结果生成会议推荐..."])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 获取当前时间信息
        current_time = datetime.now()
        current_date_str = current_time.strftime("%Y年%m月%d日")
        current_year = current_time.year
        current_month = current_time.month

        # 构建会议推荐提示
        conference_prompt = f"""请基于以下论文分析结果,为这篇论文推荐合适的学术会议。

**重要提示:当前时间是{current_date_str}({current_year}年{current_month}月),请基于这个时间点推断会议的举办时间和投稿截止时间。**

推荐要求:
1. 根据论文的创新性和工作质量,分别推荐不同级别的会议:
- 顶级会议(该领域最权威的国际会议):2-3个
- 高质量会议(该领域知名的国际或区域会议):3-4个
- 中等会议(该领域认可的专业会议):3-4个
- 专业会议(该领域细分方向的专门会议):2-3个

注意:不同学科的会议评价标准不同:
- 计算机科学:可参考CCF分类(A/B/C类)
- 工程学:可参考EI收录和影响力
- 医学:可参考会议的临床影响和同行认可度
- 社会科学:可参考会议的学术声誉和参与度
- 人文学科:可参考会议的历史和学术传统
- 自然科学:可参考会议的国际影响力和发表质量

特别是医学会议,需要考虑:
- 临床医学会议重视实用性和临床指导价值
- 基础医学会议重视科学创新和机制研究
- 专科医学会议在各自领域内具有权威性
- 国际医学会议的CME学分认证情况

2. 对每个会议提供详细信息:
- 会议全名和缩写
- 会议级别分类(根据该学科的评价标准)
- 主要研究领域和主题
- 与论文内容的匹配度评分(1-10分)
- 录用难度评估(容易/中等/困难/极难)
- 会议举办周期(年会/双年会/不定期等)
- **基于当前时间{current_date_str},推断{current_year}年和{current_year+1}年的举办时间和地点**(请根据往年的举办时间规律进行推断)
- **基于推断的会议时间,估算论文提交截止时间**(通常在会议前3-6个月)
- 会议的国际化程度和影响范围
- 医学会议特殊信息(如适用):
* 是否提供CME学分
* 临床实践指导价值
* 专科认证机构认可情况
* 会议论文集的PubMed收录情况
* 伦理和临床试验相关要求

3. 按推荐优先级排序,并说明推荐理由
4. **基于当前时间{current_date_str},提供会议投稿的时间规划建议**
- 哪些会议可以赶上{current_year}年的投稿截止时间
- 哪些会议需要准备{current_year+1}年的投稿
- 具体的时间安排建议

论文分析结果:"""

        for q in self.questions:
            if q.id in self.analysis_results:
                conference_prompt += f"\n\n{q.description}:\n{self.analysis_results[q.id]}"

        conference_prompt += f"\n\n请提供详细的会议推荐报告,重点关注会议的层次性和时效性。请根据论文的具体学科领域,采用该领域通用的会议评价标准。\n\n**特别注意:请根据当前时间{current_date_str}和各会议的历史举办时间规律,准确推断{current_year}年和{current_year+1}年的会议时间安排,不要使用虚构的时间。**"

        try:
            response = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=conference_prompt,
                inputs_show_user="生成会议推荐报告",
                llm_kwargs=self.llm_kwargs,
                chatbot=self.chatbot,
                history=[],
                sys_prompt="你是一个资深的跨学科学术会议推荐专家,熟悉各个学科领域不同层次的学术会议。请根据论文的具体学科和创新性,推荐从顶级到专业级的各层次会议。不同学科有不同的会议评价标准和文化:理工科重视技术创新和国际影响力,社会科学重视理论贡献和社会意义,人文学科重视学术深度和文化价值,医学领域重视临床实用性、CME学分认证、专科权威性和伦理规范。请根据论文所属学科采用相应的评价标准和推荐策略。"
            )

            if response:
                return response
            return "会议推荐生成失败"

        except Exception as e:
            self.chatbot.append(["错误", f"生成会议推荐时出错: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return "会议推荐生成失败: " + str(e)

    def _generate_priority_summary(self, journal_recommendations: str, conference_recommendations: str) -> Generator:
        """生成优先级总结"""
        self.chatbot.append(["生成优先级总结", "正在生成投稿优先级总结..."])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 获取当前时间信息
        current_time = datetime.now()
        current_date_str = current_time.strftime("%Y年%m月%d日")
        current_month = current_time.strftime("%Y年%m月")

        # 计算未来时间点
        def add_months(date, months):
            """安全地添加月份"""
            month = date.month - 1 + months
            year = date.year + month // 12
            month = month % 12 + 1
            day = min(date.day, calendar.monthrange(year, month)[1])
            return date.replace(year=year, month=month, day=day)
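        # 行为示意(假设日期,仅作说明):add_months(datetime(2024, 1, 31), 1) -> datetime(2024, 2, 29),
        # 月末日期会收缩到目标月份的最后一天,避免直接 date.replace 换月时抛出 ValueError。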
        future_6_months = add_months(current_time, 6).strftime('%Y年%m月')
        future_12_months = add_months(current_time, 12).strftime('%Y年%m月')
        future_year = (current_time.year + 1)

        priority_prompt = f"""请基于以下期刊和会议推荐结果,生成一个综合的投稿优先级总结。

**重要提示:当前时间是{current_date_str}({current_month}),请基于这个时间点制定投稿计划。**

期刊推荐结果:
{journal_recommendations}

会议推荐结果:
{conference_recommendations}

请提供:
1. 综合投稿策略建议(考虑该学科的发表文化和惯例)
- 期刊优先还是会议优先(不同学科有不同偏好)
- 国际期刊/会议 vs 国内期刊/会议的选择策略
- 英文发表 vs 中文发表的考虑

2. 按时间线排列的投稿计划(**基于当前时间{current_date_str},考虑截止时间和审稿周期**)
- 短期目标({current_month}起3-6个月内,即到{future_6_months})
- 中期目标(6-12个月内,即到{future_12_months})
- 长期目标(1年以上,即{future_year}年以后)

3. 风险分散策略
- 同时投稿多个不同级别的目标
- 考虑该学科的一稿多投政策
- 备选方案和应急策略

4. 针对论文可能需要的改进建议
- 根据目标期刊/会议的要求调整内容
- 语言和格式的优化建议
- 补充实验或分析的建议

5. 预期的发表时间线和成功概率评估(基于当前时间{current_date_str})

6. 该学科特有的发表注意事项
- 伦理审查要求(如医学、心理学等)
- 数据开放要求(如某些自然科学领域)
- 利益冲突声明(如医学、工程等)
- 医学领域特殊要求:
* 临床试验注册要求(ClinicalTrials.gov、中国临床试验注册中心等)
* 患者知情同意和隐私保护
* 医学伦理委员会批准证明
* CONSORT、STROBE、PRISMA等报告规范遵循
* 药物/器械安全性数据要求
* CME学分认证相关要求
* 临床指南和循证医学等级要求
- 其他学科特殊要求

请以表格形式总结前10个最推荐的投稿目标(期刊+会议),包括优先级排序、预期时间线和成功概率。

**注意:所有时间规划都应基于当前时间{current_date_str}进行计算,不要使用虚构的时间。**"""

        try:
            response = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=priority_prompt,
                inputs_show_user="生成投稿优先级总结",
                llm_kwargs=self.llm_kwargs,
                chatbot=self.chatbot,
                history=[],
                sys_prompt="你是一个资深的跨学科学术发表策略专家,熟悉各个学科的发表文化、惯例和要求。请综合考虑不同学科的特点:理工科通常重视期刊发表和影响因子,社会科学平衡期刊和专著,人文学科重视同行评议和学术声誉,医学重视临床意义和伦理规范。请为作者制定最适合其学科背景的投稿策略和时间规划。"
            )

            if response:
                return response
            return "优先级总结生成失败"

        except Exception as e:
            self.chatbot.append(["错误", f"生成优先级总结时出错: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return "优先级总结生成失败: " + str(e)

    def save_recommendations(self, journal_recommendations: str, conference_recommendations: str, priority_summary: str) -> Generator:
        """保存推荐报告"""
        timestamp = time.strftime("%Y%m%d_%H%M%S")

        # 保存为Markdown文件
        try:
            md_content = f"""# 论文期刊会议推荐报告

## 投稿优先级总结

{priority_summary}

## 期刊推荐

{journal_recommendations}

## 会议推荐

{conference_recommendations}

---

# 详细分析结果
"""

            # 添加详细分析结果
            for q in self.questions:
                if q.id in self.analysis_results:
                    md_content += f"\n\n## {q.description}\n\n{self.analysis_results[q.id]}"

            result_file = write_history_to_file(
                history=[md_content],
                file_basename=f"期刊会议推荐_{timestamp}.md"
            )

            if result_file and os.path.exists(result_file):
                promote_file_to_downloadzone(result_file, chatbot=self.chatbot)
                self.chatbot.append(["保存成功", f"推荐报告已保存至: {os.path.basename(result_file)}"])
                yield from update_ui(chatbot=self.chatbot, history=self.history)
            else:
                self.chatbot.append(["警告", "保存报告成功但找不到文件"])
                yield from update_ui(chatbot=self.chatbot, history=self.history)
        except Exception as e:
            self.chatbot.append(["警告", f"保存报告失败: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)

    def recommend_venues(self, paper_path: str) -> Generator:
        """推荐期刊会议主流程"""
        # 加载论文
        success = yield from self._load_paper(paper_path)
        if not success:
            return

        # 分析关键问题
        for question in self.questions:
            yield from self._analyze_question(question)

        # 分别生成期刊和会议推荐
        journal_recommendations = yield from self._generate_journal_recommendations()
        conference_recommendations = yield from self._generate_conference_recommendations()

        # 生成优先级总结
        priority_summary = yield from self._generate_priority_summary(journal_recommendations, conference_recommendations)

        # 显示结果
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 保存报告
        yield from self.save_recommendations(journal_recommendations, conference_recommendations, priority_summary)

        # 将完整的分析结果和推荐内容添加到历史记录中,方便用户继续提问
        self._add_to_history(journal_recommendations, conference_recommendations, priority_summary)

    def _add_to_history(self, journal_recommendations: str, conference_recommendations: str, priority_summary: str):
        """将分析结果和推荐内容添加到历史记录中"""
        try:
            # 构建完整的内容摘要
            history_content = f"""# 论文期刊会议推荐分析完成

## 📊 投稿优先级总结
{priority_summary}

## 📚 期刊推荐
{journal_recommendations}

## 🏛️ 会议推荐
{conference_recommendations}

## 📋 详细分析结果
"""

            # 添加详细分析结果
            for q in self.questions:
                if q.id in self.analysis_results:
                    history_content += f"\n### {q.description}\n{self.analysis_results[q.id]}\n"

            history_content += "\n---\n💡 您现在可以基于以上分析结果继续提问,比如询问特定期刊的详细信息、投稿策略建议、或者对推荐结果的进一步解释。"

            # 添加到历史记录中
            self.history.append("论文期刊会议推荐分析")
            self.history.append(history_content)

            self.chatbot.append(["✅ 分析完成", "所有分析结果和推荐内容已添加到对话历史中,您可以继续基于这些内容提问。"])

        except Exception as e:
            self.chatbot.append(["警告", f"添加到历史记录时出错: {str(e)},但推荐报告已正常生成"])
            # 即使添加历史失败,也不影响主要功能


def _find_paper_file(path: str) -> str:
    """查找路径中的论文文件(简化版)"""
    if os.path.isfile(path):
        return path

    # 支持的文件扩展名(按优先级排序)
    extensions = ["pdf", "docx", "doc", "txt", "md", "tex"]

    # 简单地遍历目录
    if os.path.isdir(path):
        try:
            for ext in extensions:
                # 手动检查每个可能的文件,而不使用glob
                potential_file = os.path.join(path, f"paper.{ext}")
                if os.path.exists(potential_file) and os.path.isfile(potential_file):
                    return potential_file

            # 如果没找到特定命名的文件,检查目录中的所有文件
            for file in os.listdir(path):
                file_path = os.path.join(path, file)
                if os.path.isfile(file_path):
                    file_ext = file.split('.')[-1].lower() if '.' in file else ""
                    if file_ext in extensions:
                        return file_path
        except Exception:
            pass  # 忽略任何错误

    return None
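
# 查找顺序示意(假设的目录内容,仅作说明):对于目录 /tmp/upload,
# 先检查 paper.pdf、paper.docx 等固定命名,再回退为目录中第一个扩展名受支持的文件;
# 两者都未命中时返回 None。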

def download_paper_by_id(paper_info, chatbot, history) -> str:
    """下载论文并返回保存路径

    Args:
        paper_info: 元组,包含论文ID类型(arxiv或doi)和ID值
        chatbot: 聊天机器人对象
        history: 历史记录

    Returns:
        str: 下载的论文路径或None
    """
    id_type, paper_id = paper_info

    # 创建保存目录 - 使用时间戳创建唯一文件夹
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    user_name = chatbot.get_user() if hasattr(chatbot, 'get_user') else "default"
    from toolbox import get_log_folder, get_user
    base_save_dir = get_log_folder(get_user(chatbot), plugin_name='paper_download')
    save_dir = os.path.join(base_save_dir, f"papers_{timestamp}")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = Path(save_dir)

    chatbot.append(["下载论文", f"正在下载{'arXiv' if id_type == 'arxiv' else 'DOI'} {paper_id} 的论文..."])
    # 原实现直接调用 update_ui 而未迭代生成器,界面不会刷新;改为 yield from 使更新生效
    yield from update_ui(chatbot=chatbot, history=history)

    pdf_path = None

    try:
        if id_type == 'arxiv':
            # 使用改进的arxiv查询方法
            formatted_id = format_arxiv_id(paper_id)
            paper_result = get_arxiv_paper(formatted_id)

            if not paper_result:
                chatbot.append(["下载失败", f"未找到arXiv论文: {paper_id}"])
                yield from update_ui(chatbot=chatbot, history=history)
                return None

            # 下载PDF
            filename = f"arxiv_{paper_id.replace('/', '_')}.pdf"
            pdf_path = str(save_path / filename)
            paper_result.download_pdf(filename=pdf_path)

        else:  # doi
            # 下载DOI
            sci_hub = SciHub(
                doi=paper_id,
                path=save_path
            )
            pdf_path = sci_hub.fetch()

        # 检查下载结果
        if pdf_path and os.path.exists(pdf_path):
            promote_file_to_downloadzone(pdf_path, chatbot=chatbot)
            chatbot.append(["下载成功", f"已成功下载论文: {os.path.basename(pdf_path)}"])
            yield from update_ui(chatbot=chatbot, history=history)
            return pdf_path
        else:
            chatbot.append(["下载失败", f"论文下载失败: {paper_id}"])
            yield from update_ui(chatbot=chatbot, history=history)
            return None

    except Exception as e:
        chatbot.append(["下载错误", f"下载论文时出错: {str(e)}"])
        yield from update_ui(chatbot=chatbot, history=history)
        return None


@CatchException
def 论文期刊会议推荐(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
              history: List, system_prompt: str, user_request: str):
    """主函数 - 论文期刊会议推荐"""
    # 初始化推荐器
    chatbot.append(["函数插件功能及使用方式", "论文期刊会议推荐:基于论文内容分析,为您推荐合适的学术期刊和会议投稿目标。适用于各个学科专业(自然科学、工程技术、医学、社会科学、人文学科等),根据不同学科的评价标准和发表文化,提供分层次的期刊会议推荐、影响因子分析、发表难度评估、投稿策略建议等。<br><br>📋 使用方式:<br>1、直接上传PDF文件<br>2、输入DOI号或arXiv ID<br>3、点击插件开始分析"])
    yield from update_ui(chatbot=chatbot, history=history)

    paper_file = None

    # 检查输入是否为论文ID(arxiv或DOI)
    paper_info = extract_paper_id(txt)

    if paper_info:
        # 如果是论文ID,下载论文
        chatbot.append(["检测到论文ID", f"检测到{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'} ID: {paper_info[1]},准备下载论文..."])
        yield from update_ui(chatbot=chatbot, history=history)

        # 下载论文(download_paper_by_id 现在是生成器,需用 yield from 获取返回值)
        paper_file = yield from download_paper_by_id(paper_info, chatbot, history)

        if not paper_file:
            report_exception(chatbot, history, a="下载论文失败", b=f"无法下载{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'}论文: {paper_info[1]}")
            yield from update_ui(chatbot=chatbot, history=history)
            return
    else:
        # 检查输入路径
        if not os.path.exists(txt):
            report_exception(chatbot, history, a=f"解析论文: {txt}", b=f"找不到文件或无权访问: {txt}")
            yield from update_ui(chatbot=chatbot, history=history)
            return

        # 验证路径安全性
        user_name = chatbot.get_user()
        validate_path_safety(txt, user_name)

        # 查找论文文件
        paper_file = _find_paper_file(txt)

        if not paper_file:
            report_exception(chatbot, history, a="解析论文", b=f"在路径 {txt} 中未找到支持的论文文件")
            yield from update_ui(chatbot=chatbot, history=history)
            return

    yield from update_ui(chatbot=chatbot, history=history)

    # 确保paper_file是字符串
    if paper_file is not None and not isinstance(paper_file, str):
        # 尝试转换为字符串
        try:
            paper_file = str(paper_file)
        except Exception:
            report_exception(chatbot, history, a="类型错误", b=f"论文路径不是有效的字符串: {type(paper_file)}")
            yield from update_ui(chatbot=chatbot, history=history)
            return

    # 开始推荐
    chatbot.append(["开始分析", f"正在分析论文并生成期刊会议推荐: {os.path.basename(paper_file)}"])
    yield from update_ui(chatbot=chatbot, history=history)

    recommender = JournalConferenceRecommender(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
    yield from recommender.recommend_venues(paper_file)
295
crazy_functions/paper_fns/paper_download.py
Normal file
@@ -0,0 +1,295 @@
import re
import os
import zipfile
from toolbox import CatchException, update_ui, promote_file_to_downloadzone, get_log_folder, get_user

from pathlib import Path
from datetime import datetime

def extract_paper_id(txt):
    """从输入文本中提取论文ID"""
    # 尝试匹配DOI(将DOI匹配提前,因为其格式更加明确)
    doi_patterns = [
        r'doi.org/([\w\./-]+)',       # doi.org/10.1234/xxx
        r'doi:\s*([\w\./-]+)',        # doi: 10.1234/xxx
        r'(10\.\d{4,}/[\w\.-]+)',     # 直接输入DOI: 10.1234/xxx
    ]

    for pattern in doi_patterns:
        match = re.search(pattern, txt, re.IGNORECASE)
        if match:
            return ('doi', match.group(1))

    # 尝试匹配arXiv ID
    arxiv_patterns = [
        r'arxiv.org/abs/(\d+\.\d+)',      # arxiv.org/abs/2103.14030
        r'arxiv.org/pdf/(\d+\.\d+)',      # arxiv.org/pdf/2103.14030
        r'arxiv/(\d+\.\d+)',              # arxiv/2103.14030
        r'^(\d{4}\.\d{4,5})$',            # 直接输入ID: 2103.14030
        # 添加对早期arXiv ID的支持
        r'arxiv.org/abs/([\w-]+/\d{7})',  # arxiv.org/abs/math/0211159
        r'arxiv.org/pdf/([\w-]+/\d{7})',  # arxiv.org/pdf/hep-th/9901001
        r'^([\w-]+/\d{7})$',              # 直接输入: math/0211159
    ]

    for pattern in arxiv_patterns:
        match = re.search(pattern, txt, re.IGNORECASE)
        if match:
            paper_id = match.group(1)
            # 如果是新格式(YYMM.NNNNN)或旧格式(category/NNNNNNN),都直接返回
            if re.match(r'^\d{4}\.\d{4,5}$', paper_id) or re.match(r'^[\w-]+/\d{7}$', paper_id):
                return ('arxiv', paper_id)

    return None
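
# 匹配示意(假设输入,仅作说明):
#   extract_paper_id('https://arxiv.org/abs/2103.14030')  -> ('arxiv', '2103.14030')
#   extract_paper_id('doi: 10.1038/s41586-021-03819-2')   -> ('doi', '10.1038/s41586-021-03819-2')
#   extract_paper_id('math/0211159')                      -> ('arxiv', 'math/0211159')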

def extract_paper_ids(txt):
    """从输入文本中提取多个论文ID"""
    paper_ids = []

    # 首先按换行符分割
    for line in txt.strip().split('\n'):
        line = line.strip()
        if not line:  # 跳过空行
            continue

        # 对每一行再按空格分割
        for item in line.split():
            item = item.strip()
            if not item:  # 跳过空项
                continue
            paper_info = extract_paper_id(item)
            if paper_info:
                paper_ids.append(paper_info)

    # 去除重复项,保持顺序
    unique_paper_ids = []
    seen = set()
    for paper_info in paper_ids:
        if paper_info not in seen:
            seen.add(paper_info)
            unique_paper_ids.append(paper_info)

    return unique_paper_ids
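
# 批量解析示意(假设输入,仅作说明):混合换行与空格的输入
#   "2103.14030 math/0211159\n10.1038/s41586-021-03819-2"
# 会被解析为三个去重后的 (类型, ID) 元组,且保持输入顺序。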

def format_arxiv_id(paper_id):
    """格式化arXiv ID,处理新旧两种格式"""
    # 如果是旧格式 (e.g. astro-ph/0404140),需要去掉arxiv:前缀
    if '/' in paper_id:
        return paper_id.replace('arxiv:', '')  # 确保移除可能存在的arxiv:前缀
    return paper_id
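
# 行为示意(仅作说明):format_arxiv_id('arxiv:math/0211159') -> 'math/0211159';
# 新格式ID(如 '2103.14030')不含斜杠,原样返回。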

def get_arxiv_paper(paper_id):
    """获取arXiv论文,处理新旧两种格式"""
    import arxiv

    # 尝试不同的查询方式
    query_formats = [
        paper_id,                    # 原始ID
        paper_id.replace('/', ''),   # 移除斜杠
        f"id:{paper_id}",            # 添加id:前缀
    ]

    for query in query_formats:
        try:
            # 使用Search查询
            search = arxiv.Search(
                query=query,
                max_results=1
            )
            result = next(arxiv.Client().results(search))
            if result:
                return result
        except Exception:
            pass  # Search查询失败时,继续用同一个query尝试id_list查询(原先的continue会跳过下面的尝试)

        try:
            # 使用id_list查询
            search = arxiv.Search(id_list=[query])
            result = next(arxiv.Client().results(search))
            if result:
                return result
        except Exception:
            continue

    return None
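
# 调用示意(仅作说明):get_arxiv_paper('2103.14030') 会依次尝试原始ID、
# 去斜杠ID和 'id:' 前缀三种查询写法,任一命中即返回 arxiv.Result,全部失败返回 None。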

def create_zip_archive(files, save_path):
    """将多个PDF文件打包成zip"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_filename = f"papers_{timestamp}.zip"
    zip_path = str(save_path / zip_filename)

    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in files:
            if os.path.exists(file):
                # 只添加文件名,不包含路径
                zipf.write(file, os.path.basename(file))

    return zip_path
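
# 用法示意(假设路径,仅作说明):
#   zip_path = create_zip_archive(['/tmp/a.pdf', '/tmp/b.pdf'], Path('/tmp'))
# 会生成 /tmp/papers_<时间戳>.zip,压缩包内只保留文件名,不包含原始目录层级。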

@CatchException
def 论文下载(txt: str, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
    """
    txt: 用户输入,可以是DOI、arxiv ID或相关链接,支持多行输入进行批量下载
    """
    from crazy_functions.doc_fns.text_content_loader import TextContentLoader
    from crazy_functions.review_fns.data_sources.arxiv_source import ArxivSource
    from crazy_functions.review_fns.data_sources.scihub_source import SciHub
    # 解析输入
    paper_infos = extract_paper_ids(txt)
    if not paper_infos:
        chatbot.append(["输入解析", "未能识别任何论文ID或DOI,请检查输入格式。支持以下格式:\n- arXiv ID (例如:2103.14030)\n- arXiv链接\n- DOI (例如:10.1234/xxx)\n- DOI链接\n\n多个论文ID请用换行分隔。"])
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # 创建保存目录 - 使用时间戳创建唯一文件夹
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_save_dir = get_log_folder(get_user(chatbot), plugin_name='paper_download')
    save_dir = os.path.join(base_save_dir, f"papers_{timestamp}")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = Path(save_dir)

    # 记录下载结果
    success_count = 0
    failed_papers = []
    downloaded_files = []  # 记录成功下载的文件路径

    chatbot.append(["开始下载", f"支持多行输入下载多篇论文,共检测到 {len(paper_infos)} 篇论文,开始下载..."])
    yield from update_ui(chatbot=chatbot, history=history)

    for id_type, paper_id in paper_infos:
        try:
            if id_type == 'arxiv':
                chatbot.append(["正在下载", f"从arXiv下载论文 {paper_id}..."])
                yield from update_ui(chatbot=chatbot, history=history)

                # 使用改进的arxiv查询方法
                formatted_id = format_arxiv_id(paper_id)
                paper_result = get_arxiv_paper(formatted_id)

                if not paper_result:
                    failed_papers.append((paper_id, "未找到论文"))
                    continue

                # 下载PDF
                try:
                    filename = f"arxiv_{paper_id.replace('/', '_')}.pdf"
                    pdf_path = str(save_path / filename)
                    paper_result.download_pdf(filename=pdf_path)
                    if os.path.exists(pdf_path):
                        downloaded_files.append(pdf_path)
                except Exception as e:
                    failed_papers.append((paper_id, f"PDF下载失败: {str(e)}"))
                    continue

            else:  # doi
                chatbot.append(["正在下载", f"从Sci-Hub下载论文 {paper_id}..."])
                yield from update_ui(chatbot=chatbot, history=history)

                sci_hub = SciHub(
                    doi=paper_id,
                    path=save_path
                )
                pdf_path = sci_hub.fetch()
                if pdf_path and os.path.exists(pdf_path):
                    downloaded_files.append(pdf_path)

            # 检查下载结果
            if pdf_path and os.path.exists(pdf_path):
                promote_file_to_downloadzone(pdf_path, chatbot=chatbot)
                success_count += 1
            else:
                failed_papers.append((paper_id, "下载失败"))

        except Exception as e:
            failed_papers.append((paper_id, str(e)))

        yield from update_ui(chatbot=chatbot, history=history)

    # 创建ZIP压缩包
    if downloaded_files:
        try:
            zip_path = create_zip_archive(downloaded_files, Path(base_save_dir))
            promote_file_to_downloadzone(zip_path, chatbot=chatbot)
            chatbot.append([
                "创建压缩包",
                f"已将所有下载的论文打包为: {os.path.basename(zip_path)}"
            ])
            yield from update_ui(chatbot=chatbot, history=history)
        except Exception as e:
            chatbot.append([
                "创建压缩包失败",
                f"打包文件时出现错误: {str(e)}"
            ])
            yield from update_ui(chatbot=chatbot, history=history)

    # 生成最终报告
    summary = f"下载完成!成功下载 {success_count} 篇论文。\n"
    if failed_papers:
        summary += "\n以下论文下载失败:\n"
        for paper_id, reason in failed_papers:
            summary += f"- {paper_id}: {reason}\n"

    if downloaded_files:
        summary += f"\n所有论文已存放在文件夹 '{save_dir}' 中,并打包到压缩文件中。您可以在下载区找到单个PDF文件和压缩包。"

    chatbot.append([
        "下载完成",
        summary
    ])
    yield from update_ui(chatbot=chatbot, history=history)

    # 如果下载成功且用户想要直接阅读内容
    if downloaded_files:
        chatbot.append([
            "提示",
            "正在读取论文内容进行分析,请稍候..."
        ])
        yield from update_ui(chatbot=chatbot, history=history)

        # 使用TextContentLoader加载整个文件夹的PDF文件内容
        loader = TextContentLoader(chatbot, history)

        # 删除提示信息
        chatbot.pop()

        # 加载PDF内容 - 传入文件夹路径而不是单个文件路径
        yield from loader.execute(save_dir)

        # 添加提示信息
        chatbot.append([
            "提示",
            "论文内容已加载完毕,您可以直接向AI提问有关该论文的问题。"
        ])
        yield from update_ui(chatbot=chatbot, history=history)


if __name__ == "__main__":
    # 测试代码
    import asyncio

    async def test():
        # 测试批量输入
        batch_inputs = [
            # 换行分隔的测试
            """https://arxiv.org/abs/2103.14030
math/0211159
10.1038/s41586-021-03819-2""",

            # 空格分隔的测试
            "https://arxiv.org/abs/2103.14030 math/0211159 10.1038/s41586-021-03819-2",

            # 混合分隔的测试
            """https://arxiv.org/abs/2103.14030 math/0211159
10.1038/s41586-021-03819-2 https://doi.org/10.1038/s41586-021-03819-2
2103.14030""",
        ]

        for i, test_input in enumerate(batch_inputs, 1):
            print(f"\n测试用例 {i}:")
            print(f"输入: {test_input}")
            results = extract_paper_ids(test_input)
            print("解析结果:")
            for result in results:
                print(f"  {result}")

    asyncio.run(test())
867
crazy_functions/paper_fns/reduce_aigc.py
Normal file
@@ -0,0 +1,867 @@
import os
import time
import glob
import re
import threading
from typing import Dict, List, Generator, Tuple
from dataclasses import dataclass

from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from crazy_functions.rag_fns.rag_file_support import extract_text, convert_to_markdown
from request_llms.bridge_all import model_info
from toolbox import update_ui, CatchException, report_exception, promote_file_to_downloadzone, write_history_to_file
from shared_utils.fastapi_server import validate_path_safety

# 新增:导入结构化论文提取器
from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import PaperStructureExtractor, ExtractorConfig, StructuredPaper

# 导入格式化器
from crazy_functions.paper_fns.file2file_doc import (
    TxtFormatter,
    MarkdownFormatter,
    HtmlFormatter,
    WordFormatter
)

@dataclass
class TextFragment:
    """文本片段数据类,用于组织处理单元"""
    content: str
    fragment_index: int
    total_fragments: int


class DocumentProcessor:
    """文档处理器 - 处理单个文档并输出结果"""

    def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str):
        """初始化处理器"""
        self.llm_kwargs = llm_kwargs.copy()  # 创建原始llm_kwargs的副本
        # 固定使用deepseek-reasoner模型
        self.llm_kwargs['llm_model'] = "deepseek-reasoner"
        self.plugin_kwargs = plugin_kwargs
        self.chatbot = chatbot
        self.history = history
        self.system_prompt = system_prompt
        self.processed_results = []
        self.failed_fragments = []
        # 新增:初始化论文结构提取器
        self.paper_extractor = PaperStructureExtractor()
        # 修改:设置降重次数,默认为1次(原为3次)
        self.reduction_times = int(self.plugin_kwargs.get("reduction_times", 1))

    def _get_token_limit(self) -> int:
        """获取模型token限制,返回更小的值以确保更细粒度的分割"""
        max_token = model_info[self.llm_kwargs['llm_model']]['max_token']
        # 降低token限制,使每个片段更小
        return max_token // 4  # 从3/4降低到1/4

    def _create_batch_inputs(self, fragments: List[TextFragment], current_iteration: int = 1) -> Tuple[List, List, List]:
        """创建批处理输入"""
        inputs_array = []
        inputs_show_user_array = []
        history_array = []

        # 基础指令 - 增强单次调用的效果
        user_instruction = self.plugin_kwargs.get("advanced_arg", """你是一个人类科研工作者,请对以下学术文本进行深度改写,以彻底消除AI生成特征。具体要求如下:

1. 保持学术写作的严谨性和专业性,但加入一些人类写作的不完美元素
2. 维持原文的核心论述和逻辑框架,但打破过于完美的结构
3. 彻底改变句式结构:
- 灵活运用主动句与被动句,偏好使用主动语态
- 适当使用不同长度的句子,包括一些较长的复杂句
- 引入变化多样的句式,打破规律性
- 完全避免AI常用的模板化句式和套路表达
- 增加一些学术写作中常见的转折和连接方式
4. 全面改善用词:
- 使用更多学术语境下的专业词汇和同义词替换
- 避免过于机械和规律性的连接词,使用更自然的过渡
- 重构专业术语的表达方式,但保持准确性
- 增加词汇多样性,减少重复用词
- 偶尔使用一些不太常见但恰当的词汇
5. 模拟真实学者的写作风格:
- 注重论证的严密性,但允许存在一些微小的不对称性
- 保持表达的客观性,同时适度体现个人学术见解
- 在适当位置表达观点时更加自信和坚定
- 避免过于完美和机械均衡的论述结构
- 允许段落长度有所变化,不要过于均匀
6. 引入人类学者常见的写作特点:
- 段落之间的过渡更加自然流畅
- 适当使用一些学术界常见的修辞手法,但不过度使用
- 偶尔使用一些强调和限定性表达
- 适当使用一些学术界认可的个人化表达
7. 彻底消除AI痕迹:
- 避免过于规整和均衡的段落结构
- 避免机械性的句式变化和词汇替换模式
- 避免过于完美的逻辑推导,适当增加一些转折
- 减少公式化的表达方式""")

        # 对于单次调用的场景,不需要迭代前缀,直接使用更强力的改写指令
        for frag in fragments:
            # 在单次调用时使用更强力的指令
            if self.reduction_times == 1:
                i_say = (f'请对以下学术文本进行彻底改写,完全消除AI特征,使其像真实人类学者撰写的内容。\n\n{user_instruction}\n\n'
                         f'请记住以下几点:\n'
                         f'1. 避免过于规整和均衡的结构\n'
                         f'2. 引入一些人类写作的微小不完美之处\n'
                         f'3. 使用多样化的句式和词汇\n'
                         f'4. 打破可能的AI规律性表达模式\n'
                         f'5. 适当使用一些专业领域内的表达习惯\n\n'
                         f'请将对文本的处理结果放在<decision>和</decision>标签之间。\n\n'
                         f'文本内容:\n```\n{frag.content}\n```')
            else:
                # 原有的迭代前缀逻辑
                iteration_prefix = ""
                if current_iteration > 1:
                    iteration_prefix = f"这是第{current_iteration}次改写,请在保持学术性的基础上,采用更加人性化、不同的表达方式。"
                    if current_iteration == 2:
                        iteration_prefix += "在保持专业性的同时,进一步优化句式结构和用词,显著降低AI痕迹。"
                    elif current_iteration >= 3:
                        iteration_prefix += "请在确保不损失任何学术内容的前提下,彻底重构表达方式,并适当引入少量人类学者常用的表达技巧,避免过度使用比喻和类比。"

                i_say = (f'请按照以下要求处理文本内容:{iteration_prefix}{user_instruction}\n\n'
                         f'请将对文本的处理结果放在<decision>和</decision>标签之间。\n\n'
                         f'文本内容:\n```\n{frag.content}\n```')

            i_say_show_user = f'正在处理文本片段 {frag.fragment_index + 1}/{frag.total_fragments}'

            inputs_array.append(i_say)
            inputs_show_user_array.append(i_say_show_user)
            history_array.append([])

        return inputs_array, inputs_show_user_array, history_array

    def _extract_decision(self, text: str) -> str:
        """从LLM响应中提取<decision>标签内的内容"""
        import re
        pattern = r'<decision>(.*?)</decision>'
        matches = re.findall(pattern, text, re.DOTALL)

        if matches:
            return matches[0].strip()
        else:
            # 如果没有找到标签,返回原始文本
            return text.strip()
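
    # 提取示意(仅作说明):
    #   _extract_decision('前言<decision>改写后的文本</decision>后记') -> '改写后的文本'
    #   无 <decision> 标签时返回去除首尾空白的原文,避免降重流程因格式偏差中断。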

    def process_file(self, file_path: str) -> Generator:
        """处理单个文件"""
        self.chatbot.append(["开始处理文件", f"文件路径: {file_path}"])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        try:
            # 首先尝试转换为Markdown
            file_path = convert_to_markdown(file_path)

            # 1. 检查文件是否为支持的论文格式
            is_paper_format = any(file_path.lower().endswith(ext) for ext in self.paper_extractor.SUPPORTED_EXTENSIONS)

            if is_paper_format:
                # 使用结构化提取器处理论文
                return (yield from self._process_structured_paper(file_path))
            else:
                # 使用原有方式处理普通文档
                return (yield from self._process_regular_file(file_path))

        except Exception as e:
            self.chatbot.append(["处理错误", f"文件处理失败: {str(e)}"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return None

    def _process_structured_paper(self, file_path: str) -> Generator:
        """处理结构化论文文件"""
        # 1. 提取论文结构
        self.chatbot[-1] = ["正在分析论文结构", f"文件路径: {file_path}"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        try:
            paper = self.paper_extractor.extract_paper_structure(file_path)

            if not paper or not paper.sections:
                self.chatbot.append(["无法提取论文结构", "将使用全文内容进行处理"])
                yield from update_ui(chatbot=self.chatbot, history=self.history)

                # 使用全文内容进行段落切分
                if paper and paper.full_text:
                    # 使用增强的分割函数进行更细致的分割
                    fragments = self._breakdown_section_content(paper.full_text)

                    # 创建文本片段对象
                    text_fragments = []
                    for i, frag in enumerate(fragments):
                        if frag.strip():
                            text_fragments.append(TextFragment(
                                content=frag,
                                fragment_index=i,
                                total_fragments=len(fragments)
                            ))

                    # 多次降重处理
                    if text_fragments:
                        current_fragments = text_fragments

                        # 进行多轮降重处理
                        for iteration in range(1, self.reduction_times + 1):
                            # 处理当前片段
                            processed_content = yield from self._process_text_fragments(current_fragments, iteration)

                            # 如果这是最后一次迭代,保存结果
                            if iteration == self.reduction_times:
                                final_content = processed_content
                                break

                            # 否则,准备下一轮迭代的片段
                            # 从处理结果中提取处理后的内容
                            next_fragments = []
                            for idx, item in enumerate(self.processed_results):
                                next_fragments.append(TextFragment(
                                    content=item['content'],
                                    fragment_index=idx,
                                    total_fragments=len(self.processed_results)
                                ))

                            current_fragments = next_fragments

                        # 更新UI显示最终结果
                        self.chatbot[-1] = ["处理完成", f"共完成 {self.reduction_times} 轮降重"]
                        yield from update_ui(chatbot=self.chatbot, history=self.history)

                        return final_content
                    else:
                        self.chatbot.append(["处理失败", "未能提取到有效的文本内容"])
                        yield from update_ui(chatbot=self.chatbot, history=self.history)
                        return None
                else:
                    self.chatbot.append(["处理失败", "未能提取到论文内容"])
                    yield from update_ui(chatbot=self.chatbot, history=self.history)
                    return None

            # 2. 准备处理章节内容(不处理标题)
            self.chatbot[-1] = ["已提取论文结构", f"共 {len(paper.sections)} 个主要章节"]
            yield from update_ui(chatbot=self.chatbot, history=self.history)

            # 3. 收集所有需要处理的章节内容并分割为合适大小
            sections_to_process = []
            section_map = {}  # 用于映射处理前后的内容

            def collect_section_contents(sections, parent_path=""):
                """递归收集章节内容,跳过参考文献部分"""
                for i, section in enumerate(sections):
                    current_path = f"{parent_path}/{i}" if parent_path else f"{i}"

                    # 检查是否为参考文献部分,如果是则跳过
                    if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']:
                        continue  # 跳过参考文献部分

                    # 只处理内容非空的章节
                    if section.content and section.content.strip():
                        # 使用增强的分割函数进行更细致的分割
                        fragments = self._breakdown_section_content(section.content)

                        for fragment_idx, fragment_content in enumerate(fragments):
                            if fragment_content.strip():
                                fragment_index = len(sections_to_process)
                                sections_to_process.append(TextFragment(
                                    content=fragment_content,
                                    fragment_index=fragment_index,
                                    total_fragments=0  # 临时值,稍后更新
                                ))

                                # 保存映射关系,用于稍后更新章节内容
                                # 为每个片段存储原始章节和片段索引信息
                                section_map[fragment_index] = (current_path, section, fragment_idx, len(fragments))

                    # 递归处理子章节
                    if section.subsections:
                        collect_section_contents(section.subsections, current_path)

            # 收集所有章节内容
            collect_section_contents(paper.sections)

            # 更新总片段数
            total_fragments = len(sections_to_process)
            for frag in sections_to_process:
                frag.total_fragments = total_fragments

            # 4. 如果没有内容需要处理,直接返回
            if not sections_to_process:
                self.chatbot.append(["处理完成", "未找到需要处理的内容"])
                yield from update_ui(chatbot=self.chatbot, history=self.history)
                return None

            # 5. 批量处理章节内容
            self.chatbot[-1] = ["开始处理论文内容", f"共 {len(sections_to_process)} 个内容片段"]
            yield from update_ui(chatbot=self.chatbot, history=self.history)

            # 一次性准备所有输入
            inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(sections_to_process)

            # 使用系统提示
            instruction = self.plugin_kwargs.get("advanced_arg", """请对以下学术文本进行彻底改写,以显著降低AI生成特征。具体要求如下:

1. 保持学术写作的严谨性和专业性
2. 维持原文的核心论述和逻辑框架
3. 优化句式结构:
- 灵活运用主动句与被动句
- 适当拆分复杂句式,提高可读性
- 注意句式的多样性,避免重复模式
- 打破AI常用的句式模板
4. 改善用词:
- 使用更多学术语境下的同义词替换
- 避免过于机械和规律性的连接词
- 适当调整专业术语的表达方式
- 增加词汇多样性,减少重复用词
5. 增强文本的学术特征:
- 注重论证的严密性
- 保持表达的客观性
- 适度体现作者的学术见解
- 避免过于完美和均衡的论述结构
6. 确保语言风格的一致性
7. 减少AI生成文本常见的套路和模式""")
            sys_prompt_array = ["""作为一位专业的学术写作顾问,请按照以下要求改写文本:

1. 严格保持学术写作规范
2. 维持原文的核心论述和逻辑框架
3. 通过优化句式结构和用词降低AI生成特征
4. 确保语言风格的一致性和专业性
5. 保持内容的客观性和准确性
6. 避免AI常见的套路化表达和过于完美的结构"""] * len(sections_to_process)

            # 调用LLM一次性处理所有片段
            response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                inputs_array=inputs_array,
                inputs_show_user_array=inputs_show_user_array,
                llm_kwargs=self.llm_kwargs,
                chatbot=self.chatbot,
                history_array=history_array,
                sys_prompt_array=sys_prompt_array,
            )

            # 处理响应,重组章节内容
            section_contents = {}  # 用于重组各章节的处理后内容

            for j, frag in enumerate(sections_to_process):
                try:
                    llm_response = response_collection[j * 2 + 1]
                    processed_text = self._extract_decision(llm_response)

                    if processed_text and processed_text.strip():
                        # 保存处理结果
                        self.processed_results.append({
                            'index': frag.fragment_index,
                            'content': processed_text
                        })

                        # 存储处理后的文本片段,用于后续重组
                        fragment_index = frag.fragment_index
                        if fragment_index in section_map:
                            path, section, fragment_idx, total_fragments = section_map[fragment_index]

                            # 初始化此章节的内容容器(如果尚未创建)
                            if path not in section_contents:
                                section_contents[path] = [""] * total_fragments

                            # 将处理后的片段放入正确位置
                            section_contents[path][fragment_idx] = processed_text
                    else:
                        self.failed_fragments.append(frag)
                except Exception as e:
                    self.failed_fragments.append(frag)

            # 重组每个章节的内容
            for path, fragments in section_contents.items():
                section = None
                for idx in section_map:
                    if section_map[idx][0] == path:
                        section = section_map[idx][1]
                        break

                if section:
                    # 合并该章节的所有处理后片段
                    section.content = "\n".join(fragments)

            # 6. 更新UI
            success_count = total_fragments - len(self.failed_fragments)
            self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段"]
            yield from update_ui(chatbot=self.chatbot, history=self.history)

            # 收集参考文献部分(不进行处理)
            references_sections = []

            def collect_references(sections, parent_path=""):
                """递归收集参考文献部分"""
                for i, section in enumerate(sections):
                    current_path = f"{parent_path}/{i}" if parent_path else f"{i}"

                    # 检查是否为参考文献部分
                    if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']:
                        references_sections.append((current_path, section))

                    # 递归检查子章节
                    if section.subsections:
                        collect_references(section.subsections, current_path)

            # 收集参考文献
            collect_references(paper.sections)

            # 7. 将处理后的结构化论文转换为Markdown
            markdown_content = self.paper_extractor.generate_markdown(paper)

            # 8. 返回处理后的内容
            self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段,参考文献部分未处理"]
            yield from update_ui(chatbot=self.chatbot, history=self.history)

            return markdown_content

        except Exception as e:
            self.chatbot.append(["结构化处理失败", f"错误: {str(e)},将尝试作为普通文件处理"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return (yield from self._process_regular_file(file_path))

    def _process_regular_file(self, file_path: str) -> Generator:
        """使用原有方式处理普通文件"""
        # 原有的文件处理逻辑
        self.chatbot[-1] = ["正在读取文件", f"文件路径: {file_path}"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        content = extract_text(file_path)
        if not content or not content.strip():
            self.chatbot.append(["处理失败", "文件内容为空或无法提取内容"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return None

        # 2. 分割文本
        self.chatbot[-1] = ["正在分析文件", "将文件内容分割为适当大小的片段"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 使用增强的分割函数
        fragments = self._breakdown_section_content(content)

        # 3. 创建文本片段对象
        text_fragments = []
        for i, frag in enumerate(fragments):
            if frag.strip():
                text_fragments.append(TextFragment(
                    content=frag,
                    fragment_index=i,
                    total_fragments=len(fragments)
                ))

        # 4. 多轮降重处理
        if not text_fragments:
            self.chatbot.append(["处理失败", "未能提取到有效的文本内容"])
            yield from update_ui(chatbot=self.chatbot, history=self.history)
            return None

        # 批处理大小
        batch_size = 8  # 每批处理的片段数

        # 第一次迭代
        current_batches = []
        for i in range(0, len(text_fragments), batch_size):
            current_batches.append(text_fragments[i:i + batch_size])

        all_processed_fragments = []

        # 进行多轮降重处理
        for iteration in range(1, self.reduction_times + 1):
            self.chatbot[-1] = ["开始处理文本", f"第 {iteration}/{self.reduction_times} 次降重"]
            yield from update_ui(chatbot=self.chatbot, history=self.history)

            next_batches = []
            all_processed_fragments = []

            # 分批处理当前迭代的片段
            for batch in current_batches:
                # 处理当前批次
                _ = yield from self._process_text_fragments(batch, iteration)

                # 收集处理结果
                processed_batch = []
                for item in self.processed_results:
                    processed_batch.append(TextFragment(
                        content=item['content'],
                        fragment_index=len(all_processed_fragments) + len(processed_batch),
                        total_fragments=0  # 临时值,稍后更新
                    ))

                all_processed_fragments.extend(processed_batch)

                # 如果不是最后一轮迭代,准备下一批次
                if iteration < self.reduction_times:
                    for i in range(0, len(processed_batch), batch_size):
                        next_batches.append(processed_batch[i:i + batch_size])

            # 更新总片段数
            for frag in all_processed_fragments:
                frag.total_fragments = len(all_processed_fragments)

            # 为下一轮迭代准备批次
            current_batches = next_batches

        # 合并最终结果
        final_content = "\n\n".join([frag.content for frag in all_processed_fragments])

        # 5. 更新UI显示最终结果
        self.chatbot[-1] = ["处理完成", f"共完成 {self.reduction_times} 轮降重"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        return final_content

    def save_results(self, content: str, original_file_path: str) -> List[str]:
        """保存处理结果为TXT格式"""
        if not content:
            return []

        timestamp = time.strftime("%Y%m%d_%H%M%S")
        original_filename = os.path.basename(original_file_path)
        filename_without_ext = os.path.splitext(original_filename)[0]
        base_filename = f"{filename_without_ext}_processed_{timestamp}"

        result_files = []

        # 只保存为TXT
        try:
            txt_formatter = TxtFormatter()
            txt_content = txt_formatter.create_document(content)
            txt_file = write_history_to_file(
                history=[txt_content],
                file_basename=f"{base_filename}.txt"
            )
            result_files.append(txt_file)
        except Exception as e:
            self.chatbot.append(["警告", f"TXT格式保存失败: {str(e)}"])

        # 添加到下载区
        for file in result_files:
            promote_file_to_downloadzone(file, chatbot=self.chatbot)

        return result_files

    def _breakdown_section_content(self, content: str) -> List[str]:
        """对文本内容进行分割与合并

        主要按段落进行组织,只合并较小的段落以减少片段数量
        保留原始段落结构,不对长段落进行强制分割
        针对中英文设置不同的阈值,因为字符密度不同
        """
        # 先按段落分割文本
        paragraphs = content.split('\n\n')

        # 检测语言类型
        chinese_char_count = sum(1 for char in content if '\u4e00' <= char <= '\u9fff')
        is_chinese_text = chinese_char_count / max(1, len(content)) > 0.3

        # 根据语言类型设置不同的阈值(只用于合并小段落)
        if is_chinese_text:
            # 中文文本:一个汉字就是一个字符,信息密度高
            min_chunk_size = 300  # 段落合并的最小阈值
            target_size = 800  # 理想的段落大小
        else:
            # 英文文本:一个单词由多个字符组成,信息密度低
            min_chunk_size = 600  # 段落合并的最小阈值
            target_size = 1600  # 理想的段落大小

        # 1. 只合并小段落,不对长段落进行分割
        result_fragments = []
        current_chunk = []
        current_length = 0

        for para in paragraphs:
            # 如果段落太小且不会超过目标大小,则合并
            if len(para) < min_chunk_size and current_length + len(para) <= target_size:
                current_chunk.append(para)
                current_length += len(para)
            # 否则,创建新段落
            else:
                # 如果当前块非空且与当前段落无关,先保存它
                if current_chunk and current_length > 0:
                    result_fragments.append('\n\n'.join(current_chunk))

                # 当前段落作为新块
                current_chunk = [para]
                current_length = len(para)

            # 如果当前块大小已接近目标大小,保存并开始新块
            if current_length >= target_size:
                result_fragments.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_length = 0

        # 保存最后一个块
        if current_chunk:
            result_fragments.append('\n\n'.join(current_chunk))

        # 2. 处理可能过大的片段(确保不超过token限制)
        final_fragments = []
        max_token = self._get_token_limit()

        for fragment in result_fragments:
            # 检查fragment是否可能超出token限制
            # 根据语言类型调整token估算
            if is_chinese_text:
                estimated_tokens = len(fragment) / 1.5  # 中文每个token约1-2个字符
            else:
                estimated_tokens = len(fragment) / 4  # 英文每个token约4个字符

            if estimated_tokens > max_token:
                # 即使可能超出限制,也尽量保持段落的完整性
                # 使用breakdown_text,并把分割粒度放宽到接近token上限,以减少分割次数
                larger_limit = max_token * 0.95  # 预留5%的安全余量
                sub_fragments = breakdown_text_to_satisfy_token_limit(
                    txt=fragment,
                    limit=larger_limit,
                    llm_model=self.llm_kwargs['llm_model']
                )
                final_fragments.extend(sub_fragments)
            else:
                final_fragments.append(fragment)

        return final_fragments
|
||||
|
||||
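The 30% CJK-ratio heuristic above can be checked in isolation; a minimal sketch with illustrative sample strings:

# Illustrative only: the same CJK-ratio language check on toy inputs.
for content in ["深度学习是机器学习的一个分支。",
                "Deep learning is a branch of machine learning."]:
    chinese = sum(1 for ch in content if '\u4e00' <= ch <= '\u9fff')
    print(chinese / max(1, len(content)) > 0.3)  # True, then False
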
    def _process_text_fragments(self, text_fragments: List[TextFragment], current_iteration: int = 1) -> str:
        """Process text fragments, supporting multiple rewriting rounds.

        Args:
            text_fragments: the text fragments to process
            current_iteration: which rewriting round this is

        Returns:
            the processed text content
        """
        self.chatbot[-1] = ["开始处理文本", f"第 {current_iteration}/{self.reduction_times} 次降重,共 {len(text_fragments)} 个片段"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # Reset the result buffers for this iteration
        self.processed_results = []
        self.failed_fragments = []

        # Prepare all inputs at once
        inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(text_fragments, current_iteration)

        # Special handling for the single-pass case: use a more forceful system prompt
        if self.reduction_times == 1:
            sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。

你的任务是对学术文本进行深度改写,以彻底消除AI特征,使其具有真实人类学者的写作特点:

1. 打破AI常见的规整句式结构和词汇选择模式
2. 引入学术写作中的自然变化和不规则性
3. 使用多样化的句式和段落结构,避免明显的平衡性和规律性
4. 加入人类写作中常见的微小不完美之处,如段落长度变化、论述强调点不均等
5. 在维持核心论点的同时,完全重构表达方式
6. 适当使用学科领域内专家常用的表达习惯和术语
7. 确保文本读起来自然流畅,避免机械感
8. 打破AI生成内容常见的逻辑结构和表达套路
9. 根据文本内容,在保持专业性的基础上增加写作的特色和个性

对于每个文本片段,尽可能彻底地重构,使其无法被识别为AI生成。"""

            sys_prompt_array = [sys_prompt] * len(text_fragments)

        else:
            # Original multi-round logic:
            # scale the system prompt's strength with the iteration number
            base_instruction = self.plugin_kwargs.get("advanced_arg", """你是一个人类科研工作者,请对以下学术文本进行彻底改写,以显著降低AI生成特征。具体要求如下:

1. 保持学术写作的严谨性和专业性
2. 维持原文的核心论述和逻辑框架
3. 优化句式结构:
   - 灵活运用主动句与被动句
   - 适当拆分复杂句式,提高可读性
   - 注意句式的多样性,避免重复模式
   - 打破AI常用的句式模板
4. 改善用词:
   - 使用更多学术语境下的同义词替换
   - 避免过于机械和规律性的连接词
   - 适当调整专业术语的表达方式
   - 增加词汇多样性,减少重复用词
5. 增强文本的学术特征:
   - 注重论证的严密性
   - 保持表达的客观性
   - 适度体现作者的学术见解
   - 避免过于完美和均衡的论述结构
6. 确保语言风格的一致性
7. 减少AI生成文本常见的套路和模式""")

            # Increase strength with each iteration
            if current_iteration == 1:
                # First iteration: base strength
                instruction = base_instruction
                sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第{current_iteration}轮深度改写:

1. 严格保持学术写作规范
2. 维持原文的核心论述和逻辑框架
3. 通过优化句式结构和用词降低AI生成特征
4. 确保语言风格的一致性和专业性
5. 保持内容的客观性和准确性
6. 避免AI常见的套路化表达和过于完美的结构"""
            elif current_iteration == 2:
                # Second iteration: medium strength
                instruction = base_instruction + """这是第二次改写,请进一步优化:

1. 更深入地调整句式结构,增加表达多样性
2. 在保持准确性的前提下,替换更多专业术语的表达方式
3. 优化段落结构,增强逻辑连贯性
4. 适当调整学术论证的表达方式
5. 打破AI生成内容的常见模式,如过于均衡的结构和规律性的表达方式"""
                sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第二轮改写:

1. 在保持严谨性的基础上,优化表达方式
2. 改善文本结构,提升学术论述的清晰度
3. 增强专业术语使用的灵活性
4. 确保论证过程的完整性和逻辑性
5. 进一步减少AI生成内容的特征和模式
6. 使文本表现出更自然、更人性化的写作风格"""
            else:
                # Third and later iterations: high strength
                instruction = base_instruction + f"""这是第{current_iteration}次改写,需要在保持学术性的前提下彻底重构文本:

1. 全面重构句式,采用不同的表达策略
2. 系统性替换学术用语,保持专业准确性
3. 重组段落结构,优化论证层次
4. 深化学术观点的表达方式
5. 增强文本的逻辑严密性
6. 提升论述的专业深度
7. 确保不遗漏任何学术观点和论证要素
8. 适当使用学术表达技巧(如精准举例、恰当转折等),但不过分依赖比喻和类比
9. 彻底消除AI生成内容的特征,使文本具有真实学者的写作风格"""
                sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第{current_iteration}轮深度改写:

1. 在保持专业水准的前提下,彻底重构表达方式,引入长难句
2. 确保学术论证的严密性和完整性
3. 优化专业术语的运用
4. 提升文本的学术价值
5. 保持论述的逻辑性和连贯性
6. 适当使用少量学术表达技巧,提高文本说服力,但避免过度使用比喻和类比
7. 消除所有明显的AI生成痕迹,使文本更接近真实学者的写作风格"""

            sys_prompt_array = [sys_prompt] * len(text_fragments)

        # One LLM call handles every fragment
        response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
            inputs_array=inputs_array,
            inputs_show_user_array=inputs_show_user_array,
            llm_kwargs=self.llm_kwargs,
            chatbot=self.chatbot,
            history_array=history_array,
            sys_prompt_array=sys_prompt_array,
        )

        # Handle the responses
        for j, frag in enumerate(text_fragments):
            try:
                llm_response = response_collection[j * 2 + 1]
                processed_text = self._extract_decision(llm_response)

                if processed_text and processed_text.strip():
                    self.processed_results.append({
                        'index': frag.fragment_index,
                        'content': processed_text
                    })
                else:
                    self.failed_fragments.append(frag)
                    self.processed_results.append({
                        'index': frag.fragment_index,
                        'content': frag.content
                    })
            except Exception as e:
                self.failed_fragments.append(frag)
                self.processed_results.append({
                    'index': frag.fragment_index,
                    'content': frag.content
                })

        # Merge the results in their original order
        self.processed_results.sort(key=lambda x: x['index'])
        final_content = "\n".join([item['content'] for item in self.processed_results])

        # Update the UI
        success_count = len(text_fragments) - len(self.failed_fragments)
        self.chatbot[-1] = ["当前阶段处理完成", f"第 {current_iteration}/{self.reduction_times} 次降重,成功处理 {success_count}/{len(text_fragments)} 个片段"]
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        return final_content


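The `response_collection[j * 2 + 1]` indexing above appears to rely on the multi-thread helper returning prompts and replies interleaved; a minimal sketch of that assumed layout (the sample data is illustrative):

# Illustrative only: assuming the helper returns
# [input_0, reply_0, input_1, reply_1, ...], reply j sits at index j * 2 + 1.
response_collection = ["frag 0 prompt", "frag 0 rewrite", "frag 1 prompt", "frag 1 rewrite"]
for j in range(2):
    print(response_collection[j * 2 + 1])  # "frag 0 rewrite", then "frag 1 rewrite"
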
@CatchException
def 学术降重(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
         history: List, system_prompt: str, user_request: str):
    """Main entry point - file-to-file processing."""
    # Initialization:
    # extract the number of rewriting rounds from the advanced argument
    if "advanced_arg" in plugin_kwargs and plugin_kwargs["advanced_arg"]:
        # Check whether the argument sets the number of rounds
        match = re.search(r'reduction_times\s*=\s*(\d+)', plugin_kwargs["advanced_arg"])
        if match:
            reduction_times = int(match.group(1))
            # Strip the reduction_times setting but keep the rest of the argument
            plugin_kwargs["advanced_arg"] = re.sub(r'reduction_times\s*=\s*\d+', '', plugin_kwargs["advanced_arg"]).strip()
            # Store it in plugin_kwargs as a separate parameter
            plugin_kwargs["reduction_times"] = reduction_times

    processor = DocumentProcessor(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
    chatbot.append(["函数插件功能", f"文件内容处理:将文档内容进行{processor.reduction_times}次降重处理"])

    # Explain the rewriting strategy to the user
    if processor.reduction_times == 1:
        chatbot.append(["降重策略", "将使用单次深度降重,这种方式能更有效地降低AI特征,减少查重率。我们采用特殊优化的提示词,通过一次性强力改写来实现降重效果。"])
    elif processor.reduction_times > 1:
        chatbot.append(["降重策略", f"将进行{processor.reduction_times}轮迭代降重,每轮降重都会基于上一轮的结果,并逐渐增加降重强度。请注意,多轮迭代可能会引入新的AI特征,单次强力降重通常效果更好。"])

    yield from update_ui(chatbot=chatbot, history=history)

    # Validate the input path
    if not os.path.exists(txt):
        report_exception(chatbot, history, a=f"解析路径: {txt}", b=f"找不到路径或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Validate path safety
    user_name = chatbot.get_user()
    validate_path_safety(txt, user_name)

    # Collect the file list
    if os.path.isfile(txt):
        # Single file
        file_paths = [txt]
    else:
        # Directory handling, similar to the batch file-question plugin
        project_folder = txt
        extract_folder = next((d for d in glob.glob(f'{project_folder}/*')
                               if os.path.isdir(d) and d.endswith('.extract')), project_folder)

        # Exclude archive files
        exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$'
        file_paths = [f for f in glob.glob(f'{extract_folder}/**', recursive=True)
                      if os.path.isfile(f) and not re.search(exclude_patterns, f)]

    # Keep only supported file formats
    file_paths = [f for f in file_paths if any(f.lower().endswith(ext) for ext in
                  list(processor.paper_extractor.SUPPORTED_EXTENSIONS) + ['.json', '.csv', '.xlsx', '.xls'])]

    if not file_paths:
        report_exception(chatbot, history, a=f"解析路径: {txt}", b="未找到支持的文件类型")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Process the files
    if len(file_paths) > 1:
        chatbot.append(["发现多个文件", f"共找到 {len(file_paths)} 个文件,将处理第一个文件"])
        yield from update_ui(chatbot=chatbot, history=history)

    # Only the first file is processed
    file_to_process = file_paths[0]
    processed_content = yield from processor.process_file(file_to_process)

    if processed_content:
        # Save the result
        result_files = processor.save_results(processed_content, file_to_process)

        if result_files:
            chatbot.append(["处理完成", f"已生成 {len(result_files)} 个结果文件"])
        else:
            chatbot.append(["处理完成", "但未能保存任何结果文件"])
    else:
        chatbot.append(["处理失败", "未能生成有效的处理结果"])

    yield from update_ui(chatbot=chatbot, history=history)
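The advanced-argument parsing above accepts a `reduction_times=N` setting mixed into free-form instructions; a minimal standalone sketch of the same regex behavior (the sample input is illustrative):

# Illustrative only: extracting reduction_times from a mixed advanced_arg.
import re

advanced_arg = "reduction_times=2 请保持术语不变"  # hypothetical user input
match = re.search(r'reduction_times\s*=\s*(\d+)', advanced_arg)
if match:
    reduction_times = int(match.group(1))  # 2
    remaining = re.sub(r'reduction_times\s*=\s*\d+', '', advanced_arg).strip()  # "请保持术语不变"
    print(reduction_times, remaining)
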
387
crazy_functions/paper_fns/wiki/wikipedia_api.py
Normal file
@@ -0,0 +1,387 @@
import aiohttp
import asyncio
from typing import List, Dict, Optional
import re
import random
import time

class WikipediaAPI:
    """Wikipedia API client implementation."""

    def __init__(self, language: str = "zh", user_agent: str = None,
                 max_concurrent: int = 5, request_delay: float = 0.5):
        """
        Initialize the Wikipedia API client.

        Args:
            language: language code (zh: Chinese, en: English, ja: Japanese, etc.)
            user_agent: user-agent string; a default is used when None
            max_concurrent: maximum number of concurrent requests
            request_delay: delay between requests in seconds
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)"
        self.headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json"
        }
        # Concurrency control
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.request_delay = request_delay
        self.last_request_time = 0

    async def _make_request(self, url, params=None):
        """
        Issue an API request with concurrency control and request throttling.

        Args:
            url: request URL
            params: request parameters

        Returns:
            the API response data, or None on failure
        """
        # Limit concurrency with the semaphore
        async with self.semaphore:
            # Retry in a loop rather than recursing, so a 429 retry does not
            # try to re-acquire the semaphore this task already holds
            while True:
                # Enforce the minimum interval between requests
                current_time = time.time()
                time_since_last_request = current_time - self.last_request_time
                if time_since_last_request < self.request_delay:
                    await asyncio.sleep(self.request_delay - time_since_last_request)

                # Add random jitter so requests are not perfectly periodic
                jitter = random.uniform(0, 0.2)
                await asyncio.sleep(jitter)

                # Record the time of this request
                self.last_request_time = time.time()

                # Issue the request
                try:
                    async with aiohttp.ClientSession(headers=self.headers) as session:
                        async with session.get(url, params=params) as response:
                            if response.status == 429:  # Too Many Requests
                                retry_after = int(response.headers.get('Retry-After', 5))
                                print(f"达到请求限制,等待 {retry_after} 秒后重试...")
                                await asyncio.sleep(retry_after)
                                continue

                            if response.status != 200:
                                print(f"API请求失败: HTTP {response.status}")
                                print(f"响应内容: {await response.text()}")
                                return None

                            return await response.json()
                except aiohttp.ClientError as e:
                    print(f"请求错误: {str(e)}")
                    return None

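A minimal usage sketch of the throttled client (illustrative only; assumes network access and that no event loop is already running):

# Illustrative only: issue one throttled search from synchronous code.
async def _demo():
    wiki = WikipediaAPI(language="en", max_concurrent=2, request_delay=1.0)
    results = await wiki.search("machine learning", limit=3)
    for r in results:
        print(r.get("title"))

# asyncio.run(_demo())  # uncomment to run
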
    async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]:
        """
        Search Wikipedia articles.

        Args:
            query: search keywords
            limit: number of results to return
            namespace: namespace (0 for articles, 14 for categories, etc.)

        Returns:
            list of search results
        """
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": limit,
            "srnamespace": namespace,
            "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle"
        }

        data = await self._make_request(self.base_url, params)
        if not data:
            return []

        search_results = data.get("query", {}).get("search", [])
        return search_results

    async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict:
        """
        Fetch the content of a Wikipedia page.

        Args:
            title: page title
            section: specific section number (optional)

        Returns:
            dictionary of page content
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "parse",
                "page": title,
                "format": "json",
                "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties"
            }

            # If a section is given, fetch only that section
            if section is not None:
                params["section"] = section

            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API请求失败: HTTP {response.status}")
                    return {}

                data = await response.json()
                if "error" in data:
                    print(f"API错误: {data['error'].get('info', '未知错误')}")
                    return {}

                return data.get("parse", {})

    async def get_summary(self, title: str, sentences: int = 3) -> str:
        """
        Fetch a page summary.

        Args:
            title: page title
            sentences: number of sentences to return

        Returns:
            the page summary text
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "prop": "extracts",
                "exintro": "1",
                "exsentences": sentences,
                "explaintext": "1",
                "titles": title,
                "format": "json"
            }

            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API请求失败: HTTP {response.status}")
                    return ""

                data = await response.json()
                pages = data.get("query", {}).get("pages", {})
                # Return the extract of the first page ID
                for page_id in pages:
                    return pages[page_id].get("extract", "")
                return ""

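The loop-and-return at the end of get_summary exists because the query API keys results by page ID rather than returning a list; roughly, the JSON has this shape (shown from memory, values illustrative):

# Illustrative response shape for action=query&prop=extracts:
data = {
    "query": {
        "pages": {
            "12345": {"pageid": 12345, "title": "深度学习", "extract": "……"}
        }
    }
}
# The page-ID key is unknown in advance, so iterating the dict and returning
# the first entry picks out the single requested page.
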
    async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]:
        """
        Fetch random articles.

        Args:
            count: number of random articles wanted
            namespace: namespace

        Returns:
            list of random articles
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "list": "random",
                "rnlimit": count,
                "rnnamespace": namespace,
                "format": "json"
            }

            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API请求失败: HTTP {response.status}")
                    return []

                data = await response.json()
                return data.get("query", {}).get("random", [])

    async def login(self, username: str, password: str) -> bool:
        """
        Log in with a Wikipedia account.

        Args:
            username: Wikipedia username
            password: Wikipedia password

        Returns:
            whether the login succeeded
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            # Fetch a login token
            params = {
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json"
            }

            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"获取登录令牌失败: HTTP {response.status}")
                    return False

                data = await response.json()
                login_token = data.get("query", {}).get("tokens", {}).get("logintoken")

            if not login_token:
                print("获取登录令牌失败")
                return False

            # Log in with the token
            login_params = {
                "action": "login",
                "lgname": username,
                "lgpassword": password,
                "lgtoken": login_token,
                "format": "json"
            }

            async with session.post(self.base_url, data=login_params) as login_response:
                login_data = await login_response.json()

                if login_data.get("login", {}).get("result") == "Success":
                    print(f"登录成功: {username}")
                    return True
                else:
                    print(f"登录失败: {login_data.get('login', {}).get('reason', '未知原因')}")
                    return False

    async def setup_oauth(self, consumer_token: str, consumer_secret: str,
                          access_token: str = None, access_secret: str = None) -> bool:
        """
        Set up OAuth authentication.

        Args:
            consumer_token: consumer token
            consumer_secret: consumer secret
            access_token: access token (optional)
            access_secret: access secret (optional)

        Returns:
            whether setup succeeded
        """
        try:
            # Requires the mwoauth library: pip install mwoauth
            import mwoauth
            import requests_oauthlib

            # Configure OAuth
            self.consumer_token = consumer_token
            self.consumer_secret = consumer_secret

            if access_token and access_secret:
                # Access tokens already available
                self.auth = requests_oauthlib.OAuth1(
                    consumer_token,
                    consumer_secret,
                    access_token,
                    access_secret
                )
                print("OAuth设置成功")
                return True
            else:
                # An access token must still be obtained
                # (normally via in-browser user authorization)
                print("请在开发环境中完成以下OAuth授权流程:")

                # Create the consumer
                consumer = mwoauth.Consumer(
                    consumer_token, consumer_secret
                )

                # Initiate the handshake
                redirect, request_token = mwoauth.initiate(
                    f"https://{self.language}.wikipedia.org/w/index.php",
                    consumer
                )

                print(f"请访问此URL授权应用: {redirect}")
                # A real application would now prompt the user to visit the URL,
                # enter the authorization code, and complete the flow
                return False
        except ImportError:
            print("请安装 mwoauth 库: pip install mwoauth")
            return False
        except Exception as e:
            print(f"设置OAuth时发生错误: {str(e)}")
            return False

async def example_usage():
    """Demonstrate how to use WikipediaAPI."""
    # Create a default Chinese-Wikipedia API client
    wiki_zh = WikipediaAPI(language="zh")

    try:
        # Example 1: basic search
        print("\n=== 示例1: 搜索维基百科 ===")
        results = await wiki_zh.search("人工智能", limit=3)

        for i, result in enumerate(results, 1):
            print(f"\n--- 结果 {i} ---")
            print(f"标题: {result.get('title')}")
            snippet = result.get('snippet', '')
            # Strip HTML tags
            snippet = re.sub(r'<.*?>', '', snippet)
            print(f"摘要: {snippet}")
            print(f"字数: {result.get('wordcount')}")
            print(f"大小: {result.get('size')} 字节")

        # Example 2: fetch a page summary
        print("\n=== 示例2: 获取页面摘要 ===")
        summary = await wiki_zh.get_summary("深度学习", sentences=2)
        print(f"深度学习摘要: {summary}")

        # Example 3: fetch page content
        print("\n=== 示例3: 获取页面内容 ===")
        content = await wiki_zh.get_page_content("机器学习")
        if content and "text" in content:
            text = content["text"].get("*", "")
            # Strip HTML tags for console display
            clean_text = re.sub(r'<.*?>', '', text)
            print(f"机器学习页面内容片段: {clean_text[:200]}...")

            # Show how many categories the page has
            categories = content.get("categories", [])
            print(f"分类数量: {len(categories)}")

            # Show how many links the page has
            links = content.get("links", [])
            print(f"链接数量: {len(links)}")

        # Example 4: fetch a specific section
        print("\n=== 示例4: 获取特定章节内容 ===")
        # Fetch the lead section (usually section 0)
        intro_content = await wiki_zh.get_page_content("人工智能", section=0)
        if intro_content and "text" in intro_content:
            intro_text = intro_content["text"].get("*", "")
            clean_intro = re.sub(r'<.*?>', '', intro_text)
            print(f"人工智能引言内容片段: {clean_intro[:200]}...")

        # Example 5: fetch random articles
        print("\n=== 示例5: 获取随机文章 ===")
        random_articles = await wiki_zh.get_random_articles(count=2)
        print("随机文章:")
        for i, article in enumerate(random_articles, 1):
            print(f"{i}. {article.get('title')}")

            # Show a one-sentence summary of each random article
            article_summary = await wiki_zh.get_summary(article.get('title'), sentences=1)
            print(f"  摘要: {article_summary[:100]}...")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())

if __name__ == "__main__":
    # Run the examples (asyncio is already imported at module level)
    asyncio.run(example_usage())