Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: add file and URL reading support to the predict function
- Add URL detection and web-page content extraction, with automatic extraction of page text
- Add file-path recognition and file content reading, supporting the private_upload path format
- Integrate WebTextExtractor for web-page content extraction
- Integrate TextContentLoader for local file reading
- Support combining a file path with a question for smarter handling (see the sketch after this list)

* back

* block unstable
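
For orientation, the file/URL handling described in the `feat` bullet above might look roughly like the sketch below. This is a minimal illustration, not the code added by this commit: `resolve_user_input`, `extract_web_text`, and `load_file_text` are hypothetical stand-ins for the real wiring around WebTextExtractor and TextContentLoader.

```python
# Minimal, self-contained sketch of the described dispatch; the stub functions below
# only stand in for WebTextExtractor / TextContentLoader named in this commit.
import re
from pathlib import Path

def extract_web_text(url: str) -> str:
    """Stand-in for WebTextExtractor: would fetch the page and return its text."""
    return f"<text extracted from {url}>"

def load_file_text(path: str) -> str:
    """Stand-in for TextContentLoader: reads a locally uploaded file."""
    return Path(path).read_text(encoding="utf-8", errors="ignore")

def resolve_user_input(txt: str) -> str:
    """Prepend extracted web or file content to the user's question."""
    url_match = re.search(r"https?://\S+", txt)
    if url_match:
        url = url_match.group(0)
        question = txt.replace(url, "").strip()
        page_text = extract_web_text(url)
        return f"{page_text}\n\n{question}" if question else page_text
    path_match = re.search(r"\S*private_upload/\S+", txt)
    if path_match:
        path = path_match.group(0)
        question = txt.replace(path, "").strip()
        file_text = load_file_text(path)
        return f"{file_text}\n\n{question}" if question else file_text
    return txt
```

In the actual plugin, the extracted content is combined with the user's question before being passed on to the model.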

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
binary-husky
2025-08-23 15:59:22 +08:00
committed by GitHub
parent 65a4cf59c2
commit 8042750d41
79 changed files with 20850 additions and 57 deletions


@@ -0,0 +1,412 @@
import asyncio
from datetime import datetime
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from crazy_functions.review_fns.query_analyzer import SearchCriteria
from crazy_functions.review_fns.data_sources.arxiv_source import ArxivSource
from crazy_functions.review_fns.data_sources.semantic_source import SemanticScholarSource
from crazy_functions.review_fns.data_sources.pubmed_source import PubMedSource
from crazy_functions.review_fns.paper_processor.paper_llm_ranker import PaperLLMRanker
from crazy_functions.pdf_fns.breakdown_pdf_txt import cut_from_end_to_satisfy_token_limit
from request_llms.bridge_all import model_info
from crazy_functions.review_fns.data_sources.crossref_source import CrossrefSource
from crazy_functions.review_fns.data_sources.adsabs_source import AdsabsSource
from toolbox import get_conf
class BaseHandler(ABC):
"""处理器基类"""
def __init__(self, arxiv: ArxivSource, semantic: SemanticScholarSource, llm_kwargs: Dict = None):
self.arxiv = arxiv
self.semantic = semantic
self.pubmed = PubMedSource()
self.crossref = CrossrefSource() # Crossref instance
self.adsabs = AdsabsSource() # ADS instance
self.paper_ranker = PaperLLMRanker(llm_kwargs=llm_kwargs)
self.ranked_papers = [] # stores the ranked list of papers
self.llm_kwargs = llm_kwargs or {} # keep llm_kwargs for later use
def _get_search_params(self, plugin_kwargs: Dict) -> Dict:
"""Build the search parameters"""
return {
'max_papers': plugin_kwargs.get('max_papers', 100), # maximum number of papers
'min_year': plugin_kwargs.get('min_year', 2015), # earliest publication year
'search_multiplier': plugin_kwargs.get('search_multiplier', 3), # search multiplier
}
@abstractmethod
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> List[List[str]]:
"""处理查询"""
pass
async def _search_arxiv(self, params: Dict, limit_multiplier: int = 1, min_year: int = 2015) -> List:
"""使用arXiv专用参数搜索"""
try:
original_limit = params.get("limit", 20)
params["limit"] = original_limit * limit_multiplier
papers = []
# Try a basic search first
query = params.get("query", "")
if query:
papers = await self.arxiv.search(
query,
limit=params["limit"],
sort_by=params.get("sort_by", "relevance"),
sort_order=params.get("sort_order", "descending"),
start_year=min_year
)
# If the basic search returns nothing, fall back to category search
if not papers:
categories = params.get("categories", [])
for category in categories:
category_papers = await self.arxiv.search_by_category(
category,
limit=params["limit"],
sort_by=params.get("sort_by", "relevance"),
sort_order=params.get("sort_order", "descending"),
)
if category_papers:
papers.extend(category_papers)
return papers or []
except Exception as e:
print(f"arXiv搜索出错: {str(e)}")
return []
async def _search_semantic(self, params: Dict, limit_multiplier: int = 1, min_year: int = 2015) -> List:
"""使用Semantic Scholar专用参数搜索"""
try:
original_limit = params.get("limit", 20)
params["limit"] = original_limit * limit_multiplier
# 只使用基本的搜索参数
papers = await self.semantic.search(
query=params.get("query", ""),
limit=params["limit"]
)
# Filter by year in memory
if papers and min_year:
papers = [p for p in papers if getattr(p, 'year', 0) and p.year >= min_year]
return papers or []
except Exception as e:
print(f"Semantic Scholar搜索出错: {str(e)}")
return []
async def _search_pubmed(self, params: Dict, limit_multiplier: int = 1, min_year: int = 2015) -> List:
"""使用PubMed专用参数搜索"""
try:
# 如果不需要PubMed搜索直接返回空列表
if params.get("search_type") == "none":
return []
original_limit = params.get("limit", 20)
params["limit"] = original_limit * limit_multiplier
papers = []
# Pick the search method based on the search type
if params.get("search_type") == "basic":
papers = await self.pubmed.search(
query=params.get("query", ""),
limit=params["limit"],
start_year=min_year
)
elif params.get("search_type") == "author":
papers = await self.pubmed.search_by_author(
author=params.get("query", ""),
limit=params["limit"],
start_year=min_year
)
elif params.get("search_type") == "journal":
papers = await self.pubmed.search_by_journal(
journal=params.get("query", ""),
limit=params["limit"],
start_year=min_year
)
return papers or []
except Exception as e:
print(f"PubMed搜索出错: {str(e)}")
return []
async def _search_crossref(self, params: Dict, limit_multiplier: int = 1, min_year: int = 2015) -> List:
"""使用Crossref专用参数搜索"""
try:
original_limit = params.get("limit", 20)
params["limit"] = original_limit * limit_multiplier
papers = []
# 根据搜索类型选择搜索方法
if params.get("search_type") == "basic":
papers = await self.crossref.search(
query=params.get("query", ""),
limit=params["limit"],
start_year=min_year
)
elif params.get("search_type") == "author":
papers = await self.crossref.search_by_authors(
authors=[params.get("query", "")],
limit=params["limit"],
start_year=min_year
)
elif params.get("search_type") == "journal":
# Journal search is not implemented yet
pass
return papers or []
except Exception as e:
print(f"Crossref搜索出错: {str(e)}")
return []
async def _search_adsabs(self, params: Dict, limit_multiplier: int = 1, min_year: int = 2015) -> List:
"""使用ADS专用参数搜索"""
try:
original_limit = params.get("limit", 20)
params["limit"] = original_limit * limit_multiplier
papers = []
# Run the search
if params.get("search_type") == "basic":
papers = await self.adsabs.search(
query=params.get("query", ""),
limit=params["limit"],
start_year=min_year
)
return papers or []
except Exception as e:
print(f"ADS搜索出错: {str(e)}")
return []
async def _search_all_sources(self, criteria: SearchCriteria, search_params: Dict) -> List:
"""从所有数据源搜索论文"""
search_tasks = []
# # 检查是否需要执行PubMed搜索
# is_using_pubmed = criteria.pubmed_params.get("search_type") != "none" and criteria.pubmed_params.get("query") != "none"
is_using_pubmed = False # 开源版本不再搜索pubmed
# 如果使用PubMed则只执行PubMed和Semantic Scholar搜索
if is_using_pubmed:
search_tasks.append(
self._search_pubmed(
criteria.pubmed_params,
limit_multiplier=search_params['search_multiplier'],
min_year=criteria.start_year
)
)
# The Semantic Scholar search always runs
search_tasks.append(
self._search_semantic(
criteria.semantic_params,
limit_multiplier=search_params['search_multiplier'],
min_year=criteria.start_year
)
)
else:
# Run the Crossref search when its parameters are provided
if criteria.crossref_params.get("search_type") != "none" and criteria.crossref_params.get("query") != "none":
search_tasks.append(
self._search_crossref(
criteria.crossref_params,
limit_multiplier=search_params['search_multiplier'],
min_year=criteria.start_year
)
)
search_tasks.append(
self._search_arxiv(
criteria.arxiv_params,
limit_multiplier=search_params['search_multiplier'],
min_year=criteria.start_year
)
)
if get_conf("SEMANTIC_SCHOLAR_KEY"):
search_tasks.append(
self._search_semantic(
criteria.semantic_params,
limit_multiplier=search_params['search_multiplier'],
min_year=criteria.start_year
)
)
# Run all required search tasks
papers = await asyncio.gather(*search_tasks)
# Merge papers from all sources and count how many came from each
all_papers = []
source_counts = {
'arxiv': 0,
'semantic': 0,
'pubmed': 0,
'crossref': 0,
'adsabs': 0
}
for source_papers in papers:
if source_papers:
for paper in source_papers:
source = getattr(paper, 'source', 'unknown')
if source in source_counts:
source_counts[source] += 1
all_papers.extend(source_papers)
# Print the paper count for each source
print("\n=== 各数据源找到的论文数量 ===")
for source, count in source_counts.items():
if count > 0: # only print sources that returned papers
print(f"{source.capitalize()}: {count}")
print(f"总计: {len(all_papers)}")
print("===========================\n")
return all_papers
def _format_paper_time(self, paper) -> str:
"""格式化论文时间信息"""
year = getattr(paper, 'year', None)
if not year:
return ""
# Use the exact publication date when available
if hasattr(paper, 'published') and paper.published:
return f"(发表于 {paper.published.strftime('%Y-%m')})"
# Otherwise show the year only
return f"({year})"
def _format_papers(self, papers: List) -> str:
"""Format the paper list, using a token limit to control the length"""
formatted = []
for i, paper in enumerate(papers, 1):
# Keep only the first three authors
authors = paper.authors[:3]
if len(paper.authors) > 3:
authors.append("et al.")
# Build all possible download links
download_links = []
# Add arXiv links
if hasattr(paper, 'doi') and paper.doi:
if paper.doi.startswith("10.48550/arXiv."):
# Extract the full arXiv ID from the DOI
arxiv_id = paper.doi.split("arXiv.")[-1]
# Clean up stray dots to ensure a valid ID
arxiv_id = arxiv_id.replace("..", ".") # remove duplicated dots
if arxiv_id.startswith("."): # remove a leading dot
arxiv_id = arxiv_id[1:]
if arxiv_id.endswith("."): # remove a trailing dot
arxiv_id = arxiv_id[:-1]
download_links.append(f"[arXiv PDF](https://arxiv.org/pdf/{arxiv_id}.pdf)")
download_links.append(f"[arXiv Page](https://arxiv.org/abs/{arxiv_id})")
elif "arxiv.org/abs/" in paper.doi:
# Extract the arXiv ID directly from the URL
arxiv_id = paper.doi.split("arxiv.org/abs/")[-1]
if "v" in arxiv_id: # strip the version suffix
arxiv_id = arxiv_id.split("v")[0]
download_links.append(f"[arXiv PDF](https://arxiv.org/pdf/{arxiv_id}.pdf)")
download_links.append(f"[arXiv Page](https://arxiv.org/abs/{arxiv_id})")
else:
download_links.append(f"[DOI](https://doi.org/{paper.doi})")
# Add the direct URL if present and not already covered by the links above
if hasattr(paper, 'url') and paper.url:
if not any(paper.url in link for link in download_links):
download_links.append(f"[Source]({paper.url})")
# Build the download-link string
download_section = " | ".join(download_links) if download_links else "No direct download link available"
# Build the source information
source_info = []
if hasattr(paper, 'venue_type') and paper.venue_type and paper.venue_type != 'preprint':
source_info.append(f"Type: {paper.venue_type}")
if hasattr(paper, 'venue_name') and paper.venue_name:
source_info.append(f"Venue: {paper.venue_name}")
# Add impact factor and journal division information
if hasattr(paper, 'if_factor') and paper.if_factor:
source_info.append(f"IF: {paper.if_factor}")
if hasattr(paper, 'cas_division') and paper.cas_division:
source_info.append(f"中科院分区: {paper.cas_division}")
if hasattr(paper, 'jcr_division') and paper.jcr_division:
source_info.append(f"JCR分区: {paper.jcr_division}")
if hasattr(paper, 'venue_info') and paper.venue_info:
if paper.venue_info.get('journal_ref'):
source_info.append(f"Journal Reference: {paper.venue_info['journal_ref']}")
if paper.venue_info.get('publisher'):
source_info.append(f"Publisher: {paper.venue_info['publisher']}")
# Build the formatted text for this paper
paper_text = (
f"{i}. **{paper.title}**\n" +
f" Authors: {', '.join(authors)}\n" +
f" Year: {paper.year}\n" +
f" Citations: {paper.citations if paper.citations else 'N/A'}\n" +
(f" Source: {'; '.join(source_info)}\n" if source_info else "") +
# Add PubMed-specific information
(f" MeSH Terms: {'; '.join(paper.mesh_terms)}\n" if hasattr(paper,
'mesh_terms') and paper.mesh_terms else "") +
f" 📥 PDF Downloads: {download_section}\n" +
f" Abstract: {paper.abstract}\n"
)
formatted.append(paper_text)
full_text = "\n".join(formatted)
# Set the token limit according to the model
model_name = getattr(self, 'llm_kwargs', {}).get('llm_model', 'gpt-3.5-turbo')
token_limit = model_info[model_name]['max_token'] * 3 // 4
# Trim the text to fit within the token limit
return cut_from_end_to_satisfy_token_limit(full_text, limit=token_limit, reserve_token=0, llm_model=model_name)
def _get_current_time(self) -> str:
"""获取当前时间信息"""
now = datetime.now()
return now.strftime("%Y年%m月%d")
def _generate_apology_prompt(self, criteria: SearchCriteria) -> str:
"""生成道歉提示"""
return f"""很抱歉,我们未能找到与"{criteria.main_topic}"相关的有效文献。
可能的原因:
1. 搜索词过于具体或专业
2. 时间范围限制过严
建议解决方案:
1. 尝试使用更通用的关键词
2. 扩大搜索时间范围
3. 使用同义词或相关术语
请根据以上建议调整后重试。"""
def get_ranked_papers(self) -> str:
"""获取排序后的论文列表的格式化字符串"""
return self._format_papers(self.ranked_papers) if self.ranked_papers else ""
def _is_pubmed_paper(self, paper) -> bool:
"""判断是否为PubMed论文"""
return (paper.url and 'pubmed.ncbi.nlm.nih.gov' in paper.url)
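
As a rough usage sketch only (not part of this commit): a concrete handler subclass from this diff would be driven through the abstract `handle` interface along the lines below. The SearchCriteria instance, llm_kwargs values, and plugin_kwargs keys shown here are illustrative assumptions supplied in the real plugin by the query analyzer and UI.

```python
# Illustrative driver for a BaseHandler subclass; keyword names mirror the abstract
# handle() signature defined above, while the concrete values are assumptions.
import asyncio

async def run_review(handler, criteria):
    prompt = await handler.handle(
        criteria=criteria,
        chatbot=[],            # conversation display buffer
        history=[],            # prior dialogue turns
        system_prompt="You are an academic assistant.",
        llm_kwargs={"llm_model": "gpt-3.5-turbo"},
        plugin_kwargs={"max_papers": 50, "min_year": 2018},
    )
    return prompt  # the calling plugin then sends this prompt to the LLM

# e.g. prompt = asyncio.run(run_review(some_handler, some_criteria))
```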


@@ -0,0 +1,106 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from crazy_functions.review_fns.query_analyzer import SearchCriteria
import asyncio
class Arxiv最新论文推荐功能(BaseHandler):
"""最新论文推荐处理器"""
def __init__(self, arxiv, semantic, llm_kwargs=None):
super().__init__(arxiv, semantic, llm_kwargs)
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> str:
"""处理最新论文推荐请求"""
# 获取搜索参数
search_params = self._get_search_params(plugin_kwargs)
# 获取最新论文
papers = []
for category in criteria.arxiv_params["categories"]:
latest_papers = await self.arxiv.get_latest_papers(
category=category,
debug=False,
batch_size=50
)
papers.extend(latest_papers)
if not papers:
return self._generate_apology_prompt(criteria)
# Rank the papers with the embedding model
self.ranked_papers = self.paper_ranker.rank_papers(
query=criteria.original_query,
papers=papers,
search_criteria=criteria
)
# Build the final prompt
current_time = self._get_current_time()
final_prompt = f"""Current time: {current_time}
Based on your interest in {criteria.main_topic}, here are the latest papers from arXiv in relevant categories:
{', '.join(criteria.arxiv_params["categories"])}
Latest papers available:
{self._format_papers(self.ranked_papers)}
Please provide:
1. A clear list of latest papers, organized by themes or approaches
2. Group papers by sub-topics or themes if applicable
3. For each paper:
- Publication time
- The key contributions and main findings
- Why it's relevant to the user's interests
- How it relates to other latest papers
- The paper's citation count and citation impact
- The paper's download link
4. A suggested reading order based on:
- Paper relationships and dependencies
- Difficulty level
- Significance
5. Future Directions
- Emerging venues and research streams
- Novel methodological approaches
- Cross-disciplinary opportunities
- Research gaps by publication type
IMPORTANT:
- Focus on explaining why each paper is interesting
- Highlight the novelty and potential impact
- Consider the credibility and stage of each publication
- Use the provided paper titles with their links when referring to specific papers
- Base recommendations ONLY on the explicitly provided paper information
- Do not make ANY assumptions about papers beyond the given data
- When information is missing or unclear, acknowledge the limitation
- Never speculate about:
* Paper quality or rigor not evidenced in the data
* Research impact beyond citation counts
* Implementation details not mentioned
* Author expertise or background
* Future research directions not stated
- For each paper, cite only verifiable information
- Clearly distinguish between facts and potential implications
- Each paper includes download links in its 📥 PDF Downloads section
Format your response in markdown with clear sections.
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
return final_prompt


@@ -0,0 +1,344 @@
from typing import List, Dict, Any, Optional, Tuple
from .base_handler import BaseHandler
from crazy_functions.review_fns.query_analyzer import SearchCriteria
import asyncio
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
class 单篇论文分析功能(BaseHandler):
"""论文分析处理器"""
def __init__(self, arxiv, semantic, llm_kwargs=None):
super().__init__(arxiv, semantic, llm_kwargs)
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> str:
"""处理论文分析请求返回最终的prompt"""
# 1. 获取论文详情
paper = await self._get_paper_details(criteria)
if not paper:
return self._generate_apology_prompt(criteria)
# Store as ranked_papers to keep the interface uniform
self.ranked_papers = [paper]
# 2. Build the final prompt
current_time = self._get_current_time()
# Extract the paper information
title = getattr(paper, "title", "Unknown Title")
authors = getattr(paper, "authors", [])
year = getattr(paper, "year", "Unknown Year")
abstract = getattr(paper, "abstract", "No abstract available")
citations = getattr(paper, "citations", "N/A")
# Add the paper ID information
paper_id = ""
if criteria.paper_source == "arxiv":
paper_id = f"arXiv ID: {criteria.paper_id}\n"
elif criteria.paper_source == "doi":
paper_id = f"DOI: {criteria.paper_id}\n"
# Format the author list
authors_str = ', '.join(authors) if isinstance(authors, list) else authors
final_prompt = f"""Current time: {current_time}
Please provide a comprehensive analysis of the following paper:
{paper_id}Title: {title}
Authors: {authors_str}
Year: {year}
Citations: {citations}
Publication Venue: {paper.venue_name} ({paper.venue_type})
{f"Publisher: {paper.venue_info.get('publisher')}" if paper.venue_info.get('publisher') else ""}
{f"Journal Reference: {paper.venue_info.get('journal_ref')}" if paper.venue_info.get('journal_ref') else ""}
Abstract: {abstract}
Please provide:
1. Publication Context
- Publication venue analysis and impact factor (if available)
- Paper type (journal article, conference paper, preprint)
- Publication timeline and peer review status
- Publisher reputation and venue prestige
2. Research Context
- Field positioning and significance
- Historical context and prior work
- Related research streams
- Cross-venue impact analysis
3. Technical Analysis
- Detailed methodology review
- Implementation details
- Experimental setup and results
- Technical innovations
4. Impact Analysis
- Citation patterns and influence
- Cross-venue recognition
- Industry vs. academic impact
- Practical applications
5. Critical Review
- Methodological rigor assessment
- Result reliability and reproducibility
- Venue-appropriate evaluation standards
- Limitations and potential improvements
IMPORTANT:
- Strictly use ONLY the information provided above about the paper
- Do not make ANY assumptions or inferences beyond the given data
- If certain information is not provided, explicitly state that it is unknown
- For any unclear or missing details, acknowledge the limitation rather than speculating
- When discussing methodology or results, only describe what is explicitly stated in the abstract
- Never fabricate or assume any details about:
* Publication venues or status
* Implementation details not mentioned
* Results or findings not stated
* Impact or influence not supported by the citation count
* Authors' affiliations or backgrounds
* Future work or implications not mentioned
- You can find the paper's download options in the 📥 PDF Downloads section
- Available download formats include arXiv PDF, DOI links, and source URLs
Format your response in markdown with clear sections.
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
return final_prompt
async def _get_paper_details(self, criteria: SearchCriteria):
"""获取论文详情"""
try:
if criteria.paper_source == "arxiv":
# 使用 arxiv ID 搜索
papers = await self.arxiv.search_by_id(criteria.paper_id)
return papers[0] if papers else None
elif criteria.paper_source == "doi":
# Try to fetch from all sources
paper = await self.semantic.get_paper_by_doi(criteria.paper_id)
if not paper:
# If Semantic Scholar has no match, try PubMed
papers = await self.pubmed.search(
f"{criteria.paper_id}[doi]",
limit=1
)
if papers:
return papers[0]
return paper
elif criteria.paper_source == "title":
# Search via _search_all_sources
search_params = {
'max_papers': 1,
'min_year': 1900, # effectively no year restriction
'search_multiplier': 1
}
# Configure the per-source search parameters
criteria.arxiv_params = {
"search_type": "basic",
"query": f'ti:"{criteria.paper_title}"',
"limit": 1
}
criteria.semantic_params = {
"query": criteria.paper_title,
"limit": 1
}
criteria.pubmed_params = {
"search_type": "basic",
"query": f'"{criteria.paper_title}"[Title]',
"limit": 1
}
papers = await self._search_all_sources(criteria, search_params)
return papers[0] if papers else None
# If nothing was found, fall back to searching main_topic as a title
if not criteria.paper_title and not criteria.paper_id:
search_params = {
'max_papers': 1,
'min_year': 1900,
'search_multiplier': 1
}
# Configure the per-source search parameters
criteria.arxiv_params = {
"search_type": "basic",
"query": f'ti:"{criteria.main_topic}"',
"limit": 1
}
criteria.semantic_params = {
"query": criteria.main_topic,
"limit": 1
}
criteria.pubmed_params = {
"search_type": "basic",
"query": f'"{criteria.main_topic}"[Title]',
"limit": 1
}
papers = await self._search_all_sources(criteria, search_params)
return papers[0] if papers else None
return None
except Exception as e:
print(f"获取论文详情时出错: {str(e)}")
return None
async def _get_citation_context(self, paper: Dict, plugin_kwargs: Dict) -> Tuple[List, List]:
"""获取引用上下文"""
search_params = self._get_search_params(plugin_kwargs)
# 使用论文标题构建搜索参数
title_query = f'ti:"{getattr(paper, "title", "")}"'
arxiv_params = {
"query": title_query,
"limit": search_params['max_papers'],
"search_type": "basic",
"sort_by": "relevance",
"sort_order": "descending"
}
semantic_params = {
"query": getattr(paper, "title", ""),
"limit": search_params['max_papers']
}
citations, references = await asyncio.gather(
self._search_semantic(
semantic_params,
limit_multiplier=search_params['search_multiplier'],
min_year=search_params['min_year']
),
self._search_arxiv(
arxiv_params,
limit_multiplier=search_params['search_multiplier'],
min_year=search_params['min_year']
)
)
return citations, references
async def _generate_analysis(
self,
paper: Dict,
citations: List,
references: List,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any]
) -> List[List[str]]:
"""生成论文分析"""
# 构建提示
analysis_prompt = f"""Please provide a comprehensive analysis of the following paper:
Paper details:
{self._format_paper(paper)}
Key references (papers cited by this paper):
{self._format_papers(references)}
Important citations (papers that cite this paper):
{self._format_papers(citations)}
Please provide:
1. Paper Overview
- Main research question/objective
- Key methodology/approach
- Main findings/contributions
2. Technical Analysis
- Detailed methodology review
- Technical innovations
- Implementation details
- Experimental setup and results
3. Impact Analysis
- Significance in the field
- Influence on subsequent research (based on citing papers)
- Relationship to prior work (based on cited papers)
- Practical applications
4. Critical Review
- Strengths and limitations
- Potential improvements
- Open questions and future directions
- Alternative approaches
5. Related Research Context
- How it builds on previous work
- How it has influenced subsequent research
- Comparison with alternative approaches
Format your response in markdown with clear sections."""
# Generate the overview and technical analysis in parallel
for response_chunk in request_gpt(
inputs_array=[
analysis_prompt,
self._get_technical_prompt(paper)
],
inputs_show_user_array=[
"Generating paper analysis...",
"Analyzing technical details..."
],
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history_array=[history, []],
sys_prompt_array=[
system_prompt,
"You are an expert at analyzing technical details in research papers."
]
):
pass # wait for generation to finish
# Grab the last two responses
if chatbot and len(chatbot[-2:]) == 2:
analysis = chatbot[-2][1]
technical = chatbot[-1][1]
full_analysis = f"""# Paper Analysis: {paper.title}
## General Analysis
{analysis}
## Technical Deep Dive
{technical}
"""
chatbot.append(["Here is the paper analysis:", full_analysis])
else:
chatbot.append(["Here is the paper analysis:", "Failed to generate analysis."])
return chatbot
def _get_technical_prompt(self, paper: Dict) -> str:
"""生成技术分析提示"""
return f"""Please provide a detailed technical analysis of the following paper:
{self._format_paper(paper)}
Focus on:
1. Mathematical formulations and their implications
2. Algorithm design and complexity analysis
3. Architecture details and design choices
4. Implementation challenges and solutions
5. Performance analysis and bottlenecks
6. Technical limitations and potential improvements
Format your response in markdown, focusing purely on technical aspects."""


@@ -0,0 +1,147 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from crazy_functions.review_fns.query_analyzer import SearchCriteria
from textwrap import dedent
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
class 学术问答功能(BaseHandler):
"""学术问答处理器"""
def __init__(self, arxiv, semantic, llm_kwargs=None):
super().__init__(arxiv, semantic, llm_kwargs)
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> str:
"""处理学术问答请求返回最终的prompt"""
# 1. 获取搜索参数
search_params = self._get_search_params(plugin_kwargs)
# 2. 搜索相关论文
papers = await self._search_relevant_papers(criteria, search_params)
if not papers:
return self._generate_apology_prompt(criteria)
# Build the final prompt
current_time = self._get_current_time()
final_prompt = dedent(f"""Current time: {current_time}
Based on the following paper abstracts, please answer this academic question: {criteria.original_query}
Available papers for reference:
{self._format_papers(self.ranked_papers)}
Please structure your response in the following format:
1. Core Answer (2-3 paragraphs)
- Provide a clear, direct answer synthesizing key findings
- Support main points with citations [1,2,etc.]
- Focus on consensus and differences across papers
2. Key Evidence (2-3 paragraphs)
- Present supporting evidence from abstracts
- Compare methodologies and results
- Highlight significant findings with citations
3. Research Context (1-2 paragraphs)
- Discuss current trends and developments
- Identify research gaps or limitations
- Suggest potential future directions
Guidelines:
- Base your answer ONLY on the provided abstracts
- Use numbered citations [1], [2,3], etc. for every claim
- Maintain academic tone and objectivity
- Synthesize findings across multiple papers
- Focus on the most relevant information to the question
Constraints:
- Do not include information beyond the provided abstracts
- Avoid speculation or personal opinions
- Do not elaborate on technical details unless directly relevant
- Keep citations concise and focused
- Use [N] citations for every major claim or finding
- Cite multiple papers [1,2,3] when showing consensus
- Place citations immediately after the relevant statements
Note: Provide citations for every major claim to ensure traceability to source papers.
Language requirement:
- If the query explicitly specifies a language, use that language. Use Chinese to answer if no language is specified.
- Otherwise, match the language of the original user query
"""
)
return final_prompt
async def _search_relevant_papers(self, criteria: SearchCriteria, search_params: Dict) -> List:
"""搜索相关论文"""
# 使用_search_all_sources替代原来的并行搜索
all_papers = await self._search_all_sources(criteria, search_params)
if not all_papers:
return []
# Re-rank with BGE
self.ranked_papers = self.paper_ranker.rank_papers(
query=criteria.main_topic,
papers=all_papers,
search_criteria=criteria
)
return self.ranked_papers or []
async def _generate_answer(
self,
criteria: SearchCriteria,
papers: List,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any]
) -> List[List[str]]:
"""生成答案"""
# 构建提示
qa_prompt = dedent(f"""Please answer the following academic question based on recent research papers.
Question: {criteria.main_topic}
Relevant papers:
{self._format_papers(papers)}
Please provide:
1. A direct answer to the question
2. Supporting evidence from the papers
3. Different perspectives or approaches if applicable
4. Current limitations and open questions
5. References to specific papers
Format your response in markdown with clear sections."""
)
# Call the LLM to generate the answer
for response_chunk in request_gpt(
inputs_array=[qa_prompt],
inputs_show_user_array=["Generating answer..."],
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history_array=[history],
sys_prompt_array=[system_prompt]
):
pass # wait for generation to finish
# Grab the last response
if chatbot and len(chatbot[-1]) >= 2:
answer = chatbot[-1][1]
chatbot.append(["Here is the answer:", answer])
else:
chatbot.append(["Here is the answer:", "Failed to generate answer."])
return chatbot


@@ -0,0 +1,185 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from textwrap import dedent
from crazy_functions.review_fns.query_analyzer import SearchCriteria
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
class 论文推荐功能(BaseHandler):
"""论文推荐处理器"""
def __init__(self, arxiv, semantic, llm_kwargs=None):
super().__init__(arxiv, semantic, llm_kwargs)
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> str:
"""处理论文推荐请求返回最终的prompt"""
search_params = self._get_search_params(plugin_kwargs)
# 1. 先搜索种子论文
seed_papers = await self._search_seed_papers(criteria, search_params)
if not seed_papers:
return self._generate_apology_prompt(criteria)
# Re-rank with BGE
all_papers = seed_papers
if not all_papers:
return self._generate_apology_prompt(criteria)
self.ranked_papers = self.paper_ranker.rank_papers(
query=criteria.original_query,
papers=all_papers,
search_criteria=criteria
)
if not self.ranked_papers:
return self._generate_apology_prompt(criteria)
# Build the final prompt
current_time = self._get_current_time()
final_prompt = dedent(f"""Current time: {current_time}
Based on the user's interest in {criteria.main_topic}, here are relevant papers.
Available papers for recommendation:
{self._format_papers(self.ranked_papers)}
Please provide:
1. Group papers by sub-topics or themes if applicable
2. For each paper:
- Publication time and venue (when available)
- Journal metrics (when available):
* Impact Factor (IF)
* JCR Quartile
* Chinese Academy of Sciences (CAS) Division
- The key contributions and main findings
- Why it's relevant to the user's interests
- How it relates to other recommended papers
- The paper's citation count and citation impact
- The paper's download link
3. A suggested reading order based on:
- Journal impact and quality metrics
- Chronological development of ideas
- Paper relationships and dependencies
- Difficulty level
- Impact and significance
4. Future Directions
- Emerging venues and research streams
- Novel methodological approaches
- Cross-disciplinary opportunities
- Research gaps by publication type
IMPORTANT:
- Focus on explaining why each paper is valuable
- Highlight connections between papers
- Consider both citation counts AND journal metrics when discussing impact
- When available, use IF, JCR quartile, and CAS division to assess paper quality
- Mention publication timing when discussing paper relationships
- When referring to papers, use HTML links in this format:
* For DOIs: <a href='https://doi.org/DOI_HERE' target='_blank'>DOI: DOI_HERE</a>
* For titles: <a href='PAPER_URL' target='_blank'>PAPER_TITLE</a>
- Present papers in a way that shows the evolution of ideas over time
- Base recommendations ONLY on the explicitly provided paper information
- Do not make ANY assumptions about papers beyond the given data
- When information is missing or unclear, acknowledge the limitation
- Never speculate about:
* Paper quality or rigor not evidenced in the data
* Research impact beyond citation counts and journal metrics
* Implementation details not mentioned
* Author expertise or background
* Future research directions not stated
- For each recommendation, cite only verifiable information
- Clearly distinguish between facts and potential implications
Format your response in markdown with clear sections.
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
)
return final_prompt
async def _search_seed_papers(self, criteria: SearchCriteria, search_params: Dict) -> List:
"""搜索种子论文"""
try:
# 使用_search_all_sources替代原来的并行搜索
all_papers = await self._search_all_sources(criteria, search_params)
if not all_papers:
return []
return all_papers
except Exception as e:
print(f"搜索种子论文时出错: {str(e)}")
return []
async def _get_recommendations(self, seed_papers: List, multiplier: int = 1) -> List:
"""获取推荐论文"""
recommendations = []
base_limit = 3 * multiplier
# Add the seed papers to the recommendation list
recommendations.extend(seed_papers)
# Use only the first five papers as seeds
seed_papers = seed_papers[:5]
for paper in seed_papers:
try:
if paper.doi and paper.doi.startswith("10.48550/arXiv."):
# arXiv paper: extract the full arXiv ID from the DOI
arxiv_id = paper.doi.split("arXiv.")[-1]
paper_details = await self.arxiv.get_paper_details(arxiv_id)
if paper_details and hasattr(paper_details, 'venue'):
category = paper_details.venue.split(":")[-1]
similar_papers = await self.arxiv.search_by_category(
category,
limit=base_limit,
sort_by='relevance'
)
recommendations.extend(similar_papers)
elif paper.doi: # only fetch recommendations for papers with a DOI
# Semantic Scholar paper
similar_papers = await self.semantic.get_recommended_papers(
paper.doi,
limit=base_limit
)
if similar_papers: # only add recommendations that were fetched successfully
recommendations.extend(similar_papers)
else:
# For papers without a DOI, run a related search by title
if paper.title:
similar_papers = await self.semantic.search(
query=paper.title,
limit=base_limit
)
recommendations.extend(similar_papers)
except Exception as e:
print(f"获取论文 '{paper.title}' 的推荐时发生错误: {str(e)}")
continue
# Deduplicate
seen_dois = set()
unique_recommendations = []
for paper in recommendations:
if paper.doi and paper.doi not in seen_dois:
seen_dois.add(paper.doi)
unique_recommendations.append(paper)
elif not paper.doi and paper not in unique_recommendations:
unique_recommendations.append(paper)
return unique_recommendations


@@ -0,0 +1,193 @@
from typing import List, Dict, Any, Tuple
from .base_handler import BaseHandler
from crazy_functions.review_fns.query_analyzer import SearchCriteria
import asyncio
class 文献综述功能(BaseHandler):
"""文献综述处理器"""
def __init__(self, arxiv, semantic, llm_kwargs=None):
super().__init__(arxiv, semantic, llm_kwargs)
async def handle(
self,
criteria: SearchCriteria,
chatbot: List[List[str]],
history: List[List[str]],
system_prompt: str,
llm_kwargs: Dict[str, Any],
plugin_kwargs: Dict[str, Any],
) -> str:
"""处理文献综述请求返回最终的prompt"""
# 获取搜索参数
search_params = self._get_search_params(plugin_kwargs)
# 使用_search_all_sources替代原来的并行搜索
all_papers = await self._search_all_sources(criteria, search_params)
if not all_papers:
return self._generate_apology_prompt(criteria)
self.ranked_papers = self.paper_ranker.rank_papers(
query=criteria.original_query,
papers=all_papers,
search_criteria=criteria
)
# Check the number of ranked papers
if not self.ranked_papers:
return self._generate_apology_prompt(criteria)
# Check whether any PubMed papers are included
has_pubmed_papers = any(paper.url and 'pubmed.ncbi.nlm.nih.gov' in paper.url
for paper in self.ranked_papers)
if has_pubmed_papers:
return self._generate_medical_review_prompt(criteria)
else:
return self._generate_general_review_prompt(criteria)
def _generate_medical_review_prompt(self, criteria: SearchCriteria) -> str:
"""生成医学文献综述prompt"""
return f"""Current time: {self._get_current_time()}
Conduct a systematic medical literature review on {criteria.main_topic} based STRICTLY on the provided articles.
Available literature for review:
{self._format_papers(self.ranked_papers)}
IMPORTANT: If the user query contains specific requirements for the review structure or format, those requirements take precedence over the following guidelines.
Please structure your medical review following these guidelines:
1. Research Overview
- Main research questions and objectives from the studies
- Types of studies included (clinical trials, observational studies, etc.)
- Study populations and settings
- Time period of the research
2. Key Findings
- Main outcomes and results reported in abstracts
- Primary endpoints and their measurements
- Statistical significance when reported
- Observed trends across studies
3. Methods Summary
- Study designs used
- Major interventions or treatments studied
- Key outcome measures
- Patient populations studied
4. Clinical Relevance
- Reported clinical implications
- Main conclusions from authors
- Reported benefits and risks
- Treatment responses when available
5. Research Status
- Current research focus areas
- Reported limitations
- Gaps identified in abstracts
- Authors' suggested future directions
CRITICAL REQUIREMENTS:
Citation Rules (MANDATORY):
- EVERY finding or statement MUST be supported by citations [N], where N is the number matching the paper in the provided literature list
- When reporting outcomes, ALWAYS cite the source studies using the exact paper numbers from the literature list
- For findings supported by multiple studies, use consecutive numbers as shown in the literature list [1,2,3]
- Use ONLY the papers provided in the available literature list above
- Citations must appear immediately after each statement
- Citation numbers MUST match the numbers assigned to papers in the literature list above (e.g., if a finding comes from the first paper in the list, cite it as [1])
- DO NOT change or reorder the citation numbers - they must exactly match the paper numbers in the literature list
Content Guidelines:
- Present only information available in the provided papers
- If certain information is not available, simply omit that aspect rather than explicitly stating its absence
- Focus on synthesizing and presenting available findings
- Maintain professional medical writing style
- Present limitations and gaps as research opportunities rather than missing information
Writing Style:
- Use precise medical terminology
- Maintain objective reporting
- Use consistent terminology throughout
- Present a cohesive narrative without referencing data limitations
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
def _generate_general_review_prompt(self, criteria: SearchCriteria) -> str:
"""生成通用文献综述prompt"""
current_time = self._get_current_time()
final_prompt = f"""Current time: {current_time}
Conduct a comprehensive literature review on {criteria.main_topic} focusing on the following aspects:
{', '.join(criteria.sub_topics)}
Available literature for review:
{self._format_papers(self.ranked_papers)}
IMPORTANT: If the user query contains specific requirements for the review structure or format, those requirements take precedence over the following guidelines.
Please structure your review following these guidelines:
1. Introduction and Research Background
- Current state and significance of the research field
- Key research problems and challenges
- Research development timeline and evolution
2. Research Directions and Classifications
- Major research directions and their relationships
- Different technical approaches and their characteristics
- Comparative analysis of various solutions
3. Core Technologies and Methods
- Key technological breakthroughs
- Advantages and limitations of different methods
- Technical challenges and solutions
4. Applications and Impact
- Real-world applications and use cases
- Industry influence and practical value
- Implementation challenges and solutions
5. Future Trends and Prospects
- Emerging research directions
- Unsolved problems and challenges
- Potential breakthrough points
CRITICAL REQUIREMENTS:
Citation Rules (MANDATORY):
- EVERY finding or statement MUST be supported by citations [N], where N is the number matching the paper in the provided literature list
- When reporting outcomes, ALWAYS cite the source studies using the exact paper numbers from the literature list
- For findings supported by multiple studies, use consecutive numbers as shown in the literature list [1,2,3]
- Use ONLY the papers provided in the available literature list above
- Citations must appear immediately after each statement
- Citation numbers MUST match the numbers assigned to papers in the literature list above (e.g., if a finding comes from the first paper in the list, cite it as [1])
- DO NOT change or reorder the citation numbers - they must exactly match the paper numbers in the literature list
Writing Style:
- Maintain academic and professional tone
- Focus on objective analysis with proper citations
- Ensure logical flow and clear structure
Content Requirements:
- Base ALL analysis STRICTLY on the provided papers with explicit citations
- When introducing any concept, method, or finding, immediately follow with [N]
- For each research direction or approach, cite the specific papers [N] that proposed or developed it
- When discussing limitations or challenges, cite the papers [N] that identified them
- DO NOT include information from sources outside the provided paper list
- DO NOT make unsupported claims or statements
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
return final_prompt