Files
gpt_academic/crazy_functions/review_fns/handlers/paper_handler.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

345 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, Dict, Any, Optional, Tuple
from .base_handler import BaseHandler
from crazy_functions.review_fns.query_analyzer import SearchCriteria
import asyncio
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
class 单篇论文分析功能(BaseHandler):
    """Handler that builds an analysis prompt for a single paper.

    The paper is located by arXiv ID, DOI or title (see
    ``_get_paper_details``).  ``handle`` is the entry point; the remaining
    methods are helpers, some of which (``_get_citation_context``,
    ``_generate_analysis``) are not invoked by ``handle`` in this class.
    """

    def __init__(self, arxiv, semantic, llm_kwargs=None):
        """Forward the data-source clients and LLM settings to ``BaseHandler``."""
        super().__init__(arxiv, semantic, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a paper-analysis request and return the final prompt.

        Args:
            criteria: Parsed query; ``paper_source`` / ``paper_id`` /
                ``paper_title`` / ``main_topic`` decide how the paper is found.
            chatbot, history, system_prompt, llm_kwargs, plugin_kwargs:
                Accepted for interface compatibility; not used by this method.

        Returns:
            The analysis prompt for the located paper, or the result of
            ``_generate_apology_prompt(criteria)`` when no paper was found.
        """
        # 1. Fetch the paper details.
        paper = await self._get_paper_details(criteria)
        if not paper:
            return self._generate_apology_prompt(criteria)

        # Store as ranked_papers so the result is exposed through a uniform interface.
        self.ranked_papers = [paper]

        # 2. Build the final prompt.
        current_time = self._get_current_time()

        # Every field is read defensively: paper objects coming from different
        # sources are not guaranteed to carry every attribute.
        title = getattr(paper, "title", "Unknown Title")
        authors = getattr(paper, "authors", [])
        year = getattr(paper, "year", "Unknown Year")
        abstract = getattr(paper, "abstract", "No abstract available")
        citations = getattr(paper, "citations", "N/A")
        venue_name = getattr(paper, "venue_name", "Unknown Venue")
        venue_type = getattr(paper, "venue_type", "Unknown Type")
        # A missing or None venue_info must not crash prompt construction.
        venue_info = getattr(paper, "venue_info", None) or {}

        # Identifier line, only for sources that have a stable identifier.
        paper_id = ""
        if criteria.paper_source == "arxiv":
            paper_id = f"arXiv ID: {criteria.paper_id}\n"
        elif criteria.paper_source == "doi":
            paper_id = f"DOI: {criteria.paper_id}\n"

        # Format the author list; str() guards against non-string author entries.
        if isinstance(authors, (list, tuple)):
            authors_str = ', '.join(str(author) for author in authors)
        else:
            authors_str = authors

        # Optional venue lines; they render as empty lines when the data is absent.
        publisher = venue_info.get('publisher')
        journal_ref = venue_info.get('journal_ref')
        publisher_line = f"Publisher: {publisher}" if publisher else ""
        journal_ref_line = f"Journal Reference: {journal_ref}" if journal_ref else ""

        final_prompt = f"""Current time: {current_time}
Please provide a comprehensive analysis of the following paper:
{paper_id}Title: {title}
Authors: {authors_str}
Year: {year}
Citations: {citations}
Publication Venue: {venue_name} ({venue_type})
{publisher_line}
{journal_ref_line}
Abstract: {abstract}
Please provide:
1. Publication Context
- Publication venue analysis and impact factor (if available)
- Paper type (journal article, conference paper, preprint)
- Publication timeline and peer review status
- Publisher reputation and venue prestige
2. Research Context
- Field positioning and significance
- Historical context and prior work
- Related research streams
- Cross-venue impact analysis
3. Technical Analysis
- Detailed methodology review
- Implementation details
- Experimental setup and results
- Technical innovations
4. Impact Analysis
- Citation patterns and influence
- Cross-venue recognition
- Industry vs. academic impact
- Practical applications
5. Critical Review
- Methodological rigor assessment
- Result reliability and reproducibility
- Venue-appropriate evaluation standards
- Limitations and potential improvements
IMPORTANT:
- Strictly use ONLY the information provided above about the paper
- Do not make ANY assumptions or inferences beyond the given data
- If certain information is not provided, explicitly state that it is unknown
- For any unclear or missing details, acknowledge the limitation rather than speculating
- When discussing methodology or results, only describe what is explicitly stated in the abstract
- Never fabricate or assume any details about:
* Publication venues or status
* Implementation details not mentioned
* Results or findings not stated
* Impact or influence not supported by the citation count
* Authors' affiliations or backgrounds
* Future work or implications not mentioned
- You can find the paper's download options in the 📥 PDF Downloads section
- Available download formats include arXiv PDF, DOI links, and source URLs
Format your response in markdown with clear sections.
Language requirement:
- If the query explicitly specifies a language, use that language
- Otherwise, match the language of the original user query
"""
        return final_prompt

    async def _find_best_title_match(self, criteria: SearchCriteria, title: str):
        """Search every source for ``title`` and return the top hit, or ``None``.

        Overwrites ``criteria.arxiv_params``, ``criteria.semantic_params`` and
        ``criteria.pubmed_params`` with single-result title queries before
        delegating to ``_search_all_sources``.
        """
        search_params = {
            'max_papers': 1,
            'min_year': 1900,  # effectively no year restriction
            'search_multiplier': 1
        }
        criteria.arxiv_params = {
            "search_type": "basic",
            "query": f'ti:"{title}"',
            "limit": 1
        }
        criteria.semantic_params = {
            "query": title,
            "limit": 1
        }
        criteria.pubmed_params = {
            "search_type": "basic",
            "query": f'"{title}"[Title]',
            "limit": 1
        }
        papers = await self._search_all_sources(criteria, search_params)
        return papers[0] if papers else None

    async def _get_paper_details(self, criteria: SearchCriteria):
        """Locate the paper described by ``criteria``.

        Lookup depends on ``criteria.paper_source``:

        * ``"arxiv"`` - look up ``criteria.paper_id`` via ``self.arxiv``.
        * ``"doi"``   - ask ``self.semantic`` first, then fall back to a
          ``self.pubmed`` DOI search.
        * ``"title"`` - title search across all sources.
        * anything else - if neither a title nor an ID is present, use
          ``criteria.main_topic`` as the title.

        Returns:
            The first matching paper object, or ``None`` when nothing is found
            or any exception is raised (the error is printed, not propagated).
        """
        try:
            source = criteria.paper_source

            if source == "arxiv":
                # Search by arXiv ID.
                papers = await self.arxiv.search_by_id(criteria.paper_id)
                return papers[0] if papers else None

            if source == "doi":
                paper = await self.semantic.get_paper_by_doi(criteria.paper_id)
                if paper:
                    return paper
                # Semantic Scholar found nothing; try PubMed.
                papers = await self.pubmed.search(
                    f"{criteria.paper_id}[doi]",
                    limit=1
                )
                return papers[0] if papers else paper

            if source == "title":
                return await self._find_best_title_match(criteria, criteria.paper_title)

            # No recognised source: fall back to main_topic as a title query,
            # but only when there is no explicit title or ID to honour.
            if not criteria.paper_title and not criteria.paper_id:
                return await self._find_best_title_match(criteria, criteria.main_topic)

            return None
        except Exception as e:
            # Deliberate best-effort: a failed lookup is reported as "not found".
            print(f"获取论文详情时出错: {str(e)}")
            return None

    async def _get_citation_context(self, paper: Any, plugin_kwargs: Dict) -> Tuple[List, List]:
        """Fetch papers related to ``paper`` by title from Semantic Scholar and arXiv.

        Both searches run concurrently.  The first returned list comes from
        ``_search_semantic`` and the second from ``_search_arxiv``.
        NOTE(review): both are title-based searches; whether the results are
        true citing/cited papers depends on the helpers - confirm.

        Args:
            paper: Paper object; only its ``title`` attribute is read.
            plugin_kwargs: Passed to ``_get_search_params`` to obtain
                ``max_papers``, ``search_multiplier`` and ``min_year``.

        Returns:
            ``(citations, references)`` as returned by the two searches.
        """
        search_params = self._get_search_params(plugin_kwargs)
        paper_title = getattr(paper, "title", "")

        # Build the per-source queries from the paper title.
        arxiv_params = {
            "query": f'ti:"{paper_title}"',
            "limit": search_params['max_papers'],
            "search_type": "basic",
            "sort_by": "relevance",
            "sort_order": "descending"
        }
        semantic_params = {
            "query": paper_title,
            "limit": search_params['max_papers']
        }

        citations, references = await asyncio.gather(
            self._search_semantic(
                semantic_params,
                limit_multiplier=search_params['search_multiplier'],
                min_year=search_params['min_year']
            ),
            self._search_arxiv(
                arxiv_params,
                limit_multiplier=search_params['search_multiplier'],
                min_year=search_params['min_year']
            )
        )
        return citations, references

    async def _generate_analysis(
        self,
        paper: Any,
        citations: List,
        references: List,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any]
    ) -> List[List[str]]:
        """Generate a general and a technical analysis and append them to ``chatbot``.

        Two prompts are sent through ``request_gpt`` in one call.  Afterwards
        the second field of the last two ``chatbot`` entries is taken as the
        general and the technical answer respectively and combined into one
        markdown report, which is appended to ``chatbot``.

        Returns:
            The same ``chatbot`` list, with either the combined report or a
            failure message appended.
        """
        # Build the prompt.
        analysis_prompt = f"""Please provide a comprehensive analysis of the following paper:
Paper details:
{self._format_paper(paper)}
Key references (papers cited by this paper):
{self._format_papers(references)}
Important citations (papers that cite this paper):
{self._format_papers(citations)}
Please provide:
1. Paper Overview
- Main research question/objective
- Key methodology/approach
- Main findings/contributions
2. Technical Analysis
- Detailed methodology review
- Technical innovations
- Implementation details
- Experimental setup and results
3. Impact Analysis
- Significance in the field
- Influence on subsequent research (based on citing papers)
- Relationship to prior work (based on cited papers)
- Practical applications
4. Critical Review
- Strengths and limitations
- Potential improvements
- Open questions and future directions
- Alternative approaches
5. Related Research Context
- How it builds on previous work
- How it has influenced subsequent research
- Comparison with alternative approaches
Format your response in markdown with clear sections."""

        # Generate the overview and the technical analysis in one request;
        # the generator is drained only to wait for completion.
        for _ in request_gpt(
            inputs_array=[
                analysis_prompt,
                self._get_technical_prompt(paper)
            ],
            inputs_show_user_array=[
                "Generating paper analysis...",
                "Analyzing technical details..."
            ],
            llm_kwargs=llm_kwargs,
            chatbot=chatbot,
            history_array=[history, []],
            sys_prompt_array=[
                system_prompt,
                "You are an expert at analyzing technical details in research papers."
            ]
        ):
            pass

        # Pick up the last two answers.
        if chatbot and len(chatbot) >= 2:
            analysis = chatbot[-2][1]
            technical = chatbot[-1][1]
            # Read the title defensively, like the other paper fields in this class.
            paper_title = getattr(paper, "title", "Unknown Title")
            full_analysis = f"""# Paper Analysis: {paper_title}
## General Analysis
{analysis}
## Technical Deep Dive
{technical}
"""
            chatbot.append(["Here is the paper analysis:", full_analysis])
        else:
            chatbot.append(["Here is the paper analysis:", "Failed to generate analysis."])
        return chatbot

    def _get_technical_prompt(self, paper: Any) -> str:
        """Return the prompt requesting a purely technical analysis of ``paper``."""
        return f"""Please provide a detailed technical analysis of the following paper:
{self._format_paper(paper)}
Focus on:
1. Mathematical formulations and their implications
2. Algorithm design and complexity analysis
3. Architecture details and design choices
4. Implementation challenges and solutions
5. Performance analysis and bottlenecks
6. Technical limitations and potential improvements
Format your response in markdown, focusing purely on technical aspects."""