Compare commits

13 Commits: version2.6 ... version2.6

| Author | SHA1 | Date |
|---|---|---|
|  | 1c6c29e6dd |  |
|  | 74f26a7d64 |  |
|  | 1c96ed39c3 |  |
|  | 5b040d552e |  |
|  | c563ac2274 |  |
|  | d28af7611a |  |
|  | 9714d3ea2d |  |
|  | 65c51eb05b |  |
|  | 5d98d82526 |  |
|  | 9215199ae1 |  |
|  | 7ded328970 |  |
|  | 057b8cd943 |  |
|  | 2ab6acc6de |  |
Dockerfile+ChatGLM (new file, 50 lines)
@@ -0,0 +1,50 @@
# How to build | 如何构建: docker build -t gpt-academic --network=host -f Dockerfile+ChatGLM .
# How to run | 如何运行 (1) 直接运行: docker run --rm -it --net=host --gpus=all gpt-academic
# How to run | 如何运行 (2) 我想运行之前进容器做一些调整: docker run --rm -it --net=host --gpus=all gpt-academic bash

# 从NVIDIA源,从而支持显卡运行(检查宿主的nvidia-smi中的cuda版本必须>=11.3)
FROM nvidia/cuda:11.3.1-runtime-ubuntu20.04
ARG useProxyNetwork=''
RUN apt-get update
RUN apt-get install -y curl proxychains
RUN apt-get install -y git python python3 python-dev python3-dev --fix-missing

# 配置代理网络(构建Docker镜像时使用)
# # comment out below if you do not need proxy network | 如果不需要翻墙 - 从此行向下删除
RUN $useProxyNetwork curl cip.cc
RUN sed -i '$ d' /etc/proxychains.conf
RUN sed -i '$ d' /etc/proxychains.conf
RUN echo "socks5 127.0.0.1 10880" >> /etc/proxychains.conf
ARG useProxyNetwork=proxychains
# # comment out above if you do not need proxy network | 如果不需要翻墙 - 从此行向上删除


# use python3 as the system default python
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8

# 下载分支
WORKDIR /gpt
RUN $useProxyNetwork git clone https://github.com/binary-husky/chatgpt_academic.git -b v3.0
WORKDIR /gpt/chatgpt_academic
RUN $useProxyNetwork python3 -m pip install -r requirements.txt
RUN $useProxyNetwork python3 -m pip install -r request_llm/requirements_chatglm.txt
RUN $useProxyNetwork python3 -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu113

# 预热CHATGLM参数(非必要 可选步骤)
RUN echo ' \n\
from transformers import AutoModel, AutoTokenizer \n\
chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) \n\
chatglm_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float() ' >> warm_up_chatglm.py
RUN python3 -u warm_up_chatglm.py
RUN $useProxyNetwork git pull

# 为chatgpt-academic配置代理和API-KEY (非必要 可选步骤)
RUN echo ' \n\
API_KEY = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \n\
USE_PROXY = True \n\
LLM_MODEL = "chatglm" \n\
LOCAL_MODEL_DEVICE = "cuda" \n\
proxies = { "http": "socks5h://localhost:10880", "https": "socks5h://localhost:10880", } ' >> config_private.py

# 启动
CMD ["python3", "-u", "main.py"]

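The two `RUN echo ... >>` steps above write small files into the image at build time. The first one generates `warm_up_chatglm.py`, which pre-downloads the ChatGLM weights into the HuggingFace cache so they end up baked into an image layer rather than being fetched on first launch. For reference, the generated script amounts to the following sketch (it presupposes the `transformers` dependency installed from `request_llm/requirements_chatglm.txt`):

```python
# warm_up_chatglm.py, as assembled by the Dockerfile's echo step:
# instantiating the tokenizer and model once forces the THUDM/chatglm-6b
# weights to be downloaded and cached inside the image.
from transformers import AutoModel, AutoTokenizer

chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
chatglm_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).float()
```

The second `echo` step writes `config_private.py`, so the API key, proxy settings, and `LLM_MODEL = "chatglm"` shown above are already in place when `main.py` starts.
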
README.md (20 lines changed)
@@ -12,7 +12,7 @@ If you like this project, please give it a Star. If you've come up with more use
> **Note**
>
-> 1.请注意只有“红颜色”标识的函数插件(按钮)才支持读取文件。目前对pdf/word格式文件的支持插件正在逐步完善中,需要更多developer的帮助。
+> 1.请注意只有**红颜色**标识的函数插件(按钮)才支持读取文件,部分插件位于插件区的**下拉菜单**中。另外我们以**最高优先级**欢迎和处理任何新插件的PR!
>
> 2.本项目中每个文件的功能都在自译解[`self_analysis.md`](https://github.com/binary-husky/chatgpt_academic/wiki/chatgpt-academic%E9%A1%B9%E7%9B%AE%E8%87%AA%E8%AF%91%E8%A7%A3%E6%8A%A5%E5%91%8A)详细说明。随着版本的迭代,您也可以随时自行点击相关函数插件,调用GPT重新生成项目的自我解析报告。常见问题汇总在[`wiki`](https://github.com/binary-husky/chatgpt_academic/wiki/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98)当中。
>
@@ -30,9 +30,10 @@ If you like this project, please give it a Star. If you've come up with more use
[自定义快捷键](https://www.bilibili.com/video/BV14s4y1E7jN) | 支持自定义快捷键
[配置代理服务器](https://www.bilibili.com/video/BV1rc411W7Dr) | 支持配置代理服务器
模块化设计 | 支持自定义高阶的实验性功能与[函数插件],插件支持[热更新](https://github.com/binary-husky/chatgpt_academic/wiki/%E5%87%BD%E6%95%B0%E6%8F%92%E4%BB%B6%E6%8C%87%E5%8D%97)
-[自我程序剖析](https://www.bilibili.com/video/BV1cj411A7VW) | [函数插件] 一键读懂本项目的源代码
+[自我程序剖析](https://www.bilibili.com/video/BV1cj411A7VW) | [函数插件] [一键读懂](https://github.com/binary-husky/chatgpt_academic/wiki/chatgpt-academic%E9%A1%B9%E7%9B%AE%E8%87%AA%E8%AF%91%E8%A7%A3%E6%8A%A5%E5%91%8A)本项目的源代码
[程序剖析](https://www.bilibili.com/video/BV1cj411A7VW) | [函数插件] 一键可以剖析其他Python/C/C++/Java项目树
读论文 | [函数插件] 一键解读latex论文全文并生成摘要
+Latex全文翻译、润色 | [函数插件] 一键翻译或润色latex论文
批量注释生成 | [函数插件] 一键批量生成函数注释
chat分析报告生成 | [函数插件] 运行后自动生成总结汇报
[arxiv小助手](https://www.bilibili.com/video/BV1LM4y1279X) | [函数插件] 输入arxiv文章url即可一键翻译摘要+下载PDF
@@ -44,6 +45,8 @@ chat分析报告生成 | [函数插件] 运行后自动生成总结汇报
支持GPT输出的markdown表格 | 可以输出支持GPT的markdown表格
启动暗色gradio[主题](https://github.com/binary-husky/chatgpt_academic/issues/173) | 在浏览器url后面添加```/?__dark-theme=true```可以切换dark主题
huggingface免科学上网[在线体验](https://huggingface.co/spaces/qingxu98/gpt-academic) | 登陆huggingface后复制[此空间](https://huggingface.co/spaces/qingxu98/gpt-academic)
+[多LLM模型](https://www.bilibili.com/video/BV1EM411K7VH/)混合支持([v3.0分支](https://github.com/binary-husky/chatgpt_academic/tree/v3.0)测试中) | 同时被ChatGPT和[清华ChatGLM](https://github.com/THUDM/ChatGLM-6B)伺候的感觉一定会很不错吧?
+兼容[TGUI](https://github.com/oobabooga/text-generation-webui)接入更多样的语言模型 | 接入opt-1.3b, galactica-1.3b等模型([v3.0分支](https://github.com/binary-husky/chatgpt_academic/tree/v3.0)测试中)
…… | ……

</div>
@@ -84,6 +87,14 @@ huggingface免科学上网[在线体验](https://huggingface.co/spaces/qingxu98/
<img src="https://user-images.githubusercontent.com/96192199/226935232-6b6a73ce-8900-4aee-93f9-733c7e6fef53.png" width="700" >
</div>

+- 多种大语言模型混合调用([v3.0分支](https://github.com/binary-husky/chatgpt_academic/tree/v3.0)测试中)
+
+<div align="center">
+<img src="https://user-images.githubusercontent.com/96192199/231222778-34776885-a7f0-4f2c-b5f4-7cc2ef3ecb58.png" width="700" >
+</div>
+
+
+
## 直接运行 (Windows, Linux or MacOS)

### 1. 下载项目
@@ -287,8 +298,7 @@ python check_proxy.py
# 借鉴项目1:借鉴了ChuanhuChatGPT中读取OpenAI json的方法、记录历史问询记录的方法以及gradio queue的使用技巧
https://github.com/GaiZhenbiao/ChuanhuChatGPT

-# 借鉴项目2:借鉴了mdtex2html中公式处理的方法
-https://github.com/polarwinkel/mdtex2html
-
+# 借鉴项目2:
+https://github.com/THUDM/ChatGLM-6B

```

@@ -76,7 +76,6 @@ def get_crazy_functions():
    from crazy_functions.总结word文档 import 总结word文档
    from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
    from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
-    from crazy_functions.理解PDF文档内容 import 理解PDF文档内容
    from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
    from crazy_functions.Latex全文润色 import Latex中文润色
    from crazy_functions.Latex全文翻译 import Latex中译英
@@ -108,12 +107,7 @@ def get_crazy_functions():
            "Color": "stop",
            "Function": HotReload(总结word文档)
        },
-        # "[测试功能] 理解PDF文档内容(Tk文件选择接口,仅本地)": {
-        # # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
-        # "AsButton": False, # 加入下拉菜单中
-        # "Function": HotReload(理解PDF文档内容)
-        # },
-        "[测试功能] 理解PDF文档内容(通用接口,读取文件输入区)": {
+        "理解PDF文档内容 (模仿ChatPDF)": {
            # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
            "Color": "stop",
            "AsButton": False, # 加入下拉菜单中
@@ -131,7 +125,6 @@ def get_crazy_functions():
            "AsButton": False, # 加入下拉菜单中
            "Function": HotReload(Latex中文润色)
        },
-
        "[测试功能] Latex项目全文中译英(输入路径或上传压缩包)": {
            # HotReload 的意思是热更新,修改函数插件代码后,不需要重启程序,代码直接生效
            "Color": "stop",
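The hunks above register the rewritten PDF plugin under the menu name "理解PDF文档内容 (模仿ChatPDF)" and drop the old Tk-based test entry. For orientation, here is a minimal, self-contained sketch of the shape of one registry entry, using only the fields visible in this diff (`Color`, `AsButton`, `Function` wrapped in `HotReload`). The two definitions below are stand-ins, and which plugin callable is actually bound to this entry is not shown in the hunk:

```python
def HotReload(fn):
    # Stand-in for the project's hot-reload wrapper: per the comment in the diff,
    # it lets plugin code changes take effect without restarting the program.
    return fn

def 理解PDF文档内容标准文件输入(*args, **kwargs):
    # Placeholder body; the real plugin lives in crazy_functions/理解PDF文档内容.py.
    pass

function_plugins = {
    "理解PDF文档内容 (模仿ChatPDF)": {
        "Color": "stop",       # button color used by the UI
        "AsButton": False,     # offered in the dropdown menu rather than as a button
        "Function": HotReload(理解PDF文档内容标准文件输入),
    },
}
```
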
@@ -45,7 +45,7 @@ def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
    pfg = PaperFileGroup()

    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        # 定义注释的正则表达式
        comment_pattern = r'%.*'
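The only change in this hunk, and the same one-argument change repeated across several plugins in this compare, is adding `errors='replace'` to the `open()` call: a stray byte that is not valid UTF-8 then decodes to the U+FFFD replacement character instead of raising `UnicodeDecodeError` and aborting the whole plugin run. A self-contained sketch of the difference (the file name and bytes are made up for illustration):

```python
# Write a .tex file that mixes UTF-8 text with a non-UTF-8 (GBK-encoded) byte pair.
path = "mixed_encoding.tex"  # hypothetical file, only for this demonstration
with open(path, "wb") as f:
    f.write("% comment\n".encode("utf-8") + b"\xb2\xe2\n")

try:
    with open(path, "r", encoding="utf-8") as f:          # old call: strict decoding
        f.read()
except UnicodeDecodeError as err:
    print("strict utf-8 decoding fails:", err)

with open(path, "r", encoding="utf-8", errors="replace") as f:   # new call
    print("lenient decoding returns:", repr(f.read()))   # bad bytes become U+FFFD
```
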
@@ -44,7 +44,7 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
    pfg = PaperFileGroup()

    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        # 定义注释的正则表达式
        comment_pattern = r'%.*'

@@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
        # 这个中文的句号是故意的,作为一个标识而存在
        res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
        return [r.replace('。\n', '.') for r in res]
+
+
+
+def read_and_clean_pdf_text(fp):
+    """
+    这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好
+
+    **输入参数说明**
+    - `fp`:需要读取和清理文本的pdf文件路径
+
+    **输出参数说明**
+    - `meta_txt`:清理后的文本内容字符串
+    - `page_one_meta`:第一页清理后的文本内容列表
+
+    **函数功能**
+    读取pdf文件并清理其中的文本内容,清理规则包括:
+    - 提取所有块元的文本信息,并合并为一个字符串
+    - 去除短块(字符数小于100)并替换为回车符
+    - 清理多余的空行
+    - 合并小写字母开头的段落块并替换为空格
+    - 清除重复的换行
+    - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
+    """
+    import fitz, copy
+    import re
+    import numpy as np
+    from colorful import print亮黄, print亮绿
+    fc = 0
+    fs = 1
+    fb = 2
+    REMOVE_FOOT_NOTE = True
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
+    def primary_ffsize(l):
+        fsize_statiscs = {}
+        for wtf in l['spans']:
+            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
+            fsize_statiscs[wtf['size']] += len(wtf['text'])
+        return max(fsize_statiscs, key=fsize_statiscs.get)
+
+    def ffsize_same(a,b):
+        return abs((a-b)/max(a,b)) < 0.02
+    # file_content = ""
+    with fitz.open(fp) as doc:
+        meta_txt = []
+        meta_font = []
+
+        meta_line = []
+        meta_span = []
+        for index, page in enumerate(doc):
+            # file_content += page.get_text()
+            text_areas = page.get_text("dict")  # 获取页面上的文本信息
+            for t in text_areas['blocks']:
+                if 'lines' in t:
+                    pf = 998
+                    for l in t['lines']:
+                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
+                        pf = primary_ffsize(l)
+                        meta_line.append([txt_line, pf, l['bbox'], l])
+                        for wtf in l['spans']: # for l in t['lines']:
+                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
+                    # meta_line.append(["NEW_BLOCK", pf])
+            # 块元提取 for each word segment with in line for each line cross-line words for each block
+            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
+            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
+                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
+            if index == 0:
+                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
+        # 获取正文主字体
+        fsize_statiscs = {}
+        for span in meta_span:
+            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
+            fsize_statiscs[span[1]] += span[2]
+        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
+        if REMOVE_FOOT_NOTE:
+            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+
+        # 切分和重新整合
+        mega_sec = []
+        sec = []
+        for index, line in enumerate(meta_line):
+            if index == 0:
+                sec.append(line[fc])
+                continue
+            if REMOVE_FOOT_NOTE:
+                if meta_line[index][fs] <= give_up_fize_threshold:
+                    continue
+            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
+                # 尝试识别段落
+                if meta_line[index][fc].endswith('.') and\
+                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
+                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
+                    sec[-1] += line[fc]
+                    sec[-1] += "\n\n"
+                else:
+                    sec[-1] += " "
+                    sec[-1] += line[fc]
+            else:
+                if (index+1 < len(meta_line)) and \
+                    meta_line[index][fs] > main_fsize:
+                    # 单行 + 字体大
+                    mega_sec.append(copy.deepcopy(sec))
+                    sec = []
+                    sec.append("# " + line[fc])
+                else:
+                    # 尝试识别section
+                    if meta_line[index-1][fs] > meta_line[index][fs]:
+                        sec.append("\n" + line[fc])
+                    else:
+                        sec.append(line[fc])
+        mega_sec.append(copy.deepcopy(sec))
+
+        finals = []
+        for ms in mega_sec:
+            final = " ".join(ms)
+            final = final.replace('- ', ' ')
+            finals.append(final)
+        meta_txt = finals
+
+        def 把字符太少的块清除为回车(meta_txt):
+            for index, block_txt in enumerate(meta_txt):
+                if len(block_txt) < 100:
+                    meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 把字符太少的块清除为回车(meta_txt)
+
+        def 清理多余的空行(meta_txt):
+            for index in reversed(range(1, len(meta_txt))):
+                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
+                    meta_txt.pop(index)
+            return meta_txt
+        meta_txt = 清理多余的空行(meta_txt)
+
+        def 合并小写开头的段落块(meta_txt):
+            def starts_with_lowercase_word(s):
+                pattern = r"^[a-z]+"
+                match = re.match(pattern, s)
+                if match:
+                    return True
+                else:
+                    return False
+            for _ in range(100):
+                for index, block_txt in enumerate(meta_txt):
+                    if starts_with_lowercase_word(block_txt):
+                        if meta_txt[index-1] != '\n':
+                            meta_txt[index-1] += ' '
+                        else:
+                            meta_txt[index-1] = ''
+                        meta_txt[index-1] += meta_txt[index]
+                        meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 合并小写开头的段落块(meta_txt)
+        meta_txt = 清理多余的空行(meta_txt)
+
+        meta_txt = '\n'.join(meta_txt)
+        # 清除重复的换行
+        for _ in range(5):
+            meta_txt = meta_txt.replace('\n\n', '\n')
+
+        # 换行 -> 双换行
+        meta_txt = meta_txt.replace('\n', '\n\n')
+
+        for f in finals:
+            print亮黄(f)
+            print亮绿('***************************')
+
+    return meta_txt, page_one_meta

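A minimal usage sketch for the helper added above: it takes a PDF path and returns the cleaned full text plus the cleaned text blocks of the first page. It assumes `pymupdf` (imported as `fitz`) is installed and that the call is made from inside the project so the package path resolves; the PDF path is hypothetical.

```python
from crazy_functions.crazy_utils import read_and_clean_pdf_text

# meta_txt: cleaned full text, with paragraphs separated by blank lines
# page_one_meta: list of cleaned text blocks from the first page (used later to locate the abstract)
meta_txt, page_one_meta = read_and_clean_pdf_text("some_paper.pdf")  # hypothetical path
print(meta_txt[:500])
print(page_one_meta[:3])
```
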
@@ -49,7 +49,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
    # 第4步:随便显示点什么防止卡顿的感觉
    for index, fp in enumerate(file_manifest):
        # if 'test_project' in fp: continue
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文,只输出转化后的英文代码,请用代码块输出代码: {os.path.abspath(fp)}'
        i_say_show_user_buffer.append(i_say_show_user)

@@ -72,7 +72,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
        if index > 10:
            time.sleep(60)
            print('Openai 限制免费用户每分钟20次请求,降低请求频率中。')
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        i_say_template = lambda fp, file_content: f'接下来请将以下代码中包含的所有中文转化为英文,只输出代码,文件名是{fp},文件代码是 ```{file_content}```'
        try:

@@ -68,7 +68,7 @@ def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbo
    print('begin analysis on:', file_manifest)
    for index, fp in enumerate(file_manifest):
        if ".tex" in fp:
-            with open(fp, 'r', encoding='utf-8') as f:
+            with open(fp, 'r', encoding='utf-8', errors='replace') as f:
                file_content = f.read()
        if ".pdf" in fp.lower():
            file_content = readPdf(fp)

@@ -2,174 +2,9 @@ from toolbox import CatchException, report_execption, write_results_to_file
from toolbox import update_ui
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+from .crazy_utils import read_and_clean_pdf_text
from colorful import *

-def read_and_clean_pdf_text(fp):
-    """
-    这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好,不建议任何人去读这个函数
-
-    **输入参数说明**
-    - `fp`:需要读取和清理文本的pdf文件路径
-
-    **输出参数说明**
-    - `meta_txt`:清理后的文本内容字符串
-    - `page_one_meta`:第一页清理后的文本内容列表
-
-    **函数功能**
-    读取pdf文件并清理其中的文本内容,清理规则包括:
-    - 提取所有块元的文本信息,并合并为一个字符串
-    - 去除短块(字符数小于100)并替换为回车符
-    - 清理多余的空行
-    - 合并小写字母开头的段落块并替换为空格
-    - 清除重复的换行
-    - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
-    """
-    import fitz, copy
-    import re
-    import numpy as np
-    fc = 0
-    fs = 1
-    fb = 2
-    REMOVE_FOOT_NOTE = True
-    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
-    def primary_ffsize(l):
-        fsize_statiscs = {}
-        for wtf in l['spans']:
-            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
-            fsize_statiscs[wtf['size']] += len(wtf['text'])
-        return max(fsize_statiscs, key=fsize_statiscs.get)
-
-    def ffsize_same(a,b):
-        return abs((a-b)/max(a,b)) < 0.02
-    # file_content = ""
-    with fitz.open(fp) as doc:
-        meta_txt = []
-        meta_font = []
-
-        meta_line = []
-        meta_span = []
-        for index, page in enumerate(doc):
-            # file_content += page.get_text()
-            text_areas = page.get_text("dict")  # 获取页面上的文本信息
-            for t in text_areas['blocks']:
-                if 'lines' in t:
-                    pf = 998
-                    for l in t['lines']:
-                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
-                        pf = primary_ffsize(l)
-                        meta_line.append([txt_line, pf, l['bbox'], l])
-                        for wtf in l['spans']: # for l in t['lines']:
-                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
-                    # meta_line.append(["NEW_BLOCK", pf])
-            # 块元提取 for each word segment with in line for each line cross-line words for each block
-            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
-                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
-            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
-                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
-            if index == 0:
-                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
-                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
-        # 获取正文主字体
-        fsize_statiscs = {}
-        for span in meta_span:
-            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
-            fsize_statiscs[span[1]] += span[2]
-        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
-        if REMOVE_FOOT_NOTE:
-            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
-
-        # 切分和重新整合
-        mega_sec = []
-        sec = []
-        for index, line in enumerate(meta_line):
-            if index == 0:
-                sec.append(line[fc])
-                continue
-            if REMOVE_FOOT_NOTE:
-                if meta_line[index][fs] <= give_up_fize_threshold:
-                    continue
-            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
-                # 尝试识别段落
-                if meta_line[index][fc].endswith('.') and\
-                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
-                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
-                    sec[-1] += line[fc]
-                    sec[-1] += "\n\n"
-                else:
-                    sec[-1] += " "
-                    sec[-1] += line[fc]
-            else:
-                if (index+1 < len(meta_line)) and \
-                    meta_line[index][fs] > main_fsize:
-                    # 单行 + 字体大
-                    mega_sec.append(copy.deepcopy(sec))
-                    sec = []
-                    sec.append("# " + line[fc])
-                else:
-                    # 尝试识别section
-                    if meta_line[index-1][fs] > meta_line[index][fs]:
-                        sec.append("\n" + line[fc])
-                    else:
-                        sec.append(line[fc])
-        mega_sec.append(copy.deepcopy(sec))
-
-        finals = []
-        for ms in mega_sec:
-            final = " ".join(ms)
-            final = final.replace('- ', ' ')
-            finals.append(final)
-        meta_txt = finals
-
-        def 把字符太少的块清除为回车(meta_txt):
-            for index, block_txt in enumerate(meta_txt):
-                if len(block_txt) < 100:
-                    meta_txt[index] = '\n'
-            return meta_txt
-        meta_txt = 把字符太少的块清除为回车(meta_txt)
-
-        def 清理多余的空行(meta_txt):
-            for index in reversed(range(1, len(meta_txt))):
-                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
-                    meta_txt.pop(index)
-            return meta_txt
-        meta_txt = 清理多余的空行(meta_txt)
-
-        def 合并小写开头的段落块(meta_txt):
-            def starts_with_lowercase_word(s):
-                pattern = r"^[a-z]+"
-                match = re.match(pattern, s)
-                if match:
-                    return True
-                else:
-                    return False
-            for _ in range(100):
-                for index, block_txt in enumerate(meta_txt):
-                    if starts_with_lowercase_word(block_txt):
-                        if meta_txt[index-1] != '\n':
-                            meta_txt[index-1] += ' '
-                        else:
-                            meta_txt[index-1] = ''
-                        meta_txt[index-1] += meta_txt[index]
-                        meta_txt[index] = '\n'
-            return meta_txt
-        meta_txt = 合并小写开头的段落块(meta_txt)
-        meta_txt = 清理多余的空行(meta_txt)
-
-        meta_txt = '\n'.join(meta_txt)
-        # 清除重复的换行
-        for _ in range(5):
-            meta_txt = meta_txt.replace('\n\n', '\n')
-
-        # 换行 -> 双换行
-        meta_txt = meta_txt.replace('\n', '\n\n')
-
-        for f in finals:
-            print亮黄(f)
-            print亮绿('***************************')
-
-    return meta_txt, page_one_meta
-
-
@CatchException
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
    import glob

@@ -1,142 +1,67 @@
from toolbox import update_ui
from toolbox import CatchException, report_execption
-import re
-import unicodedata
+from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
fast_debug = False

-def is_paragraph_break(match):
-    """
-    根据给定的匹配结果来判断换行符是否表示段落分隔。
-    如果换行符前为句子结束标志(句号,感叹号,问号),且下一个字符为大写字母,则换行符更有可能表示段落分隔。
-    也可以根据之前的内容长度来判断段落是否已经足够长。
-    """
-    prev_char, next_char = match.groups()
-
-    # 句子结束标志
-    sentence_endings = ".!?"
-
-    # 设定一个最小段落长度阈值
-    min_paragraph_length = 140
-
-    if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
-        return "\n\n"
-    else:
-        return " "
-
-def normalize_text(text):
-    """
-    通过把连字(ligatures)等文本特殊符号转换为其基本形式来对文本进行归一化处理。
-    例如,将连字 "fi" 转换为 "f" 和 "i"。
-    """
-    # 对文本进行归一化处理,分解连字
-    normalized_text = unicodedata.normalize("NFKD", text)
-
-    # 替换其他特殊字符
-    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
-
-    return cleaned_text
-
-def clean_text(raw_text):
-    """
-    对从 PDF 提取出的原始文本进行清洗和格式化处理。
-    1. 对原始文本进行归一化处理。
-    2. 替换跨行的连词,例如 “Espe-\ncially” 转换为 “Especially”。
-    3. 根据 heuristic 规则判断换行符是否是段落分隔,并相应地进行替换。
-    """
-    # 对文本进行归一化处理
-    normalized_text = normalize_text(raw_text)
-
-    # 替换跨行的连词
-    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
-
-    # 根据前后相邻字符的特点,找到原文本中的换行符
-    newlines = re.compile(r'(\S)\n(\S)')
-
-    # 根据 heuristic 规则,用空格或段落分隔符替换原换行符
-    final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
-
-    return final_text.strip()
-
def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
-    import time, glob, os, fitz
+    import tiktoken
    print('begin analysis on:', file_name)
-    with fitz.open(file_name) as doc:
-        file_content = ""
-        for page in doc:
-            file_content += page.get_text()
-        file_content = clean_text(file_content)
-        # print(file_content)
-    split_number = 10000
-    split_group = (len(file_content)//split_number)+1
-    for i in range(0,split_group):
-        if i==0:
-            prefix = "接下来请你仔细分析下面的论文,学习里面的内容(专业术语、公式、数学概念).并且注意:由于论文内容较多,将分批次发送,每次发送完之后,你只需要回答“接受完成”"
-            i_say = prefix + f'文件名是{file_name},文章内容第{i+1}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'文件名是:\n{file_name},\n由于论文内容过长,将分批请求(共{len(file_content)}字符,将分为{split_group}批,每批{split_number}字符)。\n当前发送{i+1}/{split_group}部分'
-        elif i==split_group-1:
-            i_say = f'你只需要回答“所有论文接受完成,请进行下一步”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:]}```'
-            i_say_show_user = f'当前发送{i+1}/{split_group}部分'
-        else:
-            i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'当前发送{i+1}/{split_group}部分'
-        chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
-        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="") # 带超时倒计时
-        while "完成" not in gpt_say:
-            i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'出现error,重新发送{i+1}/{split_group}部分'
-            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="") # 带超时倒计时
-            time.sleep(1)
-        chatbot[-1] = (i_say_show_user, gpt_say)
-        history.append(i_say_show_user); history.append(gpt_say)
-        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-        time.sleep(2)
-
-    i_say = f'接下来,请你扮演一名专业的学术教授,利用你的所有知识并且结合这篇文章,回答我的问题。(请牢记:1.直到我说“退出”,你才能结束任务;2.所有问题需要紧密围绕文章内容;3.如果有公式,请使用tex渲染)'
-    chatbot.append((i_say, "[Local Message] waiting gpt response."))
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
-    # ** gpt request **
-    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say, llm_kwargs, chatbot, history=history, sys_prompt="") # 带超时倒计时
-    chatbot[-1] = (i_say, gpt_say)
-    history.append(i_say); history.append(gpt_say)
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
-
-@CatchException
-def 理解PDF文档内容(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
-    import glob, os
-
-    # 基本信息:功能、贡献者
-    chatbot.append([
-        "函数插件功能?",
-        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe。"])
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
-    import tkinter as tk
-    from tkinter import filedialog
-
-    root = tk.Tk()
-    root.withdraw()
-    txt = filedialog.askopenfilename()
-
-    # 尝试导入依赖,如果缺少依赖,则给出安装建议
-    try:
-        import fitz
-    except:
-        report_execption(chatbot, history,
-            a = f"解析项目: {txt}",
-            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
-        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-        return
-
-    # 清空历史,以免输入溢出
-    history = []
-
-    # 开始正式执行任务
-    yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
-
+    file_content, page_one = read_and_clean_pdf_text(file_name)
+
+    ############################## <第零步,从摘要中提取高价值信息,放到history中> ##################################
+    # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割)
+    # 的长度必须小于 2500 个 Token
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+
+    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from toolbox import get_conf
+    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_num(txt): return len(enc.encode(txt))
+    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
+    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    # 为了更好的效果,我们剥离Introduction之后的部分(如果有)
+    paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+    ############################## <第一步,从摘要中提取高价值信息,放到history中> ##################################
+    final_results = []
+    final_results.append(paper_meta)
+
+    ############################## <第二步,迭代地历遍整个文章,提取精炼信息> ##################################
+    i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。" # 用户提示
+    chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[]) # 更新UI
+
+    iteration_results = []
+    last_iteration_result = paper_meta # 初始值是摘要
+    MAX_WORD_TOTAL = 4096
+    n_fragment = len(paper_fragments)
+    if n_fragment >= 20: print('文章极长,不能达到预期效果')
+    for i in range(n_fragment):
+        NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+        i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问
+                                                                           llm_kwargs, chatbot,
+                                                                           history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
+                                                                           sys_prompt="Extract the main idea of this section." # 提示
+                                                                           )
+        iteration_results.append(gpt_say)
+        last_iteration_result = gpt_say
+
+    ############################## <第三步,整理history> ##################################
+    final_results.extend(iteration_results)
+    final_results.append(f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。')
+    # 接下来两句话只显示在界面上,不起实际作用
+    i_say_show_user = f'接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
+    chatbot.append([i_say_show_user, gpt_say])
+
+    ############################## <第四步,设置一个token上限,防止回答时Token溢出> ##################################
+    from .crazy_utils import input_clipping
+    _, final_results = input_clipping("", final_results, max_token_limit=3200)
+    yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
+

@CatchException
@@ -146,7 +71,7 @@ def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chat
    # 基本信息:功能、贡献者
    chatbot.append([
        "函数插件功能?",
-        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe。"])
+        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

    # 尝试导入依赖,如果缺少依赖,则给出安装建议

|
|||||||
import time, os
|
import time, os
|
||||||
print('begin analysis on:', file_manifest)
|
print('begin analysis on:', file_manifest)
|
||||||
for index, fp in enumerate(file_manifest):
|
for index, fp in enumerate(file_manifest):
|
||||||
with open(fp, 'r', encoding='utf-8') as f:
|
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
|
||||||
file_content = f.read()
|
file_content = f.read()
|
||||||
|
|
||||||
i_say = f'请对下面的程序文件做一个概述,并对文件中的所有函数生成注释,使用markdown表格输出结果,文件名是{os.path.relpath(fp, project_folder)},文件内容是 ```{file_content}```'
|
i_say = f'请对下面的程序文件做一个概述,并对文件中的所有函数生成注释,使用markdown表格输出结果,文件名是{os.path.relpath(fp, project_folder)},文件内容是 ```{file_content}```'
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,

    ############################## <第一步,逐个文件分析,多线程> ##################################
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()
        prefix = "接下来请你逐文件分析下面的工程" if index==0 else ""
        i_say = prefix + f'请对下面的程序文件做一个概述文件名是{os.path.relpath(fp, project_folder)},文件代码是 ```{file_content}```'

@@ -8,7 +8,7 @@ def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbo
    import time, glob, os
    print('begin analysis on:', file_manifest)
    for index, fp in enumerate(file_manifest):
-        with open(fp, 'r', encoding='utf-8') as f:
+        with open(fp, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()

        prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""

version (4 lines changed)

@@ -1,5 +1,5 @@
{
-  "version": 2.67,
+  "version": 2.68,
  "show_feature": true,
-  "new_feature": "现可通过输入区更新临时api-key <-> 增强多线程稳定性(涉及代码解析、PDF翻译、自译解等) <-> 修复Token计数错误(解决PDF翻译的分割不合理的问题) <-> 如果一键更新失败,可前往github手动更新"
+  "new_feature": "改善理解pdf(chatpdf)功能 <-> 如果一键更新失败,可前往github手动更新"
}