Compare commits

..

46 Commits

Author SHA1 Message Date
binary-husky
7894e2f02d renew pip url 2024-02-25 22:15:24 +08:00
binary-husky
9e9bd7aa27 Merge branch 'master' into huggingfacelocal 2024-02-25 22:14:50 +08:00
binary-husky
c534814363 Merge branch 'master' into huggingfacelocal 2024-01-23 15:52:51 +08:00
binary-husky
8a525a4560 ADD_WAIFU 2023-12-27 00:04:53 +08:00
binary-husky
22f58d0953 Toggle dark mode in config.py 2023-12-26 23:58:53 +08:00
binary-husky
59b4345945 Merge branch 'master' into huggingfacelocal 2023-12-26 23:57:23 +08:00
binary-husky
72ba7e9738 Merge branch 'master' into huggingfacelocal 2023-11-29 00:35:20 +08:00
binary-husky
d7f4b07fe4 Merge branch 'master' into huggingfacelocal 2023-11-20 01:17:04 +08:00
binary-husky
df27843e51 Merge branch 'master' into huggingfacelocal 2023-10-06 11:59:18 +08:00
binary-husky
e2fefec2e3 减小Latex容器体积 2023-10-06 11:47:27 +08:00
binary-husky
2bf71bd1a8 Chuanhu-Small-and-Beautiful 2023-09-15 17:30:31 +08:00
binary-husky
6a56fb7477 Merge branch 'master' into huggingfacelocal 2023-09-15 17:19:59 +08:00
binary-husky
a2e7ea748c up 2023-09-09 19:02:51 +08:00
binary-husky
8153c1b49d Merge branch 'master' into huggingfacelocal 2023-09-09 18:54:53 +08:00
binary-husky
2552126744 up 2023-08-28 01:43:45 +08:00
binary-husky
2475163337 Merge branch 'master' into huggingfacelocal 2023-08-28 01:39:20 +08:00
binary-husky
a3bdb69e30 sync 2023-08-16 13:28:44 +08:00
binary-husky
81874b380f qw 2023-08-16 13:01:41 +08:00
binary-husky
96c1852abc Merge branch 'master' into huggingface 2023-06-30 12:09:25 +08:00
binary-husky
cd145c0794 1 2023-06-29 15:04:03 +08:00
binary-husky
7a4d4ad956 Merge branch 'huggingface' of github.com:binary-husky/chatgpt_academic into huggingface 2023-06-29 12:54:24 +08:00
binary-husky
9f9848c6e9 again 2023-06-29 12:54:19 +08:00
binary-husky
94425c49fd again 2023-05-28 21:34:50 +08:00
binary-husky
e874a16050 try again 2023-05-28 21:33:28 +08:00
binary-husky
c28388c5fe load version 2023-05-28 21:32:10 +08:00
binary-husky
b4a56d391b Merge branch 'huggingface' of github.com:binary-husky/chatgpt_academic into huggingface 2023-05-28 21:30:34 +08:00
binary-husky
7075092f86 fix app 2023-05-28 21:30:29 +08:00
binary-husky
1086ff8092 Merge branch 'huggingface' of github.com:binary-husky/chatgpt_academic into huggingface 2023-05-28 21:27:31 +08:00
binary-husky
3a22446b47 try4 2023-05-28 21:27:25 +08:00
binary-husky
7842cf03cc Merge branch 'master' into huggingface 2023-05-28 21:27:20 +08:00
binary-husky
54f55c32f2 213 2023-05-28 21:25:45 +08:00
binary-husky
94318ff0a2 try3 2023-05-28 21:24:46 +08:00
binary-husky
5be6b83762 try2 2023-05-28 21:24:02 +08:00
binary-husky
6f18d1716e Merge branch 'master' into huggingface 2023-05-28 21:21:12 +08:00
binary-husky
90944bd744 up 2023-05-25 15:04:53 +08:00
binary-husky
752937cb70 Merge branch 'master' into huggingface 2023-05-25 15:01:30 +08:00
binary-husky
c584cbac5b fix ver 2023-05-19 14:08:47 +08:00
binary-husky
309d12b404 Merge branch 'master' into huggingface 2023-05-19 14:05:23 +08:00
binary-husky
52ea0acd61 Merge branch 'master' into huggingface 2023-05-06 23:06:53 +08:00
binary-husky
9f5e3e0fd5 Merge branch 'master' into huggingface 2023-05-05 18:24:36 +08:00
binary-husky
315e78e5d9 Merge branch 'master' into huggingface 2023-04-29 03:53:32 +08:00
binary-husky
b6b4ba684a Merge branch 'master' into huggingface 2023-04-24 18:32:56 +08:00
binary-husky
2281a5ca7f 修改提示 2023-04-24 12:55:53 +08:00
binary-husky
49558686f2 Merge branch 'master' into huggingface 2023-04-24 12:30:59 +08:00
Your Name
b050ccedb5 Merge branch 'master' into huggingface 2023-04-21 18:48:00 +08:00
Your Name
ae56cab6f4 huggingface 2023-04-19 18:07:32 +08:00
236 changed files with 3165 additions and 20816 deletions

View File

@@ -1,14 +1,14 @@
# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
name: build-with-latex-arm
name: build-with-all-capacity-beta
on:
push:
branches:
- "master"
- 'master'
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}_with_latex_arm
IMAGE_NAME: ${{ github.repository }}_with_all_capacity_beta
jobs:
build-and-push-image:
@@ -18,17 +18,11 @@ jobs:
packages: write
steps:
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Log in to the Container registry
uses: docker/login-action@v3
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
@@ -41,11 +35,10 @@ jobs:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Build and push Docker image
uses: docker/build-push-action@v6
uses: docker/build-push-action@v4
with:
context: .
push: true
platforms: linux/arm64
file: docs/GithubAction+NoLocal+Latex
file: docs/GithubAction+AllCapacityBeta
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

View File

@@ -0,0 +1,44 @@
# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
name: build-with-jittorllms
on:
push:
branches:
- 'master'
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}_jittorllms
jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Log in to the Container registry
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Build and push Docker image
uses: docker/build-push-action@v4
with:
context: .
push: true
file: docs/GithubAction+JittorLLMs
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

9
.gitignore vendored
View File

@@ -131,9 +131,6 @@ dmypy.json
# Pyre type checker
.pyre/
# macOS files
.DS_Store
.vscode
.idea
@@ -156,9 +153,3 @@ media
flagged
request_llms/ChatGLM-6b-onnx-u8s8
.pre-commit-config.yaml
test.*
temp.*
objdump*
*.min.*.js
TODO
*.cursorrules

View File

@@ -12,16 +12,11 @@ RUN echo '[global]' > /etc/pip.conf && \
echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf
# 语音输出功能以下两行第一行更换阿里源第二行安装ffmpeg都可以删除
RUN UBUNTU_VERSION=$(awk -F= '/^VERSION_CODENAME=/{print $2}' /etc/os-release); echo "deb https://mirrors.aliyun.com/debian/ $UBUNTU_VERSION main non-free contrib" > /etc/apt/sources.list; apt-get update
RUN apt-get install ffmpeg -y
# 进入工作路径(必要)
WORKDIR /gpt
# 安装大部分依赖利用Docker缓存加速以后的构建 (以下行,可以删除)
# 安装大部分依赖利用Docker缓存加速以后的构建 (以下行,可以删除)
COPY requirements.txt ./
RUN pip3 install -r requirements.txt

View File

@@ -1,9 +1,20 @@
> [!IMPORTANT]
> 2024.10.10: 突发停电,紧急恢复了提供[whl包](https://drive.google.com/file/d/19U_hsLoMrjOlQSzYS3pzWX9fTzyusArP/view?usp=sharing)的文件服务器
> 2024.10.8: 版本3.90加入对llama-index的初步支持版本3.80加入插件二级菜单功能详见wiki
> 2024.5.1: 加入Doc2x翻译PDF论文的功能[查看详情](https://github.com/binary-husky/gpt_academic/wiki/Doc2x)
> 2024.3.11: 全力支持Qwen、GLM、DeepseekCoder等中文大语言模型 SoVits语音克隆模块[查看详情](https://www.bilibili.com/video/BV1Rp421S7tF/)
> 2024.1.17: 安装依赖时,请选择`requirements.txt`中**指定的版本**。 安装命令:`pip install -r requirements.txt`。本项目完全开源免费,您可通过订阅[在线服务](https://github.com/binary-husky/gpt_academic/wiki/online)的方式鼓励本项目的发展。
---
title: GPT-Academic
emoji: 😻
colorFrom: blue
colorTo: blue
sdk: gradio
sdk_version: 3.32.0
app_file: app.py
pinned: false
---
# ChatGPT 学术优化
> **Note**
>
> 2023.11.12: 某些依赖包尚不兼容python 3.12推荐python 3.11。
>
> 2023.12.26: 安装依赖时,请选择`requirements.txt`中**指定的版本**。 安装命令:`pip install -r requirements.txt`。本项目完全开源免费,您可通过订阅[在线服务](https://github.com/binary-husky/gpt_academic/wiki/online)的方式鼓励本项目的发展。
<br>
@@ -68,7 +79,7 @@ Read this in [English](docs/README.English.md) | [日本語](docs/README.Japanes
读论文、[翻译](https://www.bilibili.com/video/BV1KT411x7Wn)论文 | [插件] 一键解读latex/pdf论文全文并生成摘要
Latex全文[翻译](https://www.bilibili.com/video/BV1nk4y1Y7Js/)、[润色](https://www.bilibili.com/video/BV1FT411H7c5/) | [插件] 一键翻译或润色latex论文
批量注释生成 | [插件] 一键批量生成函数注释
Markdown[中英互译](https://www.bilibili.com/video/BV1yo4y157jV/) | [插件] 看到上面5种语言的[README](https://github.com/binary-husky/gpt_academic/blob/master/docs/README.English.md)了吗?就是出自他的手笔
Markdown[中英互译](https://www.bilibili.com/video/BV1yo4y157jV/) | [插件] 看到上面5种语言的[README](https://github.com/binary-husky/gpt_academic/blob/master/docs/README_EN.md)了吗?就是出自他的手笔
[PDF论文全文翻译功能](https://www.bilibili.com/video/BV1KT411x7Wn) | [插件] PDF论文提取题目&摘要+翻译全文(多线程)
[Arxiv小助手](https://www.bilibili.com/video/BV1LM4y1279X) | [插件] 输入arxiv文章url即可一键翻译摘要+下载PDF
Latex论文一键校对 | [插件] 仿Grammarly对Latex文章进行语法、拼写纠错+输出对照PDF
@@ -88,10 +99,6 @@ Latex论文一键校对 | [插件] 仿Grammarly对Latex文章进行语法、拼
<img src="https://user-images.githubusercontent.com/96192199/279702205-d81137c3-affd-4cd1-bb5e-b15610389762.gif" width="700" >
</div>
<div align="center">
<img src="https://github.com/binary-husky/gpt_academic/assets/96192199/70ff1ec5-e589-4561-a29e-b831079b37fb.gif" width="700" >
</div>
- 所有按钮都通过读取functional.py动态生成可随意加自定义功能解放剪贴板
<div align="center">
@@ -258,7 +265,8 @@ P.S. 如果需要依赖Latex的插件功能请见Wiki。另外您也可以
# Advanced Usage
### I自定义新的便捷按钮学术快捷键
现在已可以通过UI中的`界面外观`菜单中的`自定义菜单`添加新的便捷按钮。如果需要在代码中定义,请使用任意文本编辑器打开`core_functional.py`,添加如下条目即可:
任意文本编辑器打开`core_functional.py`,添加如下条目,然后重启程序。(如果按钮已存在,那么可以直接修改(前缀、后缀都已支持热修改),无需重启程序即可生效。)
例如
```python
"超级英译中": {

412
app.py Normal file
View File

@@ -0,0 +1,412 @@
import os; os.environ['no_proxy'] = '*' # 避免代理网络产生意外污染
help_menu_description = \
"""Github源代码开源和更新[地址🚀](https://github.com/binary-husky/gpt_academic),
感谢热情的[开发者们❤️](https://github.com/binary-husky/gpt_academic/graphs/contributors).
</br></br>常见问题请查阅[项目Wiki](https://github.com/binary-husky/gpt_academic/wiki),
如遇到Bug请前往[Bug反馈](https://github.com/binary-husky/gpt_academic/issues).
</br></br>普通对话使用说明: 1. 输入问题; 2. 点击提交
</br></br>基础功能区使用说明: 1. 输入文本; 2. 点击任意基础功能区按钮
</br></br>函数插件区使用说明: 1. 输入路径/问题, 或者上传文件; 2. 点击任意函数插件区按钮
</br></br>虚空终端使用说明: 点击虚空终端, 然后根据提示输入指令, 再次点击虚空终端
</br></br>如何保存对话: 点击保存当前的对话按钮
</br></br>如何语音对话: 请阅读Wiki
</br></br>如何临时更换API_KEY: 在输入区输入临时API_KEY后提交网页刷新后失效"""
def main():
import subprocess, sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://public.agent-matrix.com/publish/gradio-3.32.8-py3-none-any.whl'])
import gradio as gr
if gr.__version__ not in ['3.32.8']:
raise ModuleNotFoundError("使用项目内置Gradio获取最优体验! 请运行 `pip install -r requirements.txt` 指令安装内置Gradio及其他依赖, 详情信息见requirements.txt.")
from request_llms.bridge_all import predict
from toolbox import format_io, find_free_port, on_file_uploaded, on_report_generated, get_conf, ArgsGeneralWrapper, load_chat_cookies, DummyWith
# 建议您复制一个config_private.py放自己的秘密, 如API和代理网址
proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION = get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION')
CHATBOT_HEIGHT, LAYOUT, AVAIL_LLM_MODELS, AUTO_CLEAR_TXT = get_conf('CHATBOT_HEIGHT', 'LAYOUT', 'AVAIL_LLM_MODELS', 'AUTO_CLEAR_TXT')
ENABLE_AUDIO, AUTO_CLEAR_TXT, PATH_LOGGING, AVAIL_THEMES, THEME, ADD_WAIFU = get_conf('ENABLE_AUDIO', 'AUTO_CLEAR_TXT', 'PATH_LOGGING', 'AVAIL_THEMES', 'THEME', 'ADD_WAIFU')
DARK_MODE, NUM_CUSTOM_BASIC_BTN, SSL_KEYFILE, SSL_CERTFILE = get_conf('DARK_MODE', 'NUM_CUSTOM_BASIC_BTN', 'SSL_KEYFILE', 'SSL_CERTFILE')
INIT_SYS_PROMPT = get_conf('INIT_SYS_PROMPT')
# 如果WEB_PORT是-1, 则随机选取WEB端口
PORT = find_free_port() if WEB_PORT <= 0 else WEB_PORT
from check_proxy import get_current_version
from themes.theme import adjust_theme, advanced_css, theme_declaration, js_code_clear, js_code_reset, js_code_show_or_hide, js_code_show_or_hide_group2
from themes.theme import js_code_for_css_changing, js_code_for_toggle_darkmode, js_code_for_persistent_cookie_init
from themes.theme import load_dynamic_theme, to_cookie_str, from_cookie_str, init_cookie
title_html = f"<h1 align=\"center\">GPT 学术优化 {get_current_version()}</h1>{theme_declaration}"
# 问询记录, python 版本建议3.9+(越新越好)
import logging, uuid
os.makedirs(PATH_LOGGING, exist_ok=True)
try:logging.basicConfig(filename=f"{PATH_LOGGING}/chat_secrets.log", level=logging.INFO, encoding="utf-8", format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
except:logging.basicConfig(filename=f"{PATH_LOGGING}/chat_secrets.log", level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
# Disable logging output from the 'httpx' logger
logging.getLogger("httpx").setLevel(logging.WARNING)
print(f"所有问询记录将自动保存在本地目录./{PATH_LOGGING}/chat_secrets.log, 请注意自我隐私保护哦!")
# 一些普通功能模块
from core_functional import get_core_functions
functional = get_core_functions()
# 高级函数插件
from crazy_functional import get_crazy_functions
DEFAULT_FN_GROUPS = get_conf('DEFAULT_FN_GROUPS')
plugins = get_crazy_functions()
all_plugin_groups = list(set([g for _, plugin in plugins.items() for g in plugin['Group'].split('|')]))
match_group = lambda tags, groups: any([g in groups for g in tags.split('|')])
# 处理markdown文本格式的转变
gr.Chatbot.postprocess = format_io
# 做一些外观色彩上的调整
set_theme = adjust_theme()
# 代理与自动更新
from check_proxy import check_proxy, auto_update, warm_up_modules
proxy_info = check_proxy(proxies)
gr_L1 = lambda: gr.Row().style()
gr_L2 = lambda scale, elem_id: gr.Column(scale=scale, elem_id=elem_id, min_width=400)
if LAYOUT == "TOP-DOWN":
gr_L1 = lambda: DummyWith()
gr_L2 = lambda scale, elem_id: gr.Row()
CHATBOT_HEIGHT /= 2
cancel_handles = []
customize_btns = {}
predefined_btns = {}
with gr.Blocks(title="GPT 学术优化", theme=set_theme, analytics_enabled=False, css=advanced_css) as demo:
gr.HTML(title_html)
gr.HTML('''<center><a href="https://huggingface.co/spaces/qingxu98/gpt-academic?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>请您打开此页面后务必点击上方的“复制空间”Duplicate Space按钮<font color="#FF00FF">使用时先在输入框填入API-KEY然后回车。</font><br/>切忌在“复制空间”Duplicate Space之前填入API_KEY或进行提问否则您的API_KEY将极可能被空间所有者攫取<br/>支持任意数量的OpenAI的密钥和API2D的密钥共存例如输入"OpenAI密钥1,API2D密钥2",然后提交,即可同时使用两种模型接口。</center>''')
secret_css, dark_mode, py_pickle_cookie = gr.Textbox(visible=False), gr.Textbox(DARK_MODE, visible=False), gr.Textbox(visible=False)
cookies = gr.State(load_chat_cookies())
with gr_L1():
with gr_L2(scale=2, elem_id="gpt-chat"):
chatbot = gr.Chatbot(label=f"当前模型:{LLM_MODEL}", elem_id="gpt-chatbot")
if LAYOUT == "TOP-DOWN": chatbot.style(height=CHATBOT_HEIGHT)
history = gr.State([])
with gr_L2(scale=1, elem_id="gpt-panel"):
with gr.Accordion("输入区", open=True, elem_id="input-panel") as area_input_primary:
with gr.Row():
txt = gr.Textbox(show_label=False, lines=2, placeholder="输入问题或API密钥输入多个密钥时用英文逗号间隔。支持多个OpenAI密钥共存。").style(container=False)
with gr.Row():
submitBtn = gr.Button("提交", elem_id="elem_submit", variant="primary")
with gr.Row():
resetBtn = gr.Button("重置", elem_id="elem_reset", variant="secondary"); resetBtn.style(size="sm")
stopBtn = gr.Button("停止", elem_id="elem_stop", variant="secondary"); stopBtn.style(size="sm")
clearBtn = gr.Button("清除", elem_id="elem_clear", variant="secondary", visible=False); clearBtn.style(size="sm")
if ENABLE_AUDIO:
with gr.Row():
audio_mic = gr.Audio(source="microphone", type="numpy", elem_id="elem_audio", streaming=True, show_label=False).style(container=False)
with gr.Row():
status = gr.Markdown(f"Tip: 按Enter提交, 按Shift+Enter换行。当前模型: {LLM_MODEL} \n {proxy_info}", elem_id="state-panel")
with gr.Accordion("基础功能区", open=True, elem_id="basic-panel") as area_basic_fn:
with gr.Row():
for k in range(NUM_CUSTOM_BASIC_BTN):
customize_btn = gr.Button("自定义按钮" + str(k+1), visible=False, variant="secondary", info_str=f'基础功能区: 自定义按钮')
customize_btn.style(size="sm")
customize_btns.update({"自定义按钮" + str(k+1): customize_btn})
for k in functional:
if ("Visible" in functional[k]) and (not functional[k]["Visible"]): continue
variant = functional[k]["Color"] if "Color" in functional[k] else "secondary"
functional[k]["Button"] = gr.Button(k, variant=variant, info_str=f'基础功能区: {k}')
functional[k]["Button"].style(size="sm")
predefined_btns.update({k: functional[k]["Button"]})
with gr.Accordion("函数插件区", open=True, elem_id="plugin-panel") as area_crazy_fn:
with gr.Row():
gr.Markdown("插件可读取“输入区”文本/路径作为参数(上传文件自动修正路径)")
with gr.Row(elem_id="input-plugin-group"):
plugin_group_sel = gr.Dropdown(choices=all_plugin_groups, label='', show_label=False, value=DEFAULT_FN_GROUPS,
multiselect=True, interactive=True, elem_classes='normal_mut_select').style(container=False)
with gr.Row():
for k, plugin in plugins.items():
if not plugin.get("AsButton", True): continue
visible = True if match_group(plugin['Group'], DEFAULT_FN_GROUPS) else False
variant = plugins[k]["Color"] if "Color" in plugin else "secondary"
info = plugins[k].get("Info", k)
plugin['Button'] = plugins[k]['Button'] = gr.Button(k, variant=variant,
visible=visible, info_str=f'函数插件区: {info}').style(size="sm")
with gr.Row():
with gr.Accordion("更多函数插件", open=True):
dropdown_fn_list = []
for k, plugin in plugins.items():
if not match_group(plugin['Group'], DEFAULT_FN_GROUPS): continue
if not plugin.get("AsButton", True): dropdown_fn_list.append(k) # 排除已经是按钮的插件
elif plugin.get('AdvancedArgs', False): dropdown_fn_list.append(k) # 对于需要高级参数的插件,亦在下拉菜单中显示
with gr.Row():
dropdown = gr.Dropdown(dropdown_fn_list, value=r"打开插件列表", label="", show_label=False).style(container=False)
with gr.Row():
plugin_advanced_arg = gr.Textbox(show_label=True, label="高级参数输入区", visible=False,
placeholder="这里是特殊函数插件的高级参数输入区").style(container=False)
with gr.Row():
switchy_bt = gr.Button(r"请先从插件列表中选择", variant="secondary").style(size="sm")
with gr.Row():
with gr.Accordion("点击展开“文件下载区”。", open=False) as area_file_up:
file_upload = gr.Files(label="任何文件, 推荐上传压缩文件(zip, tar)", file_count="multiple", elem_id="elem_upload")
with gr.Floating(init_x="0%", init_y="0%", visible=True, width=None, drag="forbidden", elem_id="tooltip"):
with gr.Row():
with gr.Tab("上传文件", elem_id="interact-panel"):
gr.Markdown("请上传本地文件/压缩包供“函数插件区”功能调用。请注意: 上传文件后会自动把输入区修改为相应路径。")
file_upload_2 = gr.Files(label="任何文件, 推荐上传压缩文件(zip, tar)", file_count="multiple", elem_id="elem_upload_float")
with gr.Tab("更换模型", elem_id="interact-panel"):
md_dropdown = gr.Dropdown(AVAIL_LLM_MODELS, value=LLM_MODEL, label="更换LLM模型/请求源").style(container=False)
top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.01,interactive=True, label="Top-p (nucleus sampling)",)
temperature = gr.Slider(minimum=-0, maximum=2.0, value=1.0, step=0.01, interactive=True, label="Temperature",)
max_length_sl = gr.Slider(minimum=256, maximum=1024*32, value=4096, step=128, interactive=True, label="Local LLM MaxLength",)
system_prompt = gr.Textbox(show_label=True, lines=2, placeholder=f"System Prompt", label="System prompt", value=INIT_SYS_PROMPT)
with gr.Tab("界面外观", elem_id="interact-panel"):
theme_dropdown = gr.Dropdown(AVAIL_THEMES, value=THEME, label="更换UI主题").style(container=False)
checkboxes = gr.CheckboxGroup(["基础功能区", "函数插件区", "浮动输入区", "输入清除键", "插件参数区"], value=["基础功能区", "函数插件区"], label="显示/隐藏功能区", elem_id='cbs').style(container=False)
opt = ["自定义菜单"]
value=[]
if ADD_WAIFU: opt += ["添加Live2D形象"]; value += ["添加Live2D形象"]
checkboxes_2 = gr.CheckboxGroup(opt, value=value, label="显示/隐藏自定义菜单", elem_id='cbsc').style(container=False)
dark_mode_btn = gr.Button("切换界面明暗 ☀", variant="secondary").style(size="sm")
dark_mode_btn.click(None, None, None, _js=js_code_for_toggle_darkmode)
with gr.Tab("帮助", elem_id="interact-panel"):
gr.Markdown(help_menu_description)
with gr.Floating(init_x="20%", init_y="50%", visible=False, width="40%", drag="top") as area_input_secondary:
with gr.Accordion("浮动输入区", open=True, elem_id="input-panel2"):
with gr.Row() as row:
row.style(equal_height=True)
with gr.Column(scale=10):
txt2 = gr.Textbox(show_label=False, placeholder="Input question here.",
elem_id='user_input_float', lines=8, label="输入区2").style(container=False)
with gr.Column(scale=1, min_width=40):
submitBtn2 = gr.Button("提交", variant="primary"); submitBtn2.style(size="sm")
resetBtn2 = gr.Button("重置", variant="secondary"); resetBtn2.style(size="sm")
stopBtn2 = gr.Button("停止", variant="secondary"); stopBtn2.style(size="sm")
clearBtn2 = gr.Button("清除", elem_id="elem_clear2", variant="secondary", visible=False); clearBtn2.style(size="sm")
with gr.Floating(init_x="20%", init_y="50%", visible=False, width="40%", drag="top") as area_customize:
with gr.Accordion("自定义菜单", open=True, elem_id="edit-panel"):
with gr.Row() as row:
with gr.Column(scale=10):
AVAIL_BTN = [btn for btn in customize_btns.keys()] + [k for k in functional]
basic_btn_dropdown = gr.Dropdown(AVAIL_BTN, value="自定义按钮1", label="选择一个需要自定义基础功能区按钮").style(container=False)
basic_fn_title = gr.Textbox(show_label=False, placeholder="输入新按钮名称", lines=1).style(container=False)
basic_fn_prefix = gr.Textbox(show_label=False, placeholder="输入新提示前缀", lines=4).style(container=False)
basic_fn_suffix = gr.Textbox(show_label=False, placeholder="输入新提示后缀", lines=4).style(container=False)
with gr.Column(scale=1, min_width=70):
basic_fn_confirm = gr.Button("确认并保存", variant="primary"); basic_fn_confirm.style(size="sm")
basic_fn_clean = gr.Button("恢复默认", variant="primary"); basic_fn_clean.style(size="sm")
def assign_btn(persistent_cookie_, cookies_, basic_btn_dropdown_, basic_fn_title, basic_fn_prefix, basic_fn_suffix, clean_up=False):
ret = {}
# 读取之前的自定义按钮
customize_fn_overwrite_ = cookies_['customize_fn_overwrite']
# 更新新的自定义按钮
customize_fn_overwrite_.update({
basic_btn_dropdown_:
{
"Title":basic_fn_title,
"Prefix":basic_fn_prefix,
"Suffix":basic_fn_suffix,
}
}
)
if clean_up:
customize_fn_overwrite_ = {}
cookies_.update(customize_fn_overwrite_) # 更新cookie
visible = (not clean_up) and (basic_fn_title != "")
if basic_btn_dropdown_ in customize_btns:
# 是自定义按钮,不是预定义按钮
ret.update({customize_btns[basic_btn_dropdown_]: gr.update(visible=visible, value=basic_fn_title)})
else:
# 是预定义按钮
ret.update({predefined_btns[basic_btn_dropdown_]: gr.update(visible=visible, value=basic_fn_title)})
ret.update({cookies: cookies_})
try: persistent_cookie_ = from_cookie_str(persistent_cookie_) # persistent cookie to dict
except: persistent_cookie_ = {}
persistent_cookie_["custom_bnt"] = customize_fn_overwrite_ # dict update new value
persistent_cookie_ = to_cookie_str(persistent_cookie_) # persistent cookie to dict
ret.update({py_pickle_cookie: persistent_cookie_}) # write persistent cookie
return ret
# update btn
h = basic_fn_confirm.click(assign_btn, [py_pickle_cookie, cookies, basic_btn_dropdown, basic_fn_title, basic_fn_prefix, basic_fn_suffix],
[py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()])
h.then(None, [py_pickle_cookie], None, _js="""(py_pickle_cookie)=>{setCookie("py_pickle_cookie", py_pickle_cookie, 365);}""")
# clean up btn
h2 = basic_fn_clean.click(assign_btn, [py_pickle_cookie, cookies, basic_btn_dropdown, basic_fn_title, basic_fn_prefix, basic_fn_suffix, gr.State(True)],
[py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()])
h2.then(None, [py_pickle_cookie], None, _js="""(py_pickle_cookie)=>{setCookie("py_pickle_cookie", py_pickle_cookie, 365);}""")
def persistent_cookie_reload(persistent_cookie_, cookies_):
ret = {}
for k in customize_btns:
ret.update({customize_btns[k]: gr.update(visible=False, value="")})
try: persistent_cookie_ = from_cookie_str(persistent_cookie_) # persistent cookie to dict
except: return ret
customize_fn_overwrite_ = persistent_cookie_.get("custom_bnt", {})
cookies_['customize_fn_overwrite'] = customize_fn_overwrite_
ret.update({cookies: cookies_})
for k,v in persistent_cookie_["custom_bnt"].items():
if v['Title'] == "": continue
if k in customize_btns: ret.update({customize_btns[k]: gr.update(visible=True, value=v['Title'])})
else: ret.update({predefined_btns[k]: gr.update(visible=True, value=v['Title'])})
return ret
# 功能区显示开关与功能区的互动
def fn_area_visibility(a):
ret = {}
ret.update({area_input_primary: gr.update(visible=("浮动输入区" not in a))})
ret.update({area_input_secondary: gr.update(visible=("浮动输入区" in a))})
ret.update({plugin_advanced_arg: gr.update(visible=("插件参数区" in a))})
if "浮动输入区" in a: ret.update({txt: gr.update(value="")})
return ret
checkboxes.select(fn_area_visibility, [checkboxes], [area_basic_fn, area_crazy_fn, area_input_primary, area_input_secondary, txt, txt2, plugin_advanced_arg] )
checkboxes.select(None, [checkboxes], None, _js=js_code_show_or_hide)
# 功能区显示开关与功能区的互动
def fn_area_visibility_2(a):
ret = {}
ret.update({area_customize: gr.update(visible=("自定义菜单" in a))})
return ret
checkboxes_2.select(fn_area_visibility_2, [checkboxes_2], [area_customize] )
checkboxes_2.select(None, [checkboxes_2], None, _js=js_code_show_or_hide_group2)
# 整理反复出现的控件句柄组合
input_combo = [cookies, max_length_sl, md_dropdown, txt, txt2, top_p, temperature, chatbot, history, system_prompt, plugin_advanced_arg]
output_combo = [cookies, chatbot, history, status]
predict_args = dict(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True)], outputs=output_combo)
# 提交按钮、重置按钮
cancel_handles.append(txt.submit(**predict_args))
cancel_handles.append(txt2.submit(**predict_args))
cancel_handles.append(submitBtn.click(**predict_args))
cancel_handles.append(submitBtn2.click(**predict_args))
resetBtn.click(None, None, [chatbot, history, status], _js=js_code_reset) # 先在前端快速清除chatbot&status
resetBtn2.click(None, None, [chatbot, history, status], _js=js_code_reset) # 先在前端快速清除chatbot&status
resetBtn.click(lambda: ([], [], "已重置"), None, [chatbot, history, status]) # 再在后端清除history
resetBtn2.click(lambda: ([], [], "已重置"), None, [chatbot, history, status]) # 再在后端清除history
clearBtn.click(None, None, [txt, txt2], _js=js_code_clear)
clearBtn2.click(None, None, [txt, txt2], _js=js_code_clear)
if AUTO_CLEAR_TXT:
submitBtn.click(None, None, [txt, txt2], _js=js_code_clear)
submitBtn2.click(None, None, [txt, txt2], _js=js_code_clear)
txt.submit(None, None, [txt, txt2], _js=js_code_clear)
txt2.submit(None, None, [txt, txt2], _js=js_code_clear)
# 基础功能区的回调函数注册
for k in functional:
if ("Visible" in functional[k]) and (not functional[k]["Visible"]): continue
click_handle = functional[k]["Button"].click(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True), gr.State(k)], outputs=output_combo)
cancel_handles.append(click_handle)
for btn in customize_btns.values():
click_handle = btn.click(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True), gr.State(btn.value)], outputs=output_combo)
cancel_handles.append(click_handle)
# 文件上传区接收文件后与chatbot的互动
file_upload.upload(on_file_uploaded, [file_upload, chatbot, txt, txt2, checkboxes, cookies], [chatbot, txt, txt2, cookies]).then(None, None, None, _js=r"()=>{toast_push('上传完毕 ...'); cancel_loading_status();}")
file_upload_2.upload(on_file_uploaded, [file_upload_2, chatbot, txt, txt2, checkboxes, cookies], [chatbot, txt, txt2, cookies]).then(None, None, None, _js=r"()=>{toast_push('上传完毕 ...'); cancel_loading_status();}")
# 函数插件-固定按钮区
for k in plugins:
if not plugins[k].get("AsButton", True): continue
click_handle = plugins[k]["Button"].click(ArgsGeneralWrapper(plugins[k]["Function"]), [*input_combo], output_combo)
click_handle.then(on_report_generated, [cookies, file_upload, chatbot], [cookies, file_upload, chatbot])
cancel_handles.append(click_handle)
# 函数插件-下拉菜单与随变按钮的互动
def on_dropdown_changed(k):
variant = plugins[k]["Color"] if "Color" in plugins[k] else "secondary"
info = plugins[k].get("Info", k)
ret = {switchy_bt: gr.update(value=k, variant=variant, info_str=f'函数插件区: {info}')}
if plugins[k].get("AdvancedArgs", False): # 是否唤起高级插件参数区
ret.update({plugin_advanced_arg: gr.update(visible=True, label=f"插件[{k}]的高级参数说明:" + plugins[k].get("ArgsReminder", [f"没有提供高级参数功能说明"]))})
else:
ret.update({plugin_advanced_arg: gr.update(visible=False, label=f"插件[{k}]不需要高级参数。")})
return ret
dropdown.select(on_dropdown_changed, [dropdown], [switchy_bt, plugin_advanced_arg] )
def on_md_dropdown_changed(k):
return {chatbot: gr.update(label="当前模型:"+k)}
md_dropdown.select(on_md_dropdown_changed, [md_dropdown], [chatbot] )
def on_theme_dropdown_changed(theme, secret_css):
adjust_theme, css_part1, _, adjust_dynamic_theme = load_dynamic_theme(theme)
if adjust_dynamic_theme:
css_part2 = adjust_dynamic_theme._get_theme_css()
else:
css_part2 = adjust_theme()._get_theme_css()
return css_part2 + css_part1
theme_handle = theme_dropdown.select(on_theme_dropdown_changed, [theme_dropdown, secret_css], [secret_css])
theme_handle.then(
None,
[secret_css],
None,
_js=js_code_for_css_changing
)
# 随变按钮的回调函数注册
def route(request: gr.Request, k, *args, **kwargs):
if k in [r"打开插件列表", r"请先从插件列表中选择"]: return
yield from ArgsGeneralWrapper(plugins[k]["Function"])(request, *args, **kwargs)
click_handle = switchy_bt.click(route,[switchy_bt, *input_combo], output_combo)
click_handle.then(on_report_generated, [cookies, file_upload, chatbot], [cookies, file_upload, chatbot])
cancel_handles.append(click_handle)
# 终止按钮的回调函数注册
stopBtn.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles)
stopBtn2.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles)
plugins_as_btn = {name:plugin for name, plugin in plugins.items() if plugin.get('Button', None)}
def on_group_change(group_list):
btn_list = []
fns_list = []
if not group_list: # 处理特殊情况:没有选择任何插件组
return [*[plugin['Button'].update(visible=False) for _, plugin in plugins_as_btn.items()], gr.Dropdown.update(choices=[])]
for k, plugin in plugins.items():
if plugin.get("AsButton", True):
btn_list.append(plugin['Button'].update(visible=match_group(plugin['Group'], group_list))) # 刷新按钮
if plugin.get('AdvancedArgs', False): dropdown_fn_list.append(k) # 对于需要高级参数的插件,亦在下拉菜单中显示
elif match_group(plugin['Group'], group_list): fns_list.append(k) # 刷新下拉列表
return [*btn_list, gr.Dropdown.update(choices=fns_list)]
plugin_group_sel.select(fn=on_group_change, inputs=[plugin_group_sel], outputs=[*[plugin['Button'] for name, plugin in plugins_as_btn.items()], dropdown])
if ENABLE_AUDIO:
from crazy_functions.live_audio.audio_io import RealtimeAudioDistribution
rad = RealtimeAudioDistribution()
def deal_audio(audio, cookies):
rad.feed(cookies['uuid'].hex, audio)
audio_mic.stream(deal_audio, inputs=[audio_mic, cookies])
demo.load(init_cookie, inputs=[cookies], outputs=[cookies])
demo.load(persistent_cookie_reload, inputs = [py_pickle_cookie, cookies],
outputs = [py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()], _js=js_code_for_persistent_cookie_init)
demo.load(None, inputs=[dark_mode], outputs=None, _js="""(dark_mode)=>{apply_cookie_for_checkbox(dark_mode);}""") # 配置暗色主题或亮色主题
demo.load(None, inputs=[gr.Textbox(LAYOUT, visible=False)], outputs=None, _js='(LAYOUT)=>{GptAcademicJavaScriptInit(LAYOUT);}')
# gradio的inbrowser触发不太稳定回滚代码到原始的浏览器打开函数
def run_delayed_tasks():
import threading, webbrowser, time
print(f"如果浏览器没有自动打开请复制并转到以下URL")
if DARK_MODE: print(f"\t「暗色主题已启用(支持动态切换主题)」: http://localhost:{PORT}")
else: print(f"\t「亮色主题已启用(支持动态切换主题)」: http://localhost:{PORT}")
def auto_updates(): time.sleep(0); auto_update()
def open_browser(): time.sleep(2); webbrowser.open_new_tab(f"http://localhost:{PORT}")
def warm_up_mods(): time.sleep(6); warm_up_modules()
threading.Thread(target=auto_updates, name="self-upgrade", daemon=True).start() # 查看自动更新
threading.Thread(target=open_browser, name="open-browser", daemon=True).start() # 打开浏览器页面
threading.Thread(target=warm_up_mods, name="warm-up", daemon=True).start() # 预热tiktoken模块
run_delayed_tasks()
demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", share=False, favicon_path="docs/logo.png", blocked_paths=["config.py","config_private.py","docker-compose.yml","Dockerfile"])
# 如果需要在二级路径下运行
# CUSTOM_PATH = get_conf('CUSTOM_PATH')
# if CUSTOM_PATH != "/":
# from toolbox import run_gradio_in_subpath
# run_gradio_in_subpath(demo, auth=AUTHENTICATION, port=PORT, custom_path=CUSTOM_PATH)
# else:
# demo.launch(server_name="0.0.0.0", server_port=PORT, auth=AUTHENTICATION, favicon_path="docs/logo.png",
# blocked_paths=["config.py","config_private.py","docker-compose.yml","Dockerfile",f"{PATH_LOGGING}/admin"])
if __name__ == "__main__":
main()

View File

@@ -1,77 +1,37 @@
from loguru import logger
def check_proxy(proxies, return_ip=False):
"""
检查代理配置并返回结果。
Args:
proxies (dict): 包含http和https代理配置的字典。
return_ip (bool, optional): 是否返回代理的IP地址。默认为False。
Returns:
str or None: 检查的结果信息或代理的IP地址如果`return_ip`为True
"""
def check_proxy(proxies):
import requests
proxies_https = proxies['https'] if proxies is not None else ''
ip = None
try:
response = requests.get("https://ipapi.co/json/", proxies=proxies, timeout=4) # ⭐ 执行GET请求以获取代理信息
response = requests.get("https://ipapi.co/json/", proxies=proxies, timeout=4)
data = response.json()
if 'country_name' in data:
country = data['country_name']
result = f"代理配置 {proxies_https}, 代理所在地:{country}"
if 'ip' in data:
ip = data['ip']
elif 'error' in data:
alternative, ip = _check_with_backup_source(proxies) # ⭐ 调用备用方法检查代理配置
alternative = _check_with_backup_source(proxies)
if alternative is None:
result = f"代理配置 {proxies_https}, 代理所在地未知IP查询频率受限"
else:
result = f"代理配置 {proxies_https}, 代理所在地:{alternative}"
else:
result = f"代理配置 {proxies_https}, 代理数据解析失败:{data}"
if not return_ip:
logger.warning(result)
print(result)
return result
else:
return ip
except:
result = f"代理配置 {proxies_https}, 代理所在地查询超时,代理可能无效"
if not return_ip:
logger.warning(result)
print(result)
return result
else:
return ip
def _check_with_backup_source(proxies):
"""
通过备份源检查代理,并获取相应信息。
Args:
proxies (dict): 包含代理信息的字典。
Returns:
tuple: 代理信息(geo)和IP地址(ip)的元组。
"""
import random, string, requests
random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=32))
try:
res_json = requests.get(f"http://{random_string}.edns.ip-api.com/json", proxies=proxies, timeout=4).json() # ⭐ 执行代理检查和备份源请求
return res_json['dns']['geo'], res_json['dns']['ip']
except:
return None, None
try: return requests.get(f"http://{random_string}.edns.ip-api.com/json", proxies=proxies, timeout=4).json()['dns']['geo']
except: return None
def backup_and_download(current_version, remote_version):
"""
一键更新协议:备份当前版本,下载远程版本并解压缩。
Args:
current_version (str): 当前版本号。
remote_version (str): 远程版本号。
Returns:
str: 新版本目录的路径。
一键更新协议:备份和下载
"""
from toolbox import get_conf
import shutil
@@ -87,8 +47,8 @@ def backup_and_download(current_version, remote_version):
shutil.copytree('./', backup_dir, ignore=lambda x, y: ['history'])
proxies = get_conf('proxies')
try: r = requests.get('https://github.com/binary-husky/chatgpt_academic/archive/refs/heads/master.zip', proxies=proxies, stream=True)
except: r = requests.get('https://public.agent-matrix.com/publish/master.zip', proxies=proxies, stream=True)
zip_file_path = backup_dir+'/master.zip' # ⭐ 保存备份文件的路径
except: r = requests.get('https://public.gpt-academic.top/publish/master.zip', proxies=proxies, stream=True)
zip_file_path = backup_dir+'/master.zip'
with open(zip_file_path, 'wb+') as f:
f.write(r.content)
dst_path = new_version_dir
@@ -104,17 +64,6 @@ def backup_and_download(current_version, remote_version):
def patch_and_restart(path):
"""
一键更新协议:覆盖和重启
Args:
path (str): 新版本代码所在的路径
注意事项:
如果您的程序没有使用config_private.py私密配置文件则会将config.py重命名为config_private.py以避免配置丢失。
更新流程:
- 复制最新版本代码到当前目录
- 更新pip包依赖
- 如果更新失败,则提示手动安装依赖库并重启
"""
from distutils import dir_util
import shutil
@@ -122,44 +71,33 @@ def patch_and_restart(path):
import sys
import time
import glob
from shared_utils.colorful import log亮黄, log亮绿, log亮红
from colorful import print亮黄, print亮绿, print亮红
# if not using config_private, move origin config.py as config_private.py
if not os.path.exists('config_private.py'):
log亮黄('由于您没有设置config_private.py私密配置现将您的现有配置移动至config_private.py以防止配置丢失',
print亮黄('由于您没有设置config_private.py私密配置现将您的现有配置移动至config_private.py以防止配置丢失',
'另外您可以随时在history子文件夹下找回旧版的程序。')
shutil.copyfile('config.py', 'config_private.py')
path_new_version = glob.glob(path + '/*-master')[0]
dir_util.copy_tree(path_new_version, './') # ⭐ 将最新版本代码复制到当前目录
log亮绿('代码已经更新即将更新pip包依赖……')
for i in reversed(range(5)): time.sleep(1); log亮绿(i)
dir_util.copy_tree(path_new_version, './')
print亮绿('代码已经更新即将更新pip包依赖……')
for i in reversed(range(5)): time.sleep(1); print(i)
try:
import subprocess
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
except:
log亮红('pip包依赖安装出现问题需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。')
log亮绿('更新完成您可以随时在history子文件夹下找回旧版的程序5s之后重启')
log亮红('假如重启失败,您可能需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。')
log亮绿(' ------------------------------ -----------------------------------')
for i in reversed(range(8)): time.sleep(1); log亮绿(i)
os.execl(sys.executable, sys.executable, *sys.argv) # 重启程序
print亮红('pip包依赖安装出现问题需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。')
print亮绿('更新完成您可以随时在history子文件夹下找回旧版的程序5s之后重启')
print亮红('假如重启失败,您可能需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。')
print(' ------------------------------ -----------------------------------')
for i in reversed(range(8)): time.sleep(1); print(i)
os.execl(sys.executable, sys.executable, *sys.argv)
def get_current_version():
"""
获取当前的版本号。
Returns:
str: 当前的版本号。如果无法获取版本号,则返回空字符串。
"""
import json
try:
with open('./version', 'r', encoding='utf8') as f:
current_version = json.loads(f.read())['version'] # ⭐ 从读取的json数据中提取版本号
current_version = json.loads(f.read())['version']
except:
current_version = ""
return current_version
@@ -168,12 +106,6 @@ def get_current_version():
def auto_update(raise_error=False):
"""
一键更新协议:查询版本和用户意见
Args:
raise_error (bool, optional): 是否在出错时抛出错误。默认为 False。
Returns:
None
"""
try:
from toolbox import get_conf
@@ -181,7 +113,7 @@ def auto_update(raise_error=False):
import json
proxies = get_conf('proxies')
try: response = requests.get("https://raw.githubusercontent.com/binary-husky/chatgpt_academic/master/version", proxies=proxies, timeout=5)
except: response = requests.get("https://public.agent-matrix.com/publish/version", proxies=proxies, timeout=5)
except: response = requests.get("https://public.gpt-academic.top/publish/version", proxies=proxies, timeout=5)
remote_json_data = json.loads(response.text)
remote_version = remote_json_data['version']
if remote_json_data["show_feature"]:
@@ -192,22 +124,22 @@ def auto_update(raise_error=False):
current_version = f.read()
current_version = json.loads(current_version)['version']
if (remote_version - current_version) >= 0.01-1e-5:
from shared_utils.colorful import log亮黄
log亮黄(f'\n新版本可用。新版本:{remote_version},当前版本:{current_version}{new_feature}') # ⭐ 在控制台打印新版本信息
logger.info('1Github更新地址:\nhttps://github.com/binary-husky/chatgpt_academic\n')
from colorful import print亮黄
print亮黄(f'\n新版本可用。新版本:{remote_version},当前版本:{current_version}{new_feature}')
print('1Github更新地址:\nhttps://github.com/binary-husky/chatgpt_academic\n')
user_instruction = input('2是否一键更新代码Y+回车=确认,输入其他/无输入+回车=不更新)?')
if user_instruction in ['Y', 'y']:
path = backup_and_download(current_version, remote_version) # ⭐ 备份并下载文件
path = backup_and_download(current_version, remote_version)
try:
patch_and_restart(path) # ⭐ 执行覆盖并重启操作
patch_and_restart(path)
except:
msg = '更新失败。'
if raise_error:
from toolbox import trimmed_format_exc
msg += trimmed_format_exc()
logger.warning(msg)
print(msg)
else:
logger.info('自动更新程序:已禁用')
print('自动更新程序:已禁用')
return
else:
return
@@ -216,13 +148,10 @@ def auto_update(raise_error=False):
if raise_error:
from toolbox import trimmed_format_exc
msg += trimmed_format_exc()
logger.info(msg)
print(msg)
def warm_up_modules():
"""
预热模块,加载特定模块并执行预热操作。
"""
logger.info('正在执行一些模块的预热 ...')
print('正在执行一些模块的预热 ...')
from toolbox import ProxyNetworkActivate
from request_llms.bridge_all import model_info
with ProxyNetworkActivate("Warmup_Modules"):
@@ -232,17 +161,7 @@ def warm_up_modules():
enc.encode("模块预热", disallowed_special=())
def warm_up_vectordb():
"""
执行一些模块的预热操作。
本函数主要用于执行一些模块的预热操作,确保在后续的流程中能够顺利运行。
⭐ 关键作用:预热模块
Returns:
None
"""
logger.info('正在执行一些模块的预热 ...')
print('正在执行一些模块的预热 ...')
from toolbox import ProxyNetworkActivate
with ProxyNetworkActivate("Warmup_Modules"):
import nltk

View File

@@ -1,6 +1,5 @@
import platform
from sys import stdout
from loguru import logger
if platform.system()=="Linux":
pass
@@ -60,29 +59,3 @@ def sprint亮紫(*kw):
return "\033[1;35m"+' '.join(kw)+"\033[0m"
def sprint亮靛(*kw):
return "\033[1;36m"+' '.join(kw)+"\033[0m"
def log红(*kw,**kargs):
logger.opt(depth=1).info(sprint红(*kw))
def log绿(*kw,**kargs):
logger.opt(depth=1).info(sprint绿(*kw))
def log黄(*kw,**kargs):
logger.opt(depth=1).info(sprint黄(*kw))
def log蓝(*kw,**kargs):
logger.opt(depth=1).info(sprint蓝(*kw))
def log紫(*kw,**kargs):
logger.opt(depth=1).info(sprint紫(*kw))
def log靛(*kw,**kargs):
logger.opt(depth=1).info(sprint靛(*kw))
def log亮红(*kw,**kargs):
logger.opt(depth=1).info(sprint亮红(*kw))
def log亮绿(*kw,**kargs):
logger.opt(depth=1).info(sprint亮绿(*kw))
def log亮黄(*kw,**kargs):
logger.opt(depth=1).info(sprint亮黄(*kw))
def log亮蓝(*kw,**kargs):
logger.opt(depth=1).info(sprint亮蓝(*kw))
def log亮紫(*kw,**kargs):
logger.opt(depth=1).info(sprint亮紫(*kw))
def log亮靛(*kw,**kargs):
logger.opt(depth=1).info(sprint亮靛(*kw))

155
config.py
View File

@@ -11,6 +11,10 @@
API_KEY = "此处填API密钥" # 可同时填写多个API-KEY用英文逗号分割例如API_KEY = "sk-openaikey1,sk-openaikey2,fkxxxx-api2dkey3,azure-apikey4"
# [step 1]>> API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"。极少数情况下还需要填写组织格式如org-123456789abcdefghijklmno的请向下翻找 API_ORG 设置项
API_KEY = "此处填API密钥" # 可同时填写多个API-KEY用英文逗号分割例如API_KEY = "sk-openaikey1,sk-openaikey2,fkxxxx-api2dkey3,azure-apikey4"
# [step 2]>> 改为True应用代理如果直接在海外服务器部署此处不修改如果使用本地或无地域限制的大模型时此处也不需要修改
USE_PROXY = False
if USE_PROXY:
@@ -30,44 +34,11 @@ if USE_PROXY:
else:
proxies = None
# [step 3]>> 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
LLM_MODEL = "gpt-3.5-turbo-16k" # 可选 ↓↓↓
AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-preview",
"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4-turbo-2024-04-09",
"gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5",
"gpt-4", "gpt-4-32k", "azure-gpt-4", "glm-4", "glm-4v", "glm-3-turbo",
"gemini-1.5-pro", "chatglm3"
]
EMBEDDING_MODEL = "text-embedding-3-small"
# --- --- --- ---
# P.S. 其他可用的模型还包括
# AVAIL_LLM_MODELS = [
# "glm-4-0520", "glm-4-air", "glm-4-airx", "glm-4-flash",
# "qianfan", "deepseekcoder",
# "spark", "sparkv2", "sparkv3", "sparkv3.5", "sparkv4",
# "qwen-turbo", "qwen-plus", "qwen-max", "qwen-local",
# "moonshot-v1-128k", "moonshot-v1-32k", "moonshot-v1-8k",
# "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-0125", "gpt-4o-2024-05-13"
# "claude-3-haiku-20240307","claude-3-sonnet-20240229","claude-3-opus-20240229", "claude-2.1", "claude-instant-1.2",
# "moss", "llama2", "chatglm_onnx", "internlm", "jittorllms_pangualpha", "jittorllms_llama",
# "deepseek-chat" ,"deepseek-coder",
# "gemini-1.5-flash",
# "yi-34b-chat-0205","yi-34b-chat-200k","yi-large","yi-medium","yi-spark","yi-large-turbo","yi-large-preview",
# ]
# --- --- --- ---
# 此外您还可以在接入one-api/vllm/ollama/Openroute时
# 使用"one-api-*","vllm-*","ollama-*","openrouter-*"前缀直接使用非标准方式接入的模型,例如
# AVAIL_LLM_MODELS = ["one-api-claude-3-sonnet-20240229(max_token=100000)", "ollama-phi3(max_token=4096)","openrouter-openai/gpt-4o-mini","openrouter-openai/chatgpt-4o-latest"]
# --- --- --- ---
# --------------- 以下配置可以优化体验 ---------------
# ------------------------------------ 以下配置可以优化体验, 但大部分场合下并不需要修改 ------------------------------------
# 重新URL重新定向实现更换API_URL的作用高危设置! 常规情况下不要修改! 通过修改此设置您将把您的API-KEY和对话隐私完全暴露给您设定的中间人
# 格式: API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "在这里填写重定向的api.openai.com的URL"}
# 举例: API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "https://reverse-proxy-url/v1/chat/completions", "http://localhost:11434/api/chat": "在这里填写您ollama的URL"}
# 举例: API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "https://reverse-proxy-url/v1/chat/completions"}
API_URL_REDIRECT = {}
@@ -78,7 +49,7 @@ DEFAULT_WORKER_NUM = 3
# 色彩主题, 可选 ["Default", "Chuanhu-Small-and-Beautiful", "High-Contrast"]
# 更多主题, 请查阅Gradio主题商店: https://huggingface.co/spaces/gradio/theme-gallery 可选 ["Gstaff/Xkcd", "NoCrypt/Miku", ...]
THEME = "Default"
THEME = "Chuanhu-Small-and-Beautiful"
AVAIL_THEMES = ["Default", "Chuanhu-Small-and-Beautiful", "High-Contrast", "Gstaff/Xkcd", "NoCrypt/Miku"]
@@ -99,7 +70,7 @@ LAYOUT = "LEFT-RIGHT" # "LEFT-RIGHT"(左右布局) # "TOP-DOWN"(上下
# 暗色模式 / 亮色模式
DARK_MODE = True
DARK_MODE = False
# 发送请求到OpenAI后等待多久判定为超时
@@ -110,18 +81,31 @@ TIMEOUT_SECONDS = 30
WEB_PORT = -1
# 是否自动打开浏览器页面
AUTO_OPEN_BROWSER = True
# 如果OpenAI不响应网络卡顿、代理失败、KEY失效重试的次数限制
MAX_RETRY = 2
# OpenAI模型选择是gpt4现在只对申请成功的人开放
LLM_MODEL = "gpt-3.5-turbo" # 可选 "chatglm"
AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "api2d-gpt-3.5-turbo", "spark", "azure-gpt-3.5"]
# 插件分类默认选项
DEFAULT_FN_GROUPS = ['对话', '编程', '学术', '智能体']
# 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
LLM_MODEL = "gpt-3.5-turbo-16k" # 可选 ↓↓↓
AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-preview",
"gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5",
"gpt-4", "gpt-4-32k", "azure-gpt-4", "glm-4", "glm-3-turbo",
"gemini-pro", "chatglm3", "claude-2"]
# P.S. 其他可用的模型还包括 [
# "moss", "qwen-turbo", "qwen-plus", "qwen-max"
# "zhipuai", "qianfan", "deepseekcoder", "llama2", "qwen-local", "gpt-3.5-turbo-0613",
# "gpt-3.5-turbo-16k-0613", "gpt-3.5-random", "api2d-gpt-3.5-turbo", 'api2d-gpt-3.5-turbo-16k',
# "spark", "sparkv2", "sparkv3", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_pangualpha", "jittorllms_llama"
# ]
# 定义界面上“询问多个GPT模型”插件应该使用哪些模型请从AVAIL_LLM_MODELS中选择并在不同模型之间用`&`间隔,例如"gpt-3.5-turbo&chatglm3&azure-gpt-4"
MULTI_QUERY_LLM_MODELS = "gpt-3.5-turbo&chatglm3"
@@ -139,7 +123,7 @@ DASHSCOPE_API_KEY = "" # 阿里灵积云API_KEY
# 百度千帆LLM_MODEL="qianfan"
BAIDU_CLOUD_API_KEY = ''
BAIDU_CLOUD_SECRET_KEY = ''
BAIDU_CLOUD_QIANFAN_MODEL = 'ERNIE-Bot' # 可选 "ERNIE-Bot-4"(文心大模型4.0), "ERNIE-Bot"(文心一言), "ERNIE-Bot-turbo", "BLOOMZ-7B", "Llama-2-70B-Chat", "Llama-2-13B-Chat", "Llama-2-7B-Chat", "ERNIE-Speed-128K", "ERNIE-Speed-8K", "ERNIE-Lite-8K"
BAIDU_CLOUD_QIANFAN_MODEL = 'ERNIE-Bot' # 可选 "ERNIE-Bot-4"(文心大模型4.0), "ERNIE-Bot"(文心一言), "ERNIE-Bot-turbo", "BLOOMZ-7B", "Llama-2-70B-Chat", "Llama-2-13B-Chat", "Llama-2-7B-Chat"
# 如果使用ChatGLM2微调模型请把 LLM_MODEL="chatglmft",并在此处指定模型路径
@@ -150,7 +134,6 @@ CHATGLM_PTUNING_CHECKPOINT = "" # 例如"/home/hmp/ChatGLM2-6B/ptuning/output/6b
LOCAL_MODEL_DEVICE = "cpu" # 可选 "cuda"
LOCAL_MODEL_QUANT = "FP16" # 默认 "FP16" "INT4" 启用量化INT4版本 "INT8" 启用量化INT8版本
# 设置gradio的并行线程数不需要修改
CONCURRENT_COUNT = 100
@@ -160,7 +143,7 @@ AUTO_CLEAR_TXT = False
# 加一个live2d装饰
ADD_WAIFU = False
ADD_WAIFU = True
# 设置用户名和密码不需要修改相关功能不稳定与gradio版本和网络都相关如果本地使用不建议加这个
@@ -168,8 +151,7 @@ ADD_WAIFU = False
AUTHENTICATION = []
# 如果需要在二级路径下运行(常规情况下,不要修改!!
# (举例 CUSTOM_PATH = "/gpt_academic",可以让软件运行在 http://ip:port/gpt_academic/ 下。)
# 如果需要在二级路径下运行(常规情况下,不要修改!!需要配合修改main.py才能生效!
CUSTOM_PATH = "/"
@@ -197,8 +179,14 @@ AZURE_ENGINE = "填入你亲手写的部署名" # 读 docs\use_azure.
AZURE_CFG_ARRAY = {}
# 阿里云实时语音识别 配置难度较高
# 参考 https://github.com/binary-husky/gpt_academic/blob/master/docs/use_audio.md
# 使用Newbing (不推荐使用,未来将删除)
NEWBING_STYLE = "creative" # ["creative", "balanced", "precise"]
NEWBING_COOKIES = """
put your new bing cookies here
"""
# 阿里云实时语音识别 配置难度较高 仅建议高手用户使用 参考 https://github.com/binary-husky/gpt_academic/blob/master/docs/use_audio.md
ENABLE_AUDIO = False
ALIYUN_TOKEN="" # 例如 f37f30e0f9934c34a992f6f64f7eba4f
ALIYUN_APPKEY="" # 例如 RoPlZrM88DnAFkZK
@@ -206,12 +194,6 @@ ALIYUN_ACCESSKEY="" # (无需填写)
ALIYUN_SECRET="" # (无需填写)
# GPT-SOVITS 文本转语音服务的运行地址(将语言模型的生成文本朗读出来)
TTS_TYPE = "EDGE_TTS" # EDGE_TTS / LOCAL_SOVITS_API / DISABLE
GPT_SOVITS_URL = ""
EDGE_TTS_VOICE = "zh-CN-XiaoxiaoNeural"
# 接入讯飞星火大模型 https://console.xfyun.cn/services/iat
XFYUN_APPID = "00000000"
XFYUN_API_SECRET = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
@@ -223,35 +205,21 @@ ZHIPUAI_API_KEY = ""
ZHIPUAI_MODEL = "" # 此选项已废弃,不再需要填写
# # 火山引擎YUNQUE大模型
# YUNQUE_SECRET_KEY = ""
# YUNQUE_ACCESS_KEY = ""
# YUNQUE_MODEL = ""
# Claude API KEY
ANTHROPIC_API_KEY = ""
# 月之暗面 API KEY
MOONSHOT_API_KEY = ""
# 零一万物(Yi Model) API KEY
YIMODEL_API_KEY = ""
# 深度求索(DeepSeek) API KEY默认请求地址为"https://api.deepseek.com/v1/chat/completions"
DEEPSEEK_API_KEY = ""
# 紫东太初大模型 https://ai-maas.wair.ac.cn
TAICHU_API_KEY = ""
# Mathpix 拥有执行PDF的OCR功能但是需要注册账号
MATHPIX_APPID = ""
MATHPIX_APPKEY = ""
# DOC2X的PDF解析服务注册账号并获取API KEY: https://doc2x.noedgeai.com/login
DOC2X_API_KEY = ""
# 自定义API KEY格式
CUSTOM_API_KEY_PATTERN = ""
@@ -273,10 +241,6 @@ GROBID_URLS = [
]
# Searxng互联网检索服务
SEARXNG_URL = "https://cloud-1.agent-matrix.com/"
# 是否允许通过自然语言描述修改本页的配置,该功能具有一定的危险性,默认关闭
ALLOW_RESET_CONFIG = False
@@ -285,21 +249,21 @@ ALLOW_RESET_CONFIG = False
AUTOGEN_USE_DOCKER = False
# 临时的上传文件夹位置,请尽量不要修改
# 临时的上传文件夹位置,请修改
PATH_PRIVATE_UPLOAD = "private_upload"
# 日志文件夹的位置,请尽量不要修改
# 日志文件夹的位置,请修改
PATH_LOGGING = "gpt_log"
# 存储翻译好的arxiv论文的路径请尽量不要修改
ARXIV_CACHE_DIR = "gpt_log/arxiv_cache"
# 除了连接OpenAI之外还有哪些场合允许使用代理请尽量不要修改
# 除了连接OpenAI之外还有哪些场合允许使用代理请勿修改
WHEN_TO_USE_PROXY = ["Download_LLM", "Download_Gradio_Theme", "Connect_Grobid",
"Warmup_Modules", "Nougat_Download", "AutoGen", "Connect_OpenAI_Embedding"]
"Warmup_Modules", "Nougat_Download", "AutoGen"]
# *实验性功能*: 自动检测并屏蔽失效的KEY请勿使用
BLOCK_INVALID_APIKEY = False
# 启用插件热加载
@@ -309,11 +273,7 @@ PLUGIN_HOT_RELOAD = False
# 自定义按钮的最大数量限制
NUM_CUSTOM_BASIC_BTN = 4
"""
--------------- 配置关联关系说明 ---------------
在线大模型配置关联关系示意图
├── "gpt-3.5-turbo" 等openai模型
@@ -337,7 +297,7 @@ NUM_CUSTOM_BASIC_BTN = 4
│ ├── XFYUN_API_SECRET
│ └── XFYUN_API_KEY
├── "claude-3-opus-20240229" 等claude模型
├── "claude-1-100k" 等claude模型
│ └── ANTHROPIC_API_KEY
├── "stack-claude"
@@ -352,19 +312,15 @@ NUM_CUSTOM_BASIC_BTN = 4
├── "glm-4", "glm-3-turbo", "zhipuai" 智谱AI大模型
│ └── ZHIPUAI_API_KEY
├── "yi-34b-chat-0205", "yi-34b-chat-200k" 等零一万物(Yi Model)大模型
│ └── YIMODEL_API_KEY
├── "qwen-turbo" 等通义千问大模型
│ └── DASHSCOPE_API_KEY
├── "Gemini"
│ └── GEMINI_API_KEY
└── "one-api-...(max_token=...)" 用一种更方便的方式接入one-api多模型管理界面
├── AVAIL_LLM_MODELS
── API_KEY
└── API_URL_REDIRECT
└── "newbing" Newbing接口不再稳定不推荐使用
├── NEWBING_STYLE
── NEWBING_COOKIES
本地大模型示意图
@@ -398,9 +354,6 @@ NUM_CUSTOM_BASIC_BTN = 4
插件在线服务配置依赖关系示意图
├── 互联网检索
│ └── SEARXNG_URL
├── 语音功能
│ ├── ENABLE_AUDIO
│ ├── ALIYUN_TOKEN

View File

@@ -17,7 +17,7 @@ def get_core_functions():
text_show_english=
r"Below is a paragraph from an academic paper. Polish the writing to meet the academic style, "
r"improve the spelling, grammar, clarity, concision and overall readability. When necessary, rewrite the whole sentence. "
r"Firstly, you should provide the polished paragraph (in English). "
r"Firstly, you should provide the polished paragraph. "
r"Secondly, you should list all your modification and explain the reasons to do so in markdown table.",
text_show_chinese=
r"作为一名中文学术论文写作改进助理,你的任务是改进所提供文本的拼写、语法、清晰、简洁和整体可读性,"
@@ -33,19 +33,17 @@ def get_core_functions():
"AutoClearHistory": False,
# [6] 文本预处理 (可选参数,默认 None举例写个函数移除所有的换行符
"PreProcess": None,
# [7] 模型选择 (可选参数。如不设置,则使用当前全局模型;如设置,则用指定模型覆盖全局模型。)
# "ModelOverride": "gpt-3.5-turbo", # 主要用途:强制点击此基础功能按钮时,使用指定的模型。
},
"总结绘制脑图": {
# 前缀,会被加在你的输入之前。例如,用来描述你的要求,例如翻译、解释代码、润色等等
"Prefix": '''"""\n\n''',
"Prefix": r"",
# 后缀,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来
"Suffix":
# dedent() 函数用于去除多行字符串的缩进
dedent("\n\n"+r'''
"""
dedent("\n"+r'''
==============================
使用mermaid flowchart对以上文本进行总结概括上述段落的内容以及内在逻辑关系例如
@@ -59,7 +57,7 @@ def get_core_functions():
C --> |"箭头名2"| F["节点名6"]
```
注意
警告
1使用中文
2节点名字使用引号包裹如["Laptop"]
3`|` 和 `"`之间不要存在空格

View File

@@ -1,66 +1,68 @@
from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效
from toolbox import trimmed_format_exc
from loguru import logger
def get_crazy_functions():
from crazy_functions.读文章写摘要 import 读文章写摘要
from crazy_functions.生成函数注释 import 批量生成函数注释
from crazy_functions.SourceCode_Analyse import 解析项目本身
from crazy_functions.SourceCode_Analyse import 解析一个Python项目
from crazy_functions.SourceCode_Analyse import 解析一个Matlab项目
from crazy_functions.SourceCode_Analyse import 解析一个C项目的头文件
from crazy_functions.SourceCode_Analyse import 解析一个C项目
from crazy_functions.SourceCode_Analyse import 解析一个Golang项目
from crazy_functions.SourceCode_Analyse import 解析一个Rust项目
from crazy_functions.SourceCode_Analyse import 解析一个Java项目
from crazy_functions.SourceCode_Analyse import 解析一个前端项目
from crazy_functions.Arxiv_论文对话 import Arxiv论文对话
from crazy_functions.解析项目源代码 import 解析项目本身
from crazy_functions.解析项目源代码 import 解析一个Python项目
from crazy_functions.解析项目源代码 import 解析一个Matlab项目
from crazy_functions.解析项目源代码 import 解析一个C项目的头文件
from crazy_functions.解析项目源代码 import 解析一个C项目
from crazy_functions.解析项目源代码 import 解析一个Golang项目
from crazy_functions.解析项目源代码 import 解析一个Rust项目
from crazy_functions.解析项目源代码 import 解析一个Java项目
from crazy_functions.解析项目源代码 import 解析一个前端项目
from crazy_functions.高级功能函数模板 import 高阶功能模板函数
from crazy_functions.高级功能函数模板 import Demo_Wrap
from crazy_functions.Latex全文润色 import Latex英文润色
from crazy_functions.询问多个大语言模型 import 同时问询
from crazy_functions.SourceCode_Analyse import 解析一个Lua项目
from crazy_functions.SourceCode_Analyse import 解析一个CSharp项目
from crazy_functions.解析项目源代码 import 解析一个Lua项目
from crazy_functions.解析项目源代码 import 解析一个CSharp项目
from crazy_functions.总结word文档 import 总结word文档
from crazy_functions.解析JupyterNotebook import 解析ipynb文件
from crazy_functions.Conversation_To_File import 载入对话历史存档
from crazy_functions.Conversation_To_File import 对话历史存档
from crazy_functions.Conversation_To_File import Conversation_To_File_Wrap
from crazy_functions.Conversation_To_File import 删除所有本地对话历史记录
from crazy_functions.对话历史存档 import 对话历史存档
from crazy_functions.对话历史存档 import 载入对话历史存档
from crazy_functions.对话历史存档 import 删除所有本地对话历史记录
from crazy_functions.辅助功能 import 清除缓存
from crazy_functions.Markdown_Translate import Markdown英译中
from crazy_functions.批量Markdown翻译 import Markdown英译中
from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
from crazy_functions.PDF_Translate import 批量翻译PDF文档
from crazy_functions.批量文件询问 import 批量文件询问
from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
from crazy_functions.Latex全文润色 import Latex中文润色
from crazy_functions.Latex全文润色 import Latex英文纠错
from crazy_functions.Markdown_Translate import Markdown中译英
from crazy_functions.批量Markdown翻译 import Markdown中译英
from crazy_functions.虚空终端 import 虚空终端
from crazy_functions.生成多种Mermaid图表 import Mermaid_Gen
from crazy_functions.PDF_Translate_Wrap import PDF_Tran
from crazy_functions.Latex_Function import Latex英文纠错加PDF对比
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF
from crazy_functions.Latex_Function_Wrap import Arxiv_Localize
from crazy_functions.Latex_Function_Wrap import PDF_Localize
from crazy_functions.Internet_GPT import 连接网络回答问题
from crazy_functions.Internet_GPT_Wrap import NetworkGPT_Wrap
from crazy_functions.Image_Generate import 图片生成_DALLE2, 图片生成_DALLE3, 图片修改_DALLE2
from crazy_functions.Image_Generate_Wrap import ImageGen_Wrap
from crazy_functions.SourceCode_Comment import 注释Python项目
from crazy_functions.SourceCode_Comment_Wrap import SourceCodeComment_Wrap
from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表
function_plugins = {
"虚空终端": {
"Group": "对话|编程|学术|智能体",
"Color": "stop",
"AsButton": True,
"Info": "使用自然语言实现您的想法",
"Function": HotReload(虚空终端),
},
"解析整个Python项目": {
"Group": "编程",
"Color": "stop",
"AsButton": True,
"Info": "解析一个Python项目的所有源文件(.py) | 输入参数为路径",
"Function": HotReload(解析一个Python项目),
},
"载入对话历史存档(先上传存档或输入路径)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "载入对话历史存档 | 输入参数为路径",
"Function": HotReload(载入对话历史存档),
},
"删除所有本地对话历史记录(谨慎操作)": {
"Group": "对话",
"AsButton": False,
"Info": "删除所有本地对话历史记录,谨慎操作 | 不需要输入参数",
"Function": HotReload(删除所有本地对话历史记录),
},
"清除所有缓存文件(谨慎操作)": {
"Group": "对话",
"Color": "stop",
@@ -68,33 +70,100 @@ def get_crazy_functions():
"Info": "清除所有缓存文件,谨慎操作 | 不需要输入参数",
"Function": HotReload(清除缓存),
},
"Arxiv论文翻译": {
"生成多种Mermaid图表(从当前对话或路径(.pdf/.md/.docx)中生产图表)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info" : "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断",
"Function": HotReload(生成多种Mermaid图表),
"AdvancedArgs": True,
"ArgsReminder": "请输入图类型对应的数字,不输入则为模型自行判断:1-流程图,2-序列图,3-类图,4-饼图,5-甘特图,6-状态图,7-实体关系图,8-象限提示图,9-思维导图",
},
"批量总结Word文档": {
"Group": "学术",
"Color": "stop",
"AsButton": True,
"Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID比如1812.10695",
"Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": Arxiv_Localize, # 新一代插件需要注册Class
"Info": "批量总结word文档 | 输入参数为路径",
"Function": HotReload(总结word文档),
},
"批量文件询问": {
"解析整个Matlab项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"Info": "解析一个Matlab项目的所有源文件(.m) | 输入参数为路径",
"Function": HotReload(解析一个Matlab项目),
},
"解析整个C++项目头文件": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个C++项目的所有头文件(.h/.hpp) | 输入参数为路径",
"Function": HotReload(解析一个C项目的头文件),
},
"解析整个C++项目(.cpp/.hpp/.c/.h": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个C++项目的所有源文件(.cpp/.hpp/.c/.h| 输入参数为路径",
"Function": HotReload(解析一个C项目),
},
"解析整个Go项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个Go项目的所有源文件 | 输入参数为路径",
"Function": HotReload(解析一个Golang项目),
},
"解析整个Rust项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个Rust项目的所有源文件 | 输入参数为路径",
"Function": HotReload(解析一个Rust项目),
},
"解析整个Java项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个Java项目的所有源文件 | 输入参数为路径",
"Function": HotReload(解析一个Java项目),
},
"解析整个前端项目js,ts,css等": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个前端项目的所有源文件js,ts,css等 | 输入参数为路径",
"Function": HotReload(解析一个前端项目),
},
"解析整个Lua项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个Lua项目的所有源文件 | 输入参数为路径",
"Function": HotReload(解析一个Lua项目),
},
"解析整个CSharp项目": {
"Group": "编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "解析一个CSharp项目的所有源文件 | 输入参数为路径",
"Function": HotReload(解析一个CSharp项目),
},
"解析Jupyter Notebook文件": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"Info": "解析Jupyter Notebook文件 | 输入参数为路径",
"Function": HotReload(解析ipynb文件),
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": "若输入0则不解析notebook中的Markdown块", # 高级参数输入区的显示提示
},
"读Tex论文写摘要": {
"Group": "学术",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"Info": "通过在高级参数区写入prompt可自定义询问逻辑默认情况下为总结逻辑 | 输入参数为路径",
"ArgsReminder": r"1、请不要更改上方输入框中以“private_upload/...”开头的路径。 "
r"2、请在下方高级参数区中输入你的prompt文档中的内容将被添加你的prompt后。3、示例“请总结下面的内容此时文档内容将添加在“”后 ",
"Function": HotReload(批量文件询问),
},
"Arxiv论文对话": {
"Group": "学术",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"Info": "在输入区中输入论文ID在高级参数区中输入问题",
"ArgsReminder": r"1、请在输入区中输入arxiv ID。 "
r"2、请在下方高级参数区中输入你的问题示例“这篇文章的方法是什么请用中文回答我” ",
"Function": HotReload(Arxiv论文对话),
"Info": "读取Tex论文并写摘要 | 输入参数为路径",
"Function": HotReload(读文章写摘要),
},
"翻译README或MD": {
"Group": "编程",
@@ -119,42 +188,28 @@ def get_crazy_functions():
},
"保存当前的对话": {
"Group": "对话",
"Color": "stop",
"AsButton": True,
"Info": "保存当前的对话 | 不需要输入参数",
"Function": HotReload(对话历史存档), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": Conversation_To_File_Wrap # 新一代插件需要注册Class
"Function": HotReload(对话历史存档),
},
"[多线程Demo]解析此项目本身(源码自译解)": {
"Group": "对话|编程",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "多线程解析并翻译此项目的源码 | 不需要输入参数",
"Function": HotReload(解析项目本身),
},
"查互联网后回答": {
"Group": "对话",
"Color": "stop",
"AsButton": True, # 加入下拉菜单中
# "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题",
"Function": HotReload(连接网络回答问题),
"Class": NetworkGPT_Wrap # 新一代插件需要注册Class
},
"历史上的今天": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AsButton": True,
"Info": "查看历史上的今天事件 (这是一个面向开发者的插件Demo) | 不需要输入参数",
"Function": None,
"Class": Demo_Wrap, # 新一代插件需要注册Class
"Function": HotReload(高阶功能模板函数),
},
"精准翻译PDF论文": {
"Group": "学术",
"Color": "stop",
"AsButton": True,
"Info": "精准翻译PDF论文为中文 | 输入参数为路径",
"Function": HotReload(批量翻译PDF文档), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": PDF_Tran, # 新一代插件需要注册Class
"Function": HotReload(批量翻译PDF文档),
},
"询问多个GPT模型": {
"Group": "对话",
@@ -229,6 +284,260 @@ def get_crazy_functions():
"Info": "批量将Markdown文件中文翻译为英文 | 输入参数为路径或上传压缩包",
"Function": HotReload(Markdown中译英),
},
}
# -=--=- 尚未充分测试的实验性插件 & 需要额外依赖的插件 -=--=-
try:
from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
function_plugins.update(
{
"一键下载arxiv论文并翻译摘要先在input输入编号如1812.10695": {
"Group": "学术",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
# "Info": "下载arxiv论文并翻译摘要 | 输入参数为arxiv编号如1812.10695",
"Function": HotReload(下载arxiv论文并翻译摘要),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.联网的ChatGPT import 连接网络回答问题
function_plugins.update(
{
"连接网络回答问题(输入问题后点击该插件,需要访问谷歌)": {
"Group": "对话",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
# "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题",
"Function": HotReload(连接网络回答问题),
}
}
)
from crazy_functions.联网的ChatGPT_bing版 import 连接bing搜索回答问题
function_plugins.update(
{
"连接网络回答问题中文Bing版输入问题后点击该插件": {
"Group": "对话",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "连接网络回答问题需要访问中文Bing| 输入参数是一个问题",
"Function": HotReload(连接bing搜索回答问题),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.解析项目源代码 import 解析任意code项目
function_plugins.update(
{
"解析项目源代码(手动指定和筛选源代码文件类型)": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', # 高级参数输入区的显示提示
"Function": HotReload(解析任意code项目),
},
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.询问多个大语言模型 import 同时问询_指定模型
function_plugins.update(
{
"询问多个GPT模型手动指定询问哪些模型": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": "支持任意数量的llm接口用&符号分隔。例如chatglm&gpt-3.5-turbo&gpt-4", # 高级参数输入区的显示提示
"Function": HotReload(同时问询_指定模型),
},
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.图片生成 import 图片生成_DALLE2, 图片生成_DALLE3, 图片修改_DALLE2
function_plugins.update(
{
"图片生成_DALLE2 先切换模型到gpt-*": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": "在这里输入分辨率, 如1024x1024默认支持 256x256, 512x512, 1024x1024", # 高级参数输入区的显示提示
"Info": "使用DALLE2生成图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片生成_DALLE2),
},
}
)
function_plugins.update(
{
"图片生成_DALLE3 先切换模型到gpt-*": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": "在这里输入自定义参数「分辨率-质量(可选)-风格(可选)」, 参数示例「1024x1024-hd-vivid」 || 分辨率支持 「1024x1024」(默认) /「1792x1024」/「1024x1792」 || 质量支持 「-standard」(默认) /「-hd」 || 风格支持 「-vivid」(默认) /「-natural」", # 高级参数输入区的显示提示
"Info": "使用DALLE3生成图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片生成_DALLE3),
},
}
)
function_plugins.update(
{
"图片修改_DALLE2 先切换模型到gpt-*": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": False, # 调用时唤起高级参数输入区默认False
# "Info": "使用DALLE2修改图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片修改_DALLE2),
},
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.总结音视频 import 总结音视频
function_plugins.update(
{
"批量总结音视频(输入路径或上传压缩包)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "调用openai api 使用whisper-1模型, 目前支持的格式:mp4, m4a, wav, mpga, mpeg, mp3。此处可以输入解析提示例如解析为简体中文默认",
"Info": "批量总结音频或视频 | 输入参数为路径",
"Function": HotReload(总结音视频),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.数学动画生成manim import 动画生成
function_plugins.update(
{
"数学动画生成Manim": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "按照自然语言描述生成一个动画 | 输入参数是一段话",
"Function": HotReload(动画生成),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言
function_plugins.update(
{
"Markdown翻译指定翻译成何种语言": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "请输入要翻译成哪种语言默认为Chinese。",
"Function": HotReload(Markdown翻译指定语言),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.知识库问答 import 知识库文件注入
function_plugins.update(
{
"构建知识库(先上传文件素材,再运行此插件)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "此处待注入的知识库名称id, 默认为default。文件进入知识库后可长期保存。可以通过再次调用本插件的方式向知识库追加更多文档。",
"Function": HotReload(知识库文件注入),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.知识库问答 import 读取知识库作答
function_plugins.update(
{
"知识库文件注入(构建知识库后,再运行此插件)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "待提取的知识库名称id, 默认为default, 您需要构建知识库后再运行此插件。",
"Function": HotReload(读取知识库作答),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.交互功能函数模板 import 交互功能模板函数
function_plugins.update(
{
"交互功能模板Demo函数查找wallhaven.cc的壁纸": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Function": HotReload(交互功能模板函数),
}
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.Latex输出PDF import Latex英文纠错加PDF对比
from crazy_functions.Latex输出PDF import Latex翻译中文并重新编译PDF
from crazy_functions.Latex输出PDF import PDF翻译中文并重新编译PDF
function_plugins.update(
{
"Latex英文纠错+高亮修正位置 [需Latex]": {
"Group": "学术",
"Color": "stop",
@@ -237,7 +546,7 @@ def get_crazy_functions():
"ArgsReminder": "如果有必要, 请在此处追加更细致的矫错指令(使用英文)。",
"Function": HotReload(Latex英文纠错加PDF对比),
},
"📚Arxiv论文精细翻译输入arxivID[需Latex]": {
"Arxiv论文精细翻译输入arxivID[需Latex]": {
"Group": "学术",
"Color": "stop",
"AsButton": False,
@@ -246,10 +555,9 @@ def get_crazy_functions():
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
"Info": "Arixv论文精细翻译 | 输入参数arxiv论文的ID比如1812.10695",
"Function": HotReload(Latex翻译中文并重新编译PDF), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": Arxiv_Localize, # 新一代插件需要注册Class
"Function": HotReload(Latex翻译中文并重新编译PDF),
},
"📚本地Latex论文精细翻译上传Latex项目[需Latex]": {
"本地Latex论文精细翻译上传Latex项目[需Latex]": {
"Group": "学术",
"Color": "stop",
"AsButton": False,
@@ -269,247 +577,13 @@ def get_crazy_functions():
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
"Info": "PDF翻译中文并重新编译PDF | 输入参数为路径",
"Function": HotReload(PDF翻译中文并重新编译PDF), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": PDF_Localize # 新一代插件需要注册Class
}
}
function_plugins.update(
{
"🎨图片生成DALLE2/DALLE3, 使用前切换到GPT系列模型": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "使用 DALLE2/DALLE3 生成图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片生成_DALLE2), # 当注册Class后Function旧接口仅会在“虚空终端”中起作用
"Class": ImageGen_Wrap # 新一代插件需要注册Class
},
}
)
function_plugins.update(
{
"🎨图片修改_DALLE2 使用前请切换模型到GPT系列": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": False, # 调用时唤起高级参数输入区默认False
# "Info": "使用DALLE2修改图片 | 输入参数字符串,提供图像的内容",
"Function": HotReload(图片修改_DALLE2),
},
}
)
# -=--=- 尚未充分测试的实验性插件 & 需要额外依赖的插件 -=--=-
try:
from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
function_plugins.update(
{
"一键下载arxiv论文并翻译摘要先在input输入编号如1812.10695": {
"Group": "学术",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
# "Info": "下载arxiv论文并翻译摘要 | 输入参数为arxiv编号如1812.10695",
"Function": HotReload(下载arxiv论文并翻译摘要),
"Function": HotReload(PDF翻译中文并重新编译PDF)
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
# try:
# from crazy_functions.联网的ChatGPT import 连接网络回答问题
# function_plugins.update(
# {
# "连接网络回答问题(输入问题后点击该插件,需要访问谷歌)": {
# "Group": "对话",
# "Color": "stop",
# "AsButton": False, # 加入下拉菜单中
# # "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题",
# "Function": HotReload(连接网络回答问题),
# }
# }
# )
# from crazy_functions.联网的ChatGPT_bing版 import 连接bing搜索回答问题
# function_plugins.update(
# {
# "连接网络回答问题中文Bing版输入问题后点击该插件": {
# "Group": "对话",
# "Color": "stop",
# "AsButton": False, # 加入下拉菜单中
# "Info": "连接网络回答问题需要访问中文Bing| 输入参数是一个问题",
# "Function": HotReload(连接bing搜索回答问题),
# }
# }
# )
# except:
# logger.error(trimmed_format_exc())
# logger.error("Load function plugin failed")
try:
from crazy_functions.SourceCode_Analyse import 解析任意code项目
function_plugins.update(
{
"解析项目源代码(手动指定和筛选源代码文件类型)": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', # 高级参数输入区的显示提示
"Function": HotReload(解析任意code项目),
},
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.询问多个大语言模型 import 同时问询_指定模型
function_plugins.update(
{
"询问多个GPT模型手动指定询问哪些模型": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True, # 调用时唤起高级参数输入区默认False
"ArgsReminder": "支持任意数量的llm接口用&符号分隔。例如chatglm&gpt-3.5-turbo&gpt-4", # 高级参数输入区的显示提示
"Function": HotReload(同时问询_指定模型),
},
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.总结音视频 import 总结音视频
function_plugins.update(
{
"批量总结音视频(输入路径或上传压缩包)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "调用openai api 使用whisper-1模型, 目前支持的格式:mp4, m4a, wav, mpga, mpeg, mp3。此处可以输入解析提示例如解析为简体中文默认",
"Info": "批量总结音频或视频 | 输入参数为路径",
"Function": HotReload(总结音视频),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.数学动画生成manim import 动画生成
function_plugins.update(
{
"数学动画生成Manim": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "按照自然语言描述生成一个动画 | 输入参数是一段话",
"Function": HotReload(动画生成),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.Markdown_Translate import Markdown翻译指定语言
function_plugins.update(
{
"Markdown翻译指定翻译成何种语言": {
"Group": "编程",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "请输入要翻译成哪种语言默认为Chinese。",
"Function": HotReload(Markdown翻译指定语言),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.知识库问答 import 知识库文件注入
function_plugins.update(
{
"构建知识库(先上传文件素材,再运行此插件)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "此处待注入的知识库名称id, 默认为default。文件进入知识库后可长期保存。可以通过再次调用本插件的方式向知识库追加更多文档。",
"Function": HotReload(知识库文件注入),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.知识库问答 import 读取知识库作答
function_plugins.update(
{
"知识库文件注入(构建知识库后,再运行此插件)": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"AdvancedArgs": True,
"ArgsReminder": "待提取的知识库名称id, 默认为default, 您需要构建知识库后再运行此插件。",
"Function": HotReload(读取知识库作答),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.交互功能函数模板 import 交互功能模板函数
function_plugins.update(
{
"交互功能模板Demo函数查找wallhaven.cc的壁纸": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Function": HotReload(交互功能模板函数),
}
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from toolbox import get_conf
@@ -530,8 +604,8 @@ def get_crazy_functions():
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.批量翻译PDF文档_NOUGAT import 批量翻译PDF文档
@@ -547,8 +621,8 @@ def get_crazy_functions():
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.函数动态生成 import 函数动态生成
@@ -564,8 +638,8 @@ def get_crazy_functions():
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.多智能体 import 多智能体终端
@@ -581,8 +655,8 @@ def get_crazy_functions():
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
try:
from crazy_functions.互动小游戏 import 随机小游戏
@@ -598,33 +672,8 @@ def get_crazy_functions():
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
try:
from crazy_functions.Rag_Interface import Rag问答
function_plugins.update(
{
"Rag智能召回": {
"Group": "对话",
"Color": "stop",
"AsButton": False,
"Info": "将问答数据记录到向量库中,作为长期参考。",
"Function": HotReload(Rag问答),
},
}
)
except:
logger.error(trimmed_format_exc())
logger.error("Load function plugin failed")
print(trimmed_format_exc())
print("Load function plugin failed")
# try:
# from crazy_functions.高级功能函数模板 import 测试图表渲染
@@ -637,7 +686,7 @@ def get_crazy_functions():
# }
# })
# except:
# logger.error(trimmed_format_exc())
# print(trimmed_format_exc())
# print('Load function plugin failed')
# try:

View File

@@ -1,573 +0,0 @@
import asyncio
import logging
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from threading import Lock as ThreadLock
from typing import Generator
from typing import List, Dict, Optional
from crazy_functions.crazy_utils import input_clipping
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.rag_fns.arxiv_fns.arxiv_splitter import ArxivSplitter, save_fragments_to_file, process_arxiv_sync
from crazy_functions.rag_fns.arxiv_fns.section_fragment import SectionFragment as Fragment
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker
from toolbox import CatchException, update_ui, get_log_folder, update_ui_lastest_msg
# 全局常量配置
MAX_HISTORY_ROUND = 5 # 最大历史对话轮数
MAX_CONTEXT_TOKEN_LIMIT = 4096 # 上下文最大token数
REMEMBER_PREVIEW = 1000 # 记忆预览长度
VECTOR_STORE_TYPE = "Simple" # 向量存储类型Simple或Milvus
MAX_CONCURRENT_PAPERS = 20 # 最大并行处理论文数
MAX_WORKERS = 3 # 最大工作线程数
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
@dataclass
class ProcessingTask:
"""论文处理任务数据类"""
arxiv_id: str
status: str = "pending" # pending, processing, completed, failed
error: Optional[str] = None
fragments: List[Fragment] = None
start_time: float = field(default_factory=time.time)
class ArxivRagWorker:
def __init__(self, user_name: str, llm_kwargs: Dict, arxiv_id: str = None):
"""初始化ArxivRagWorker"""
self.user_name = user_name
self.llm_kwargs = llm_kwargs
self.arxiv_id = self._normalize_arxiv_id(arxiv_id) if arxiv_id else None
self.fragments = None
# 初始化基础目录
self.base_dir = Path(get_log_folder( plugin_name='arxiv_rag_cache'))
self._setup_directories()
# 初始化处理状态
# 线程安全的计数器和集合
self._processing_lock = ThreadLock()
self._processed_fragments = set()
self._processed_count = 0
# 优化的线程池配置
cpu_count = os.cpu_count() or 1
self.thread_pool = ThreadPoolExecutor(
max_workers=min(32, cpu_count * 4),
thread_name_prefix="arxiv_worker"
)
# 批处理配置
self._batch_size = min(20, cpu_count * 2) # 动态设置批大小
self.max_concurrent_papers = MAX_CONCURRENT_PAPERS
self._semaphore = None
self._loop = None
# 初始化处理队列
self.processing_queue = {}
# 初始化工作组件
self._init_workers()
def _setup_directories(self):
"""设置工作目录"""
if self.arxiv_id:
self.checkpoint_dir = self.base_dir / self.arxiv_id
self.vector_store_dir = self.checkpoint_dir / "vector_store"
self.fragment_store_dir = self.checkpoint_dir / "fragments"
else:
self.checkpoint_dir = self.base_dir
self.vector_store_dir = self.base_dir / "vector_store"
self.fragment_store_dir = self.base_dir / "fragments"
self.paper_path = self.checkpoint_dir / f"{self.arxiv_id}.processed"
self.loading = self.paper_path.exists()
# 创建必要的目录
for directory in [self.checkpoint_dir, self.vector_store_dir, self.fragment_store_dir]:
directory.mkdir(parents=True, exist_ok=True)
logger.info(f"Created directory: {directory}")
def _init_workers(self):
"""初始化工作组件"""
try:
self.rag_worker = LlamaIndexRagWorker(
user_name=self.user_name,
llm_kwargs=self.llm_kwargs,
checkpoint_dir=str(self.vector_store_dir),
auto_load_checkpoint=True
)
self.arxiv_splitter = ArxivSplitter(
root_dir=str(self.checkpoint_dir / "arxiv_cache")
)
except Exception as e:
logger.error(f"Error initializing workers: {str(e)}")
raise
def _ensure_loop(self):
"""确保存在事件循环"""
if threading.current_thread() is threading.main_thread():
if self._loop is None:
self._loop = asyncio.get_event_loop()
else:
try:
self._loop = asyncio.get_event_loop()
except RuntimeError:
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
return self._loop
@property
def semaphore(self):
"""延迟创建semaphore"""
if self._semaphore is None:
self._semaphore = asyncio.Semaphore(self.max_concurrent_papers)
return self._semaphore
async def _process_fragments(self, fragments: List[Fragment]) -> None:
"""优化的并行处理论文片段"""
if not fragments:
logger.warning("No fragments to process")
return
start_time = time.time()
total_fragments = len(fragments)
try:
# 1. 处理论文概述
overview = self._create_overview(fragments[0])
overview_success = self._safe_add_to_vector_store_sync(overview['text'])
if not overview_success:
raise RuntimeError("Failed to add overview to vector store")
# 2. 并行处理片段
successful_fragments = await self._parallel_process_fragments(fragments)
# 3. 保存处理结果
if successful_fragments > 0:
await self._save_results(fragments, overview['arxiv_id'], successful_fragments)
except Exception as e:
logger.error(f"Error in fragment processing: {str(e)}")
raise
finally:
self._log_processing_stats(start_time, total_fragments)
def _create_overview(self, first_fragment: Fragment) -> Dict:
"""创建论文概述"""
return {
'arxiv_id': first_fragment.arxiv_id,
'text': (
f"Paper Title: {first_fragment.title}\n"
f"ArXiv ID: {first_fragment.arxiv_id}\n"
f"Abstract: {first_fragment.abstract}\n"
f"Table of contents:{first_fragment.catalogs}\n"
f"Type: OVERVIEW"
)
}
async def _parallel_process_fragments(self, fragments: List[Fragment]) -> int:
"""并行处理所有片段"""
successful_count = 0
loop = self._ensure_loop()
for i in range(0, len(fragments), self._batch_size):
batch = fragments[i:i + self._batch_size]
batch_futures = []
for j, fragment in enumerate(batch):
if not self._is_fragment_processed(fragment, i + j):
future = loop.run_in_executor(
self.thread_pool,
self._process_single_fragment_sync,
fragment,
i + j
)
batch_futures.append(future)
if batch_futures:
results = await asyncio.gather(*batch_futures, return_exceptions=True)
successful_count += sum(1 for r in results if isinstance(r, bool) and r)
return successful_count
def _is_fragment_processed(self, fragment: Fragment, index: int) -> bool:
"""检查片段是否已处理"""
fragment_id = f"{fragment.arxiv_id}_{index}"
with self._processing_lock:
return fragment_id in self._processed_fragments
def _safe_add_to_vector_store_sync(self, text: str) -> bool:
"""线程安全的向量存储添加"""
with self._processing_lock:
try:
self.rag_worker.add_text_to_vector_store(text)
return True
except Exception as e:
logger.error(f"Error adding to vector store: {str(e)}")
return False
def _process_single_fragment_sync(self, fragment: Fragment, index: int) -> bool:
"""处理单个片段"""
fragment_id = f"{fragment.arxiv_id}_{index}"
try:
text = self._build_fragment_text(fragment)
if self._safe_add_to_vector_store_sync(text):
with self._processing_lock:
self._processed_fragments.add(fragment_id)
self._processed_count += 1
logger.info(f"Successfully processed fragment {index}")
return True
return False
except Exception as e:
logger.error(f"Error processing fragment {index}: {str(e)}")
return False
def _build_fragment_text(self, fragment: Fragment) -> str:
"""构建片段文本"""
return "".join([
f"Paper Title: {fragment.title}\n",
f"Section: {fragment.current_section}\n",
f"Content: {fragment.content}\n",
f"Bibliography: {fragment.bibliography}\n",
"Type: FRAGMENT"
])
async def _save_results(self, fragments: List[Fragment], arxiv_id: str, successful_count: int) -> None:
"""保存处理结果"""
if successful_count > 0:
loop = self._ensure_loop()
await loop.run_in_executor(
self.thread_pool,
save_fragments_to_file,
fragments,
str(self.fragment_store_dir / f"{arxiv_id}_fragments.json")
)
def _log_processing_stats(self, start_time: float, total_fragments: int) -> None:
"""记录处理统计信息"""
elapsed_time = time.time() - start_time
processing_rate = total_fragments / elapsed_time if elapsed_time > 0 else 0
logger.info(
f"Processed {self._processed_count}/{total_fragments} fragments "
f"in {elapsed_time:.2f}s (rate: {processing_rate:.2f} fragments/s)"
)
async def process_paper(self, fragments: List[Fragment]) -> bool:
"""处理论文主函数"""
try:
if self.paper_path.exists():
logger.info(f"Paper {self.arxiv_id} already processed")
return True
task = self._create_processing_task(self.arxiv_id)
try:
async with self.semaphore:
await self._process_fragments(fragments)
self._complete_task(task, fragments, self.paper_path)
return True
except Exception as e:
self._fail_task(task, str(e))
raise
except Exception as e:
logger.error(f"Error processing paper {self.arxiv_id}: {str(e)}")
return False
def _create_processing_task(self, arxiv_id: str) -> ProcessingTask:
"""创建处理任务"""
task = ProcessingTask(arxiv_id=arxiv_id)
with self._processing_lock:
self.processing_queue[arxiv_id] = task
task.status = "processing"
return task
def _complete_task(self, task: ProcessingTask, fragments: List[Fragment], paper_path: Path) -> None:
"""完成任务处理"""
with self._processing_lock:
task.status = "completed"
task.fragments = fragments
paper_path.touch()
logger.info(f"Paper {task.arxiv_id} processed successfully with {self._processed_count} fragments")
def _fail_task(self, task: ProcessingTask, error: str) -> None:
"""任务失败处理"""
with self._processing_lock:
task.status = "failed"
task.error = error
def _normalize_arxiv_id(self, input_str: str) -> str:
"""规范化ArXiv ID"""
if not input_str:
return ""
input_str = input_str.strip().lower()
if 'arxiv.org/' in input_str:
if '/pdf/' in input_str:
arxiv_id = input_str.split('/pdf/')[-1]
else:
arxiv_id = input_str.split('/abs/')[-1]
return arxiv_id.split('v')[0].strip()
return input_str.split('v')[0].strip()
async def wait_for_paper(self, arxiv_id: str, timeout: float = 300.0) -> bool:
"""等待论文处理完成"""
start_time = time.time()
try:
while True:
with self._processing_lock:
task = self.processing_queue.get(arxiv_id)
if not task:
return False
if task.status == "completed":
return True
if task.status == "failed":
return False
if time.time() - start_time > timeout:
logger.error(f"Processing paper {arxiv_id} timed out")
return False
await asyncio.sleep(0.1)
except Exception as e:
logger.error(f"Error waiting for paper {arxiv_id}: {str(e)}")
return False
def retrieve_and_generate(self, query: str) -> str:
"""检索相关内容并生成提示词"""
try:
nodes = self.rag_worker.retrieve_from_store_with_query(query)
return self.rag_worker.build_prompt(query=query, nodes=nodes)
except Exception as e:
logger.error(f"Error in retrieve and generate: {str(e)}")
return ""
def remember_qa(self, question: str, answer: str) -> None:
"""记忆问答对"""
try:
self.rag_worker.remember_qa(question, answer)
except Exception as e:
logger.error(f"Error remembering QA: {str(e)}")
async def auto_analyze_paper(self, chatbot: List, history: List, system_prompt: str) -> None:
"""自动分析论文的关键问题"""
key_questions = [
"What is the main research question or problem addressed in this paper?",
"What methods or approaches did the authors use to investigate the problem?",
"What are the key findings or results presented in the paper?",
"How do the findings of this paper contribute to the broader field or topic of study?",
"What are the limitations of this study, and what future research directions do the authors suggest?"
]
results = []
for question in key_questions:
try:
prompt = self.retrieve_and_generate(question)
if prompt:
response = await request_gpt_model_in_new_thread_with_ui_alive(
inputs=prompt,
inputs_show_user=question,
llm_kwargs=self.llm_kwargs,
chatbot=chatbot,
history=history,
sys_prompt=system_prompt
)
results.append(f"Q: {question}\nA: {response}\n")
self.remember_qa(question, response)
except Exception as e:
logger.error(f"Error in auto analysis: {str(e)}")
# 合并所有结果
summary = "\n\n".join(results)
chatbot[-1] = (chatbot[-1][0], f"论文已成功加载并完成初步分析:\n\n{summary}\n\n您现在可以继续提问更多细节。")
@CatchException
def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
history: List, system_prompt: str, web_port: str) -> Generator:
"""
Arxiv论文对话主函数
Args:
txt: arxiv ID/URL
llm_kwargs: LLM配置参数
plugin_kwargs: 插件配置参数,包含 advanced_arg 字段作为用户询问指令
chatbot: 对话历史
history: 聊天历史
system_prompt: 系统提示词
web_port: Web端口
"""
# 初始化时,提示用户需要 arxiv ID/URL
from toolbox import promote_file_to_downloadzone
if len(history) == 0 and not txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')):
chatbot.append((txt, "请先提供Arxiv论文链接或ID。"))
yield from update_ui(chatbot=chatbot, history=history)
return
user_name = chatbot.get_user()
arxiv_worker = ArxivRagWorker(user_name, llm_kwargs, arxiv_id=txt)
arxiv_id = arxiv_worker.arxiv_id
# 处理新论文的情况
if txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')) and not arxiv_worker.loading:
chatbot.append((txt, "正在处理论文,请稍等..."))
yield from update_ui(chatbot=chatbot, history=history)
fragments, formatted_content, fragment_output_files = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
for file in fragment_output_files:
promote_file_to_downloadzone(file, chatbot=chatbot)
chatbot.append(["论文文字内容已保存至下载区,接下来将进行论文编码,请耐心等待三分钟,论文的文字内容为:", formatted_content])
yield from update_ui(chatbot=chatbot, history=history)
try:
# 创建新的事件循环
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# 设置超时时间为5分钟
success = loop.run_until_complete(
asyncio.wait_for(arxiv_worker.process_paper(fragments), timeout=300)
)
if success:
success = loop.run_until_complete(
asyncio.wait_for(arxiv_worker.wait_for_paper(arxiv_id), timeout=60)
)
if success:
chatbot[-1] = (txt, "论文处理完成,您现在可以开始提问。")
else:
chatbot[-1] = (txt, "论文处理超时,请重试。")
else:
chatbot[-1] = (txt, "论文处理失败请检查论文ID是否正确或稍后重试。")
except asyncio.TimeoutError:
chatbot[-1] = (txt, "论文处理超时,请重试。")
success = False
finally:
loop.close()
if not success:
yield from update_ui(chatbot=chatbot, history=history)
return
except Exception as e:
logger.error(f"Error in main process: {str(e)}")
chatbot[-1] = (txt, f"处理过程中发生错误: {str(e)}")
yield from update_ui(chatbot=chatbot, history=history)
return
yield from update_ui(chatbot=chatbot, history=history)
return
# 处理用户询问的情况
# 获取用户询问指令
user_query = plugin_kwargs.get("advanced_arg",
"What is the main research question or problem addressed in this paper?")
if len(history)<2:
fragments, formatted_content, fragment_output_files = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id)
for file in fragment_output_files:
promote_file_to_downloadzone(file, chatbot=chatbot)
chatbot.append(["论文文字内容已保存至下载区,论文的文字内容为:", formatted_content])
yield from update_ui(chatbot=chatbot, history=history)
if not user_query:
user_query = "What is the main research question or problem addressed in this paper?"
# chatbot.append((txt, "请提供您的问题。"))
# yield from update_ui(chatbot=chatbot, history=history)
# return
# 处理历史对话长度
if len(history) > MAX_HISTORY_ROUND * 2:
history = history[-(MAX_HISTORY_ROUND * 2):]
# 处理询问指令
query_clip, history, flags = input_clipping(
user_query,
history,
max_token_limit=MAX_CONTEXT_TOKEN_LIMIT,
return_clip_flags=True
)
if flags["original_input_len"] != flags["clipped_input_len"]:
yield from update_ui_lastest_msg('检测到长输入,正在处理...', chatbot, history, delay=0)
if len(user_query) > REMEMBER_PREVIEW:
HALF = REMEMBER_PREVIEW // 2
query_to_remember = user_query[
:HALF] + f" ...\n...(省略{len(user_query) - REMEMBER_PREVIEW}字)...\n... " + user_query[
-HALF:]
else:
query_to_remember = query_clip
else:
query_to_remember = query_clip
chatbot.append((user_query, "正在思考中..."))
yield from update_ui(chatbot=chatbot, history=history)
# 生成提示词
prompt = arxiv_worker.retrieve_and_generate(query_clip)
if not prompt:
chatbot[-1] = (user_query, "抱歉,处理您的问题时出现错误,请重试。")
yield from update_ui(chatbot=chatbot, history=history)
return
# 获取回答
response = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=prompt,
inputs_show_user=query_clip,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=history,
sys_prompt=system_prompt
)
# 记忆问答对
# worker.remember_qa(query_to_remember, response)
history.extend([user_query, response])
yield from update_ui(chatbot=chatbot, history=history)
if __name__ == "__main__":
# 测试代码
llm_kwargs = {
'api_key': os.getenv("one_api_key"),
'client_ip': '127.0.0.1',
'embed_model': 'text-embedding-3-small',
'llm_model': 'one-api-Qwen2.5-72B-Instruct',
'max_length': 4096,
'most_recent_uploaded': None,
'temperature': 1,
'top_p': 1
}
plugin_kwargs = {}
chatbot = []
history = []
system_prompt = "You are a helpful assistant."
web_port = "8080"
# 测试论文导入
arxiv_url = "https://arxiv.org/abs/2312.12345"
for response in Arxiv论文对话(
arxiv_url, llm_kwargs, plugin_kwargs,
chatbot, history, system_prompt, web_port
):
print(response)
# 测试问答
question = "这篇论文的主要贡献是什么?"
for response in Arxiv论文对话(
question, llm_kwargs, plugin_kwargs,
chatbot, history, system_prompt, web_port
):
print(response)

View File

@@ -1,56 +0,0 @@
from toolbox import get_conf, update_ui
from crazy_functions.Image_Generate import 图片生成_DALLE2, 图片生成_DALLE3, 图片修改_DALLE2
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
class ImageGen_Wrap(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
第一个参数,名称`main_input`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
第二个参数,名称`advanced_arg`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
"""
gui_definition = {
"main_input":
ArgProperty(title="输入图片描述", description="需要生成图像的文本描述,尽量使用英文", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"model_name":
ArgProperty(title="模型", options=["DALLE2", "DALLE3"], default_value="DALLE3", description="", type="dropdown").model_dump_json(),
"resolution":
ArgProperty(title="分辨率", options=["256x256(限DALLE2)", "512x512(限DALLE2)", "1024x1024", "1792x1024(限DALLE3)", "1024x1792(限DALLE3)"], default_value="1024x1024", description="", type="dropdown").model_dump_json(),
"quality (仅DALLE3生效)":
ArgProperty(title="质量", options=["standard", "hd"], default_value="standard", description="", type="dropdown").model_dump_json(),
"style (仅DALLE3生效)":
ArgProperty(title="风格", options=["vivid", "natural"], default_value="vivid", description="", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
# 分辨率
resolution = plugin_kwargs["resolution"].replace("(限DALLE2)", "").replace("(限DALLE3)", "")
if plugin_kwargs["model_name"] == "DALLE2":
plugin_kwargs["advanced_arg"] = resolution
yield from 图片生成_DALLE2(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
elif plugin_kwargs["model_name"] == "DALLE3":
quality = plugin_kwargs["quality (仅DALLE3生效)"]
style = plugin_kwargs["style (仅DALLE3生效)"]
plugin_kwargs["advanced_arg"] = f"{resolution}-{quality}-{style}"
yield from 图片生成_DALLE3(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
else:
chatbot.append([None, "抱歉,找不到该模型"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,278 +0,0 @@
import requests
import random
import time
import re
import json
from bs4 import BeautifulSoup
from functools import lru_cache
from itertools import zip_longest
from check_proxy import check_proxy
from toolbox import CatchException, update_ui, get_conf
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
from request_llms.bridge_all import model_info
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.prompts.internet import SearchOptimizerPrompt, SearchAcademicOptimizerPrompt
def search_optimizer(
query,
proxies,
history,
llm_kwargs,
optimizer=1,
categories="general",
searxng_url=None,
engines=None,
):
# ------------- < 第1步尝试进行搜索优化 > -------------
# * 增强优化,会尝试结合历史记录进行搜索优化
if optimizer == 2:
his = " "
if len(history) == 0:
pass
else:
for i, h in enumerate(history):
if i % 2 == 0:
his += f"Q: {h}\n"
else:
his += f"A: {h}\n"
if categories == "general":
sys_prompt = SearchOptimizerPrompt.format(query=query, history=his, num=4)
elif categories == "science":
sys_prompt = SearchAcademicOptimizerPrompt.format(query=query, history=his, num=4)
else:
his = " "
if categories == "general":
sys_prompt = SearchOptimizerPrompt.format(query=query, history=his, num=3)
elif categories == "science":
sys_prompt = SearchAcademicOptimizerPrompt.format(query=query, history=his, num=3)
mutable = ["", time.time(), ""]
llm_kwargs["temperature"] = 0.8
try:
querys_json = predict_no_ui_long_connection(
inputs=query,
llm_kwargs=llm_kwargs,
history=[],
sys_prompt=sys_prompt,
observe_window=mutable,
)
except Exception:
querys_json = "1234"
#* 尝试解码优化后的搜索结果
querys_json = re.sub(r"```json|```", "", querys_json)
try:
querys = json.loads(querys_json)
except Exception:
#* 如果解码失败,降低温度再试一次
try:
llm_kwargs["temperature"] = 0.4
querys_json = predict_no_ui_long_connection(
inputs=query,
llm_kwargs=llm_kwargs,
history=[],
sys_prompt=sys_prompt,
observe_window=mutable,
)
querys_json = re.sub(r"```json|```", "", querys_json)
querys = json.loads(querys_json)
except Exception:
#* 如果再次失败,直接返回原始问题
querys = [query]
links = []
success = 0
Exceptions = ""
for q in querys:
try:
link = searxng_request(q, proxies, categories, searxng_url, engines=engines)
if len(link) > 0:
links.append(link[:-5])
success += 1
except Exception:
Exceptions = Exception
pass
if success == 0:
raise ValueError(f"在线搜索失败!\n{Exceptions}")
# * 清洗搜索结果,依次放入每组第一,第二个搜索结果,并清洗重复的搜索结果
seen_links = set()
result = []
for tuple in zip_longest(*links, fillvalue=None):
for item in tuple:
if item is not None:
link = item["link"]
if link not in seen_links:
seen_links.add(link)
result.append(item)
return result
@lru_cache
def get_auth_ip():
ip = check_proxy(None, return_ip=True)
if ip is None:
return '114.114.114.' + str(random.randint(1, 10))
return ip
def searxng_request(query, proxies, categories='general', searxng_url=None, engines=None):
if searxng_url is None:
url = get_conf("SEARXNG_URL")
else:
url = searxng_url
if engines == "Mixed":
engines = None
if categories == 'general':
params = {
'q': query, # 搜索查询
'format': 'json', # 输出格式为JSON
'language': 'zh', # 搜索语言
'engines': engines,
}
elif categories == 'science':
params = {
'q': query, # 搜索查询
'format': 'json', # 输出格式为JSON
'language': 'zh', # 搜索语言
'categories': 'science'
}
else:
raise ValueError('不支持的检索类型')
headers = {
'Accept-Language': 'zh-CN,zh;q=0.9',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'X-Forwarded-For': get_auth_ip(),
'X-Real-IP': get_auth_ip()
}
results = []
response = requests.post(url, params=params, headers=headers, proxies=proxies, timeout=30)
if response.status_code == 200:
json_result = response.json()
for result in json_result['results']:
item = {
"title": result.get("title", ""),
"source": result.get("engines", "unknown"),
"content": result.get("content", ""),
"link": result["url"],
}
results.append(item)
return results
else:
if response.status_code == 429:
raise ValueError("Searxng在线搜索服务当前使用人数太多请稍后。")
else:
raise ValueError("在线搜索失败,状态码: " + str(response.status_code) + '\t' + response.content.decode('utf-8'))
def scrape_text(url, proxies) -> str:
"""Scrape text from a webpage
Args:
url (str): The URL to scrape text from
Returns:
str: The scraped text
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
'Content-Type': 'text/plain',
}
try:
response = requests.get(url, headers=headers, proxies=proxies, timeout=8)
if response.encoding == "ISO-8859-1": response.encoding = response.apparent_encoding
except:
return "无法连接到该网页"
soup = BeautifulSoup(response.text, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = "\n".join(chunk for chunk in chunks if chunk)
return text
@CatchException
def 连接网络回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
optimizer_history = history[:-8]
history = [] # 清空历史,以免输入溢出
chatbot.append((f"请结合互联网信息回答以下问题:{txt}", "检索中..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# ------------- < 第1步爬取搜索引擎的结果 > -------------
from toolbox import get_conf
proxies = get_conf('proxies')
categories = plugin_kwargs.get('categories', 'general')
searxng_url = plugin_kwargs.get('searxng_url', None)
engines = plugin_kwargs.get('engine', None)
optimizer = plugin_kwargs.get('optimizer', "关闭")
if optimizer == "关闭":
urls = searxng_request(txt, proxies, categories, searxng_url, engines=engines)
else:
urls = search_optimizer(txt, proxies, optimizer_history, llm_kwargs, optimizer, categories, searxng_url, engines)
history = []
if len(urls) == 0:
chatbot.append((f"结论:{txt}",
"[Local Message] 受到限制无法从searxng获取信息请尝试更换搜索引擎。"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# ------------- < 第2步依次访问网页 > -------------
max_search_result = 5 # 最多收纳多少个网页的结果
if optimizer == "开启(增强)":
max_search_result = 8
chatbot.append(["联网检索中 ...", None])
for index, url in enumerate(urls[:max_search_result]):
res = scrape_text(url['link'], proxies)
prefix = f"{index}份搜索结果 [源自{url['source'][0]}搜索] {url['title'][:25]}"
history.extend([prefix, res])
res_squeeze = res.replace('\n', '...')
chatbot[-1] = [prefix + "\n\n" + res_squeeze[:500] + "......", None]
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# ------------- < 第3步ChatGPT综合 > -------------
if (optimizer != "开启(增强)"):
i_say = f"从以上搜索结果中抽取信息,然后回答问题:{txt}"
i_say, history = input_clipping( # 裁剪输入从最长的条目开始裁剪防止爆token
inputs=i_say,
history=history,
max_token_limit=min(model_info[llm_kwargs['llm_model']]['max_token']*3//4, 8192)
)
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user=i_say,
llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
sys_prompt="请从给定的若干条搜索结果中抽取信息,对最相关的两个搜索结果进行总结,然后回答问题。"
)
chatbot[-1] = (i_say, gpt_say)
history.append(i_say);history.append(gpt_say)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
#* 或者使用搜索优化器,这样可以保证后续问答能读取到有效的历史记录
else:
i_say = f"从以上搜索结果中抽取与问题:{txt} 相关的信息:"
i_say, history = input_clipping( # 裁剪输入从最长的条目开始裁剪防止爆token
inputs=i_say,
history=history,
max_token_limit=min(model_info[llm_kwargs['llm_model']]['max_token']*3//4, 8192)
)
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user=i_say,
llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
sys_prompt="请从给定的若干条搜索结果中抽取信息,对最相关的三个搜索结果进行总结"
)
chatbot[-1] = (i_say, gpt_say)
history = []
history.append(i_say);history.append(gpt_say)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
# ------------- < 第4步根据综合回答问题 > -------------
i_say = f"请根据以上搜索结果回答问题:{txt}"
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user=i_say,
llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
sys_prompt="请根据给定的若干条搜索结果回答问题"
)
chatbot[-1] = (i_say, gpt_say)
history.append(i_say);history.append(gpt_say)
yield from update_ui(chatbot=chatbot, history=history)

View File

@@ -1,45 +0,0 @@
from toolbox import get_conf
from crazy_functions.Internet_GPT import 连接网络回答问题
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
class NetworkGPT_Wrap(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
第一个参数,名称`main_input`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
第二个参数,名称`advanced_arg`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
第三个参数,名称`allow_cache`,参数`type`声明这是一个下拉菜单,下拉菜单上方显示`title`+`description`,下拉菜单的选项为`options``default_value`为下拉菜单默认值;
"""
gui_definition = {
"main_input":
ArgProperty(title="输入问题", description="待通过互联网检索的问题,会自动读取输入框内容", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"categories":
ArgProperty(title="搜索分类", options=["网页", "学术论文"], default_value="网页", description="", type="dropdown").model_dump_json(),
"engine":
ArgProperty(title="选择搜索引擎", options=["Mixed", "bing", "google", "duckduckgo"], default_value="google", description="", type="dropdown").model_dump_json(),
"optimizer":
ArgProperty(title="搜索优化", options=["关闭", "开启", "开启(增强)"], default_value="关闭", description="是否使用搜索增强。注意这可能会消耗较多token", type="dropdown").model_dump_json(),
"searxng_url":
ArgProperty(title="Searxng服务地址", description="输入Searxng的地址", default_value=get_conf("SEARXNG_URL"), type="string").model_dump_json(), # 主输入,自动从输入框同步
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
if plugin_kwargs["categories"] == "网页": plugin_kwargs["categories"] = "general"
if plugin_kwargs["categories"] == "学术论文": plugin_kwargs["categories"] = "science"
yield from 连接网络回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)

View File

@@ -1,85 +0,0 @@
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF, PDF翻译中文并重新编译PDF
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
class Arxiv_Localize(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
第一个参数,名称`main_input`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
第二个参数,名称`advanced_arg`,参数`type`声明这是一个文本框,文本框上方显示`title`,文本框内部显示`description``default_value`为默认值;
第三个参数,名称`allow_cache`,参数`type`声明这是一个下拉菜单,下拉菜单上方显示`title`+`description`,下拉菜单的选项为`options``default_value`为下拉菜单默认值;
"""
gui_definition = {
"main_input":
ArgProperty(title="ArxivID", description="输入Arxiv的ID或者网址", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"advanced_arg":
ArgProperty(title="额外的翻译提示词",
description=r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 "
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
"allow_cache":
ArgProperty(title="是否允许从缓存中调取结果", options=["允许缓存", "从头执行"], default_value="允许缓存", description="", type="dropdown").model_dump_json(),
"allow_cloudio":
ArgProperty(title="是否允许从GPTAC学术云下载(或者上传)翻译结果(仅针对Arxiv论文)", options=["允许", "禁止"], default_value="禁止", description="共享文献,互助互利", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
allow_cache = plugin_kwargs["allow_cache"]
allow_cloudio = plugin_kwargs["allow_cloudio"]
advanced_arg = plugin_kwargs["advanced_arg"]
if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"]
# 从云端下载翻译结果,以及上传翻译结果到云端;人人为我,我为人人。
if allow_cloudio == "允许": plugin_kwargs["advanced_arg"] = "--allow-cloudio " + plugin_kwargs["advanced_arg"]
yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
class PDF_Localize(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
"""
gui_definition = {
"main_input":
ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"advanced_arg":
ArgProperty(title="额外的翻译提示词",
description=r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 "
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
"method":
ArgProperty(title="采用哪种方法执行转换", options=["MATHPIX", "DOC2X"], default_value="DOC2X", description="", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
yield from PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)

View File

@@ -1,6 +1,6 @@
from toolbox import update_ui, trimmed_format_exc, promote_file_to_downloadzone, get_log_folder
from toolbox import CatchException, report_exception, write_history_to_file, zip_folder
from loguru import logger
class PaperFileGroup():
def __init__(self):
@@ -33,7 +33,7 @@ class PaperFileGroup():
self.sp_file_index.append(index)
self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
logger.info('Segmentation: done')
print('Segmentation: done')
def merge_result(self):
self.file_result = ["" for _ in range(len(self.file_paths))]
for r, k in zip(self.sp_file_result, self.sp_file_index):
@@ -56,7 +56,7 @@ class PaperFileGroup():
def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='polish'):
import time, os, re
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
# <-------- 读取Latex文件删除其中的所有注释 ---------->
@@ -81,8 +81,8 @@ def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
# <-------- 多线程润色开始 ---------->
if language == 'en':
if mode == 'polish':
inputs_array = [r"Below is a section from an academic paper, polish this section to meet the academic standard, " +
r"improve the grammar, clarity and overall readability, do not modify any latex command such as \section, \cite and equations:" +
inputs_array = ["Below is a section from an academic paper, polish this section to meet the academic standard, " +
"improve the grammar, clarity and overall readability, do not modify any latex command such as \section, \cite and equations:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
else:
inputs_array = [r"Below is a section from an academic paper, proofread this section." +
@@ -93,10 +93,10 @@ def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
elif language == 'zh':
if mode == 'polish':
inputs_array = [r"以下是一篇学术论文中的一段内容请将此部分润色以满足学术标准提高语法、清晰度和整体可读性不要修改任何LaTeX命令例如\section\cite和方程式" +
inputs_array = [f"以下是一篇学术论文中的一段内容请将此部分润色以满足学术标准提高语法、清晰度和整体可读性不要修改任何LaTeX命令例如\section\cite和方程式" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
else:
inputs_array = [r"以下是一篇学术论文中的一段内容请对这部分内容进行语法矫正。不要修改任何LaTeX命令例如\section\cite和方程式" +
inputs_array = [f"以下是一篇学术论文中的一段内容请对这部分内容进行语法矫正。不要修改任何LaTeX命令例如\section\cite和方程式" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"润色 {f}" for f in pfg.sp_file_tag]
sys_prompt_array=["你是一位专业的中文学术论文作家。" for _ in range(n_split)]
@@ -122,7 +122,7 @@ def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
pfg.write_result()
pfg.zip_result()
except:
logger.error(trimmed_format_exc())
print(trimmed_format_exc())
# <-------- 整理结果,退出 ---------->
create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md"

View File

@@ -1,6 +1,6 @@
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import CatchException, report_exception, write_history_to_file
from loguru import logger
fast_debug = False
class PaperFileGroup():
def __init__(self):
@@ -33,11 +33,11 @@ class PaperFileGroup():
self.sp_file_index.append(index)
self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
logger.info('Segmentation: done')
print('Segmentation: done')
def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
import time, os, re
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
# <-------- 读取Latex文件删除其中的所有注释 ---------->
pfg = PaperFileGroup()

View File

@@ -1,12 +1,10 @@
from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone, check_repeat_upload, map_file_to_sha256
from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone
from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip_result, gen_time_str
from functools import partial
from loguru import logger
import glob, os, requests, time, json, tarfile, threading
import glob, os, requests, time, json, tarfile
pj = os.path.join
ARXIV_CACHE_DIR = get_conf("ARXIV_CACHE_DIR")
ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/")
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 工具函数 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -109,10 +107,6 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
except ValueError:
return False
if txt.startswith('https://arxiv.org/pdf/'):
arxiv_id = txt.split('/')[-1] # 2402.14207v2.pdf
txt = arxiv_id.split('v')[0] # 2402.14207
if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID
txt = 'https://arxiv.org/abs/' + txt.strip()
if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID
@@ -127,7 +121,6 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
time.sleep(1) # 刷新界面
url_ = txt # https://arxiv.org/abs/1707.06690
if not txt.startswith('https://arxiv.org/abs/'):
msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}"
yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面
@@ -138,48 +131,29 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id
extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
url_tar = url_.replace('/abs/', '/e-print/')
translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
dst = pj(translation_dir, arxiv_id + '.tar')
extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
os.makedirs(translation_dir, exist_ok=True)
# <-------------- download arxiv source file ------------->
def fix_url_and_download():
# for url_tar in [url_.replace('/abs/', '/e-print/'), url_.replace('/abs/', '/src/')]:
for url_tar in [url_.replace('/abs/', '/src/'), url_.replace('/abs/', '/e-print/')]:
# <-------------- download arxiv source file ------------->
dst = pj(translation_dir, arxiv_id + '.tar')
if os.path.exists(dst):
yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面
else:
yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面
proxies = get_conf('proxies')
r = requests.get(url_tar, proxies=proxies)
if r.status_code == 200:
with open(dst, 'wb+') as f:
f.write(r.content)
return True
return False
if os.path.exists(dst) and allow_cache:
yield from update_ui_lastest_msg(f"调用缓存 {arxiv_id}", chatbot=chatbot, history=history) # 刷新界面
success = True
else:
yield from update_ui_lastest_msg(f"开始下载 {arxiv_id}", chatbot=chatbot, history=history) # 刷新界面
success = fix_url_and_download()
yield from update_ui_lastest_msg(f"下载完成 {arxiv_id}", chatbot=chatbot, history=history) # 刷新界面
if not success:
yield from update_ui_lastest_msg(f"下载失败 {arxiv_id}", chatbot=chatbot, history=history)
raise tarfile.ReadError(f"论文下载失败 {arxiv_id}")
# <-------------- extract file ------------->
yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面
from toolbox import extract_archive
try:
extract_archive(file_path=dst, dest_dir=extract_dst)
except tarfile.ReadError:
os.remove(dst)
raise tarfile.ReadError(f"论文下载失败")
return extract_dst, arxiv_id
def pdf2tex_project(pdf_file_path, plugin_kwargs):
if plugin_kwargs["method"] == "MATHPIX":
def pdf2tex_project(pdf_file_path):
# Mathpix API credentials
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
headers = {"app_id": app_id, "app_key": app_key}
@@ -198,7 +172,7 @@ def pdf2tex_project(pdf_file_path, plugin_kwargs):
if response.ok:
pdf_id = response.json()["pdf_id"]
logger.info(f"PDF processing initiated. PDF ID: {pdf_id}")
print(f"PDF processing initiated. PDF ID: {pdf_id}")
# Step 2: Check processing status
while True:
@@ -206,12 +180,12 @@ def pdf2tex_project(pdf_file_path, plugin_kwargs):
conversion_data = conversion_response.json()
if conversion_data["status"] == "completed":
logger.info("PDF processing completed.")
print("PDF processing completed.")
break
elif conversion_data["status"] == "error":
logger.info("Error occurred during processing.")
print("Error occurred during processing.")
else:
logger.info(f"Processing status: {conversion_data['status']}")
print(f"Processing status: {conversion_data['status']}")
time.sleep(5) # wait for a few seconds before checking again
# Step 3: Save results to local files
@@ -226,7 +200,7 @@ def pdf2tex_project(pdf_file_path, plugin_kwargs):
output_path = os.path.join(output_dir, output_name)
with open(output_path, "wb") as output_file:
output_file.write(response.content)
logger.info(f"tex.zip file saved at: {output_path}")
print(f"tex.zip file saved at: {output_path}")
import zipfile
unzip_dir = os.path.join(output_dir, file_name_wo_dot)
@@ -236,14 +210,8 @@ def pdf2tex_project(pdf_file_path, plugin_kwargs):
return unzip_dir
else:
logger.error(f"Error sending PDF for processing. Status code: {response.status_code}")
print(f"Error sending PDF for processing. Status code: {response.status_code}")
return None
else:
from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex
unzip_dir = 解析PDF_DOC2X_转Latex(pdf_file_path)
return unzip_dir
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
@@ -253,7 +221,7 @@ def pdf2tex_project(pdf_file_path, plugin_kwargs):
def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
# <-------------- information about this plugin ------------->
chatbot.append(["函数插件功能?",
"对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前对机器学习类文献转化效果最好其他类型文献转化效果未知。仅在Windows系统进行了测试其他操作系统表现未知。"])
"对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4其他模型转化效果未知。目前对机器学习类文献转化效果最好其他类型文献转化效果未知。仅在Windows系统进行了测试其他操作系统表现未知。"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# <-------------- more requirements ------------->
@@ -291,8 +259,6 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
project_folder = desend_to_extracted_folder_if_exist(project_folder)
# <-------------- move latex project away from temp folder ------------->
from shared_utils.fastapi_server import validate_path_safety
validate_path_safety(project_folder, chatbot.get_user())
project_folder = move_project(project_folder, arxiv_id=None)
# <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
@@ -316,7 +282,7 @@ def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, histo
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
else:
chatbot.append((f"失败了",
'虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+Conversation_To_File进行反馈 ...'))
'虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
yield from update_ui(chatbot=chatbot, history=history);
time.sleep(1) # 刷新界面
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
@@ -332,23 +298,17 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
# <-------------- information about this plugin ------------->
chatbot.append([
"函数插件功能?",
"对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 此插件Windows支持最佳Linux下必须使用Docker安装详见项目主README.md。目前对机器学习类文献转化效果最好其他类型文献转化效果未知。"])
"对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 此插件Windows支持最佳Linux下必须使用Docker安装详见项目主README.md。目前仅支持GPT3.5/GPT4其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# <-------------- more requirements ------------->
if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
more_req = plugin_kwargs.get("advanced_arg", "")
no_cache = ("--no-cache" in more_req)
if no_cache: more_req = more_req.replace("--no-cache", "").strip()
allow_gptac_cloud_io = ("--allow-cloudio" in more_req) # 从云端下载翻译结果,以及上传翻译结果到云端
if allow_gptac_cloud_io: more_req = more_req.replace("--allow-cloudio", "").strip()
no_cache = more_req.startswith("--no-cache")
if no_cache: more_req.lstrip("--no-cache")
allow_cache = not no_cache
_switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
# <-------------- check deps ------------->
try:
import glob, os, time, subprocess
@@ -375,20 +335,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# #################################################################
if allow_gptac_cloud_io and arxiv_id:
# 访问 GPTAC学术云查询云端是否存在该论文的翻译版本
from crazy_functions.latex_fns.latex_actions import check_gptac_cloud
success, downloaded = check_gptac_cloud(arxiv_id, chatbot)
if success:
chatbot.append([
f"检测到GPTAC云端存在翻译版本, 如果不满意翻译结果, 请禁用云端分享, 然后重新执行。",
None
])
yield from update_ui(chatbot=chatbot, history=history)
return
#################################################################
if os.path.exists(txt):
project_folder = txt
else:
@@ -407,8 +353,6 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
project_folder = desend_to_extracted_folder_if_exist(project_folder)
# <-------------- move latex project away from temp folder ------------->
from shared_utils.fastapi_server import validate_path_safety
validate_path_safety(project_folder, chatbot.get_user())
project_folder = move_project(project_folder, arxiv_id)
# <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
@@ -426,21 +370,14 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot,
# <-------------- zip PDF ------------->
zip_res = zip_result(project_folder)
if success:
if allow_gptac_cloud_io and arxiv_id:
# 如果用户允许我们将翻译好的arxiv论文PDF上传到GPTAC学术云
from crazy_functions.latex_fns.latex_actions import upload_to_gptac_cloud_if_user_allow
threading.Thread(target=upload_to_gptac_cloud_if_user_allow,
args=(chatbot, arxiv_id), daemon=True).start()
chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
yield from update_ui(chatbot=chatbot, history=history)
yield from update_ui(chatbot=chatbot, history=history);
time.sleep(1) # 刷新界面
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
else:
chatbot.append((f"失败了",
'虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux请检查系统字体见Github wiki ...'))
yield from update_ui(chatbot=chatbot, history=history)
yield from update_ui(chatbot=chatbot, history=history);
time.sleep(1) # 刷新界面
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
@@ -455,7 +392,7 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
# <-------------- information about this plugin ------------->
chatbot.append([
"函数插件功能?",
"将PDF转换为Latex项目翻译为中文后重新编译为PDF。函数插件贡献者: Marroh。注意事项: 此插件Windows支持最佳Linux下必须使用Docker安装详见项目主README.md。目前对机器学习类文献转化效果最好其他类型文献转化效果未知。"])
"将PDF转换为Latex项目翻译为中文后重新编译为PDF。函数插件贡献者: Marroh。注意事项: 此插件Windows支持最佳Linux下必须使用Docker安装详见项目主README.md。目前仅支持GPT3.5/GPT4其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# <-------------- more requirements ------------->
@@ -495,55 +432,16 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
if plugin_kwargs.get("method", "") == 'MATHPIX':
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
if len(app_id) == 0 or len(app_key) == 0:
report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
if plugin_kwargs.get("method", "") == 'DOC2X':
app_id, app_key = "", ""
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
if len(DOC2X_API_KEY) == 0:
report_exception(chatbot, history, a="缺失 DOC2X_API_KEY。", b=f"请配置 DOC2X_API_KEY")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
hash_tag = map_file_to_sha256(file_manifest[0])
# # <-------------- check repeated pdf ------------->
# chatbot.append([f"检查PDF是否被重复上传", "正在检查..."])
# yield from update_ui(chatbot=chatbot, history=history)
# repeat, project_folder = check_repeat_upload(file_manifest[0], hash_tag)
# if repeat:
# yield from update_ui_lastest_msg(f"发现重复上传,请查收结果(压缩包)...", chatbot=chatbot, history=history)
# try:
# translate_pdf = [f for f in glob.glob(f'{project_folder}/**/merge_translate_zh.pdf', recursive=True)][0]
# promote_file_to_downloadzone(translate_pdf, rename_file=None, chatbot=chatbot)
# comparison_pdf = [f for f in glob.glob(f'{project_folder}/**/comparison.pdf', recursive=True)][0]
# promote_file_to_downloadzone(comparison_pdf, rename_file=None, chatbot=chatbot)
# zip_res = zip_result(project_folder)
# promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
# return
# except:
# report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"发现重复上传,但是无法找到相关文件")
# yield from update_ui(chatbot=chatbot, history=history)
# else:
# yield from update_ui_lastest_msg(f"未发现重复上传", chatbot=chatbot, history=history)
# <-------------- convert pdf into tex ------------->
chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目请耐心等待..."])
yield from update_ui(chatbot=chatbot, history=history)
project_folder = pdf2tex_project(file_manifest[0], plugin_kwargs)
if project_folder is None:
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败")
yield from update_ui(chatbot=chatbot, history=history)
return False
project_folder = pdf2tex_project(file_manifest[0])
# <-------------- translate latex file into Chinese ------------->
yield from update_ui_lastest_msg("正在tex项目将翻译为中文...", chatbot=chatbot, history=history)
# Translate English Latex to Chinese Latex, and compile it
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
if len(file_manifest) == 0:
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.tex文件: {txt}")
@@ -554,16 +452,8 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
project_folder = desend_to_extracted_folder_if_exist(project_folder)
# <-------------- move latex project away from temp folder ------------->
from shared_utils.fastapi_server import validate_path_safety
validate_path_safety(project_folder, chatbot.get_user())
project_folder = move_project(project_folder)
# <-------------- set a hash tag for repeat-checking ------------->
with open(pj(project_folder, hash_tag + '.tag'), 'w') as f:
f.write(hash_tag)
f.close()
# <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
if not os.path.exists(project_folder + '/merge_translate_zh.tex'):
yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
@@ -571,7 +461,6 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
switch_prompt=_switch_prompt_)
# <-------------- compile PDF ------------->
yield from update_ui_lastest_msg("正在将翻译好的项目tex项目编译为PDF...", chatbot=chatbot, history=history)
success = yield from 编译Latex(chatbot, history, main_file_original='merge',
main_file_modified='merge_translate_zh', mode='translate_zh',
work_folder_original=project_folder, work_folder_modified=project_folder,

View File

@@ -1,83 +0,0 @@
from toolbox import CatchException, check_packages, get_conf
from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion
from toolbox import trimmed_format_exc_markdown
from crazy_functions.crazy_utils import get_files_from_everything
from crazy_functions.pdf_fns.parse_pdf import get_avail_grobid_url
from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_基于DOC2X
from crazy_functions.pdf_fns.parse_pdf_legacy import 解析PDF_简单拆解
from crazy_functions.pdf_fns.parse_pdf_grobid import 解析PDF_基于GROBID
from shared_utils.colorful import *
@CatchException
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
disable_auto_promotion(chatbot)
# 基本信息:功能、贡献者
chatbot.append([None, "插件功能批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
check_packages(["fitz", "tiktoken", "scipdf"])
except:
chatbot.append([None, f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf tiktoken scipdf_parser```。"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 清空历史,以免输入溢出
history = []
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
# 检测输入参数,如没有给定输入参数,直接退出
if (not success) and txt == "": txt = '空空如也的输入栏。提示请先上传文件把PDF文件拖入对话'
# 如果没找到任何文件
if len(file_manifest) == 0:
chatbot.append([None, f"找不到任何.pdf拓展名的文件: {txt}"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 开始正式执行任务
method = plugin_kwargs.get("pdf_parse_method", None)
if method == "DOC2X":
# ------- 第一种方法效果最好但是需要DOC2X服务 -------
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
if len(DOC2X_API_KEY) != 0:
try:
yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
return
except:
chatbot.append([None, f"DOC2X服务不可用现在将执行效果稍差的旧版代码。{trimmed_format_exc_markdown()}"])
yield from update_ui(chatbot=chatbot, history=history)
if method == "GROBID":
# ------- 第二种方法,效果次优 -------
grobid_url = get_avail_grobid_url()
if grobid_url is not None:
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
return
if method == "ClASSIC":
# ------- 第三种方法,早期代码,效果不理想 -------
yield from update_ui_lastest_msg("GROBID服务不可用请检查config中的GROBID_URL。作为替代现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
yield from 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
return
if method is None:
# ------- 以上三种方法都试一遍 -------
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
if len(DOC2X_API_KEY) != 0:
try:
yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
return
except:
chatbot.append([None, f"DOC2X服务不可用正在尝试GROBID。{trimmed_format_exc_markdown()}"])
yield from update_ui(chatbot=chatbot, history=history)
grobid_url = get_avail_grobid_url()
if grobid_url is not None:
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
return
yield from update_ui_lastest_msg("GROBID服务不可用请检查config中的GROBID_URL。作为替代现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
yield from 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
return

View File

@@ -1,33 +0,0 @@
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
from .PDF_Translate import 批量翻译PDF文档
class PDF_Tran(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
"""
gui_definition = {
"main_input":
ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"additional_prompt":
ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
"pdf_parse_method":
ArgProperty(title="PDF解析方法", options=["DOC2X", "GROBID", "ClASSIC"], description="", default_value="GROBID", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
main_input = plugin_kwargs["main_input"]
additional_prompt = plugin_kwargs["additional_prompt"]
pdf_parse_method = plugin_kwargs["pdf_parse_method"]
yield from 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)

View File

@@ -1,154 +0,0 @@
import os,glob
from typing import List
from shared_utils.fastapi_server import validate_path_safety
from toolbox import report_exception
from toolbox import CatchException, update_ui, get_conf, get_log_folder, update_ui_lastest_msg
from crazy_functions.crazy_utils import input_clipping
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
RAG_WORKER_REGISTER = {}
MAX_HISTORY_ROUND = 5
MAX_CONTEXT_TOKEN_LIMIT = 4096
REMEMBER_PREVIEW = 1000
@CatchException
def handle_document_upload(files: List[str], llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request, rag_worker):
"""
Handles document uploads by extracting text and adding it to the vector store.
"""
from llama_index.core import Document
from crazy_functions.rag_fns.rag_file_support import extract_text, supports_format
user_name = chatbot.get_user()
checkpoint_dir = get_log_folder(user_name, plugin_name='experimental_rag')
for file_path in files:
try:
validate_path_safety(file_path, user_name)
text = extract_text(file_path)
if text is None:
chatbot.append(
[f"上传文件: {os.path.basename(file_path)}", f"文件解析失败无法提取文本内容请更换文件。失败原因可能为1.文档格式过于复杂2. 不支持的文件格式,支持的文件格式后缀有:" + ", ".join(supports_format)])
else:
chatbot.append(
[f"上传文件: {os.path.basename(file_path)}", f"上传文件前50个字符为:{text[:50]}"])
document = Document(text=text, metadata={"source": file_path})
rag_worker.add_documents_to_vector_store([document])
chatbot.append([f"上传文件: {os.path.basename(file_path)}", "文件已成功添加到知识库。"])
except Exception as e:
report_exception(chatbot, history, a=f"处理文件: {file_path}", b=str(e))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# Main Q&A function with document upload support
@CatchException
def Rag问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
# import vector store lib
VECTOR_STORE_TYPE = "Milvus"
if VECTOR_STORE_TYPE == "Milvus":
try:
from crazy_functions.rag_fns.milvus_worker import MilvusRagWorker as LlamaIndexRagWorker
except:
VECTOR_STORE_TYPE = "Simple"
if VECTOR_STORE_TYPE == "Simple":
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker
# 1. we retrieve rag worker from global context
user_name = chatbot.get_user()
checkpoint_dir = get_log_folder(user_name, plugin_name='experimental_rag')
if user_name in RAG_WORKER_REGISTER:
rag_worker = RAG_WORKER_REGISTER[user_name]
else:
rag_worker = RAG_WORKER_REGISTER[user_name] = LlamaIndexRagWorker(
user_name,
llm_kwargs,
checkpoint_dir=checkpoint_dir,
auto_load_checkpoint=True
)
current_context = f"{VECTOR_STORE_TYPE} @ {checkpoint_dir}"
tip = "提示输入“清空向量数据库”可以清空RAG向量数据库"
# 2. Handle special commands
if os.path.exists(txt) and os.path.isdir(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
# Extract file paths from the user input
# Assuming the user inputs file paths separated by commas after the command
file_paths = [f for f in glob.glob(f'{project_folder}/**/*', recursive=True)]
chatbot.append([txt, f'正在处理上传的文档 ({current_context}) ...'])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
yield from handle_document_upload(file_paths, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request, rag_worker)
return
elif txt == "清空向量数据库":
chatbot.append([txt, f'正在清空 ({current_context}) ...'])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
rag_worker.purge_vector_store()
yield from update_ui_lastest_msg('已清空', chatbot, history, delay=0) # 刷新界面
return
else:
report_exception(chatbot, history, a=f"上传文件路径错误: {txt}", b="请检查并提供正确路径。")
# 3. Normal Q&A processing
chatbot.append([txt, f'正在召回知识 ({current_context}) ...'])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 4. Clip history to reduce token consumption
txt_origin = txt
if len(history) > MAX_HISTORY_ROUND * 2:
history = history[-(MAX_HISTORY_ROUND * 2):]
txt_clip, history, flags = input_clipping(txt, history, max_token_limit=MAX_CONTEXT_TOKEN_LIMIT, return_clip_flags=True)
input_is_clipped_flag = (flags["original_input_len"] != flags["clipped_input_len"])
# 5. If input is clipped, add input to vector store before retrieve
if input_is_clipped_flag:
yield from update_ui_lastest_msg('检测到长输入, 正在向量化 ...', chatbot, history, delay=0) # 刷新界面
# Save input to vector store
rag_worker.add_text_to_vector_store(txt_origin)
yield from update_ui_lastest_msg('向量化完成 ...', chatbot, history, delay=0) # 刷新界面
if len(txt_origin) > REMEMBER_PREVIEW:
HALF = REMEMBER_PREVIEW // 2
i_say_to_remember = txt[:HALF] + f" ...\n...(省略{len(txt_origin)-REMEMBER_PREVIEW}字)...\n... " + txt[-HALF:]
if (flags["original_input_len"] - flags["clipped_input_len"]) > HALF:
txt_clip = txt_clip + f" ...\n...(省略{len(txt_origin)-len(txt_clip)-HALF}字)...\n... " + txt[-HALF:]
else:
i_say_to_remember = i_say = txt_clip
else:
i_say_to_remember = i_say = txt_clip
# 6. Search vector store and build prompts
nodes = rag_worker.retrieve_from_store_with_query(i_say)
prompt = rag_worker.build_prompt(query=i_say, nodes=nodes)
# 7. Query language model
if len(chatbot) != 0:
chatbot.pop(-1) # Pop temp chat, because we are going to add them again inside `request_gpt_model_in_new_thread_with_ui_alive`
model_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=prompt,
inputs_show_user=i_say,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=history,
sys_prompt=system_prompt,
retry_times_at_unknown_error=0
)
# 8. Remember Q&A
yield from update_ui_lastest_msg(
model_say + '</br></br>' + f'对话记忆中, 请稍等 ({current_context}) ...',
chatbot, history, delay=0.5
)
rag_worker.remember_qa(i_say_to_remember, model_say)
history.extend([i_say, model_say])
# 9. Final UI Update
yield from update_ui_lastest_msg(model_say, chatbot, history, delay=0, msg=tip)

View File

@@ -1,167 +0,0 @@
import pickle, os, random
from toolbox import CatchException, update_ui, get_conf, get_log_folder, update_ui_lastest_msg
from crazy_functions.crazy_utils import input_clipping
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.json_fns.select_tool import structure_output, select_tool
from pydantic import BaseModel, Field
from loguru import logger
from typing import List
SOCIAL_NETWOK_WORKER_REGISTER = {}
class SocialNetwork():
def __init__(self):
self.people = []
class SaveAndLoad():
def __init__(self, user_name, llm_kwargs, auto_load_checkpoint=True, checkpoint_dir=None) -> None:
self.user_name = user_name
self.checkpoint_dir = checkpoint_dir
if auto_load_checkpoint:
self.social_network = self.load_from_checkpoint(checkpoint_dir)
else:
self.social_network = SocialNetwork()
def does_checkpoint_exist(self, checkpoint_dir=None):
import os, glob
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if not os.path.exists(checkpoint_dir): return False
if len(glob.glob(os.path.join(checkpoint_dir, "social_network.pkl"))) == 0: return False
return True
def save_to_checkpoint(self, checkpoint_dir=None):
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
with open(os.path.join(checkpoint_dir, 'social_network.pkl'), "wb+") as f:
pickle.dump(self.social_network, f)
return
def load_from_checkpoint(self, checkpoint_dir=None):
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if self.does_checkpoint_exist(checkpoint_dir=checkpoint_dir):
with open(os.path.join(checkpoint_dir, 'social_network.pkl'), "rb") as f:
social_network = pickle.load(f)
return social_network
else:
return SocialNetwork()
class Friend(BaseModel):
friend_name: str = Field(description="name of a friend")
friend_description: str = Field(description="description of a friend (everything about this friend)")
friend_relationship: str = Field(description="The relationship with a friend (e.g. friend, family, colleague)")
class FriendList(BaseModel):
friends_list: List[Friend] = Field(description="The list of friends")
class SocialNetworkWorker(SaveAndLoad):
def ai_socail_advice(self, prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, run_gpt_fn, intention_type):
pass
def ai_remove_friend(self, prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, run_gpt_fn, intention_type):
pass
def ai_list_friends(self, prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, run_gpt_fn, intention_type):
pass
def ai_add_multi_friends(self, prompt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, run_gpt_fn, intention_type):
friend, err_msg = structure_output(
txt=prompt,
prompt="根据提示, 解析多个联系人的身份信息\n\n",
err_msg=f"不能理解该联系人",
run_gpt_fn=run_gpt_fn,
pydantic_cls=FriendList
)
if friend.friends_list:
for f in friend.friends_list:
self.add_friend(f)
msg = f"成功添加{len(friend.friends_list)}个联系人: {str(friend.friends_list)}"
yield from update_ui_lastest_msg(lastmsg=msg, chatbot=chatbot, history=history, delay=0)
def run(self, txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
prompt = txt
run_gpt_fn = lambda inputs, sys_prompt: predict_no_ui_long_connection(inputs=inputs, llm_kwargs=llm_kwargs, history=[], sys_prompt=sys_prompt, observe_window=[])
self.tools_to_select = {
"SocialAdvice":{
"explain_to_llm": "如果用户希望获取社交指导调用SocialAdvice生成一些社交建议",
"callback": self.ai_socail_advice,
},
"AddFriends":{
"explain_to_llm": "如果用户给出了联系人调用AddMultiFriends把联系人添加到数据库",
"callback": self.ai_add_multi_friends,
},
"RemoveFriend":{
"explain_to_llm": "如果用户希望移除某个联系人调用RemoveFriend",
"callback": self.ai_remove_friend,
},
"ListFriends":{
"explain_to_llm": "如果用户列举联系人调用ListFriends",
"callback": self.ai_list_friends,
}
}
try:
Explaination = '\n'.join([f'{k}: {v["explain_to_llm"]}' for k, v in self.tools_to_select.items()])
class UserSociaIntention(BaseModel):
intention_type: str = Field(
description=
f"The type of user intention. You must choose from {self.tools_to_select.keys()}.\n\n"
f"Explaination:\n{Explaination}",
default="SocialAdvice"
)
pydantic_cls_instance, err_msg = select_tool(
prompt=txt,
run_gpt_fn=run_gpt_fn,
pydantic_cls=UserSociaIntention
)
except Exception as e:
yield from update_ui_lastest_msg(
lastmsg=f"无法理解用户意图 {err_msg}",
chatbot=chatbot,
history=history,
delay=0
)
return
intention_type = pydantic_cls_instance.intention_type
intention_callback = self.tools_to_select[pydantic_cls_instance.intention_type]['callback']
yield from intention_callback(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, run_gpt_fn, intention_type)
def add_friend(self, friend):
# check whether the friend is already in the social network
for f in self.social_network.people:
if f.friend_name == friend.friend_name:
f.friend_description = friend.friend_description
f.friend_relationship = friend.friend_relationship
logger.info(f"Repeated friend, update info: {friend}")
return
logger.info(f"Add a new friend: {friend}")
self.social_network.people.append(friend)
return
@CatchException
def I人助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
# 1. we retrieve worker from global context
user_name = chatbot.get_user()
checkpoint_dir=get_log_folder(user_name, plugin_name='experimental_rag')
if user_name in SOCIAL_NETWOK_WORKER_REGISTER:
social_network_worker = SOCIAL_NETWOK_WORKER_REGISTER[user_name]
else:
social_network_worker = SOCIAL_NETWOK_WORKER_REGISTER[user_name] = SocialNetworkWorker(
user_name,
llm_kwargs,
checkpoint_dir=checkpoint_dir,
auto_load_checkpoint=True
)
# 2. save
yield from social_network_worker.run(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
social_network_worker.save_to_checkpoint(checkpoint_dir)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,162 +0,0 @@
import os, copy, time
from toolbox import CatchException, report_exception, update_ui, zip_result, promote_file_to_downloadzone, update_ui_lastest_msg, get_conf, generate_file_link
from shared_utils.fastapi_server import validate_path_safety
from crazy_functions.crazy_utils import input_clipping
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.agent_fns.python_comment_agent import PythonCodeComment
from crazy_functions.diagram_fns.file_tree import FileNode
from crazy_functions.agent_fns.watchdog import WatchDog
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
from loguru import logger
def 注释源代码(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
summary_batch_isolation = True
inputs_array = []
inputs_show_user_array = []
history_array = []
sys_prompt_array = []
assert len(file_manifest) <= 512, "源文件太多超过512个, 请缩减输入文件的数量。或者您也可以选择删除此行警告并修改代码拆分file_manifest列表从而实现分批次处理。"
# 建立文件树
file_tree_struct = FileNode("root", build_manifest=True)
for file_path in file_manifest:
file_tree_struct.add_file(file_path, file_path)
# <第一步,逐个文件分析,多线程>
lang = "" if not plugin_kwargs["use_chinese"] else " (you must use Chinese)"
for index, fp in enumerate(file_manifest):
# 读取文件
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
file_content = f.read()
prefix = ""
i_say = prefix + f'Please conclude the following source code at {os.path.relpath(fp, project_folder)} with only one sentence{lang}, the code is:\n```{file_content}```'
i_say_show_user = prefix + f'[{index+1}/{len(file_manifest)}] 请用一句话对下面的程序文件做一个整体概述: {fp}'
# 装载请求内容
MAX_TOKEN_SINGLE_FILE = 2560
i_say, _ = input_clipping(inputs=i_say, history=[], max_token_limit=MAX_TOKEN_SINGLE_FILE)
inputs_array.append(i_say)
inputs_show_user_array.append(i_say_show_user)
history_array.append([])
sys_prompt_array.append(f"You are a software architecture analyst analyzing a source code project. Do not dig into details, tell me what the code is doing in general. Your answer must be short, simple and clear{lang}.")
# 文件读取完成,对每一个源代码文件,生成一个请求线程,发送到大模型进行分析
gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array = inputs_array,
inputs_show_user_array = inputs_show_user_array,
history_array = history_array,
sys_prompt_array = sys_prompt_array,
llm_kwargs = llm_kwargs,
chatbot = chatbot,
show_user_at_complete = True
)
# <第二步,逐个文件分析,生成带注释文件>
tasks = ["" for _ in range(len(file_manifest))]
def bark_fn(tasks):
for i in range(len(tasks)): tasks[i] = "watchdog is dead"
wd = WatchDog(timeout=10, bark_fn=lambda: bark_fn(tasks), interval=3, msg="ThreadWatcher timeout")
wd.begin_watch()
from concurrent.futures import ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=get_conf('DEFAULT_WORKER_NUM'))
def _task_multi_threading(i_say, gpt_say, fp, file_tree_struct, index):
language = 'Chinese' if plugin_kwargs["use_chinese"] else 'English'
def observe_window_update(x):
if tasks[index] == "watchdog is dead":
raise TimeoutError("ThreadWatcher: watchdog is dead")
tasks[index] = x
pcc = PythonCodeComment(llm_kwargs, plugin_kwargs, language=language, observe_window_update=observe_window_update)
pcc.read_file(path=fp, brief=gpt_say)
revised_path, revised_content = pcc.begin_comment_source_code(None, None)
file_tree_struct.manifest[fp].revised_path = revised_path
file_tree_struct.manifest[fp].revised_content = revised_content
# <将结果写回源文件>
with open(fp, 'w', encoding='utf-8') as f:
f.write(file_tree_struct.manifest[fp].revised_content)
# <生成对比html>
with open("crazy_functions/agent_fns/python_comment_compare.html", 'r', encoding='utf-8') as f:
html_template = f.read()
warp = lambda x: "```python\n\n" + x + "\n\n```"
from themes.theme import load_dynamic_theme
_, advanced_css, _, _ = load_dynamic_theme("Default")
html_template = html_template.replace("ADVANCED_CSS", advanced_css)
html_template = html_template.replace("REPLACE_CODE_FILE_LEFT", pcc.get_markdown_block_in_html(markdown_convertion_for_file(warp(pcc.original_content))))
html_template = html_template.replace("REPLACE_CODE_FILE_RIGHT", pcc.get_markdown_block_in_html(markdown_convertion_for_file(warp(revised_content))))
compare_html_path = fp + '.compare.html'
file_tree_struct.manifest[fp].compare_html = compare_html_path
with open(compare_html_path, 'w', encoding='utf-8') as f:
f.write(html_template)
tasks[index] = ""
chatbot.append([None, f"正在处理:"])
futures = []
index = 0
for i_say, gpt_say, fp in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], file_manifest):
future = executor.submit(_task_multi_threading, i_say, gpt_say, fp, file_tree_struct, index)
index += 1
futures.append(future)
# <第三步,等待任务完成>
cnt = 0
while True:
cnt += 1
wd.feed()
time.sleep(3)
worker_done = [h.done() for h in futures]
remain = len(worker_done) - sum(worker_done)
# <展示已经完成的部分>
preview_html_list = []
for done, fp in zip(worker_done, file_manifest):
if not done: continue
if hasattr(file_tree_struct.manifest[fp], 'compare_html'):
preview_html_list.append(file_tree_struct.manifest[fp].compare_html)
else:
logger.error(f"文件: {fp} 的注释结果未能成功")
file_links = generate_file_link(preview_html_list)
yield from update_ui_lastest_msg(
f"当前任务: <br/>{'<br/>'.join(tasks)}.<br/>" +
f"剩余源文件数量: {remain}.<br/>" +
f"已完成的文件: {sum(worker_done)}.<br/>" +
file_links +
"<br/>" +
''.join(['.']*(cnt % 10 + 1)
), chatbot=chatbot, history=history, delay=0)
yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
if all(worker_done):
executor.shutdown()
break
# <第四步,压缩结果>
zip_res = zip_result(project_folder)
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
# <END>
chatbot.append((None, "所有源文件均已处理完毕。"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
@CatchException
def 注释Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
history = [] # 清空历史,以免输入溢出
plugin_kwargs["use_chinese"] = plugin_kwargs.get("use_chinese", False)
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.py', recursive=True)]
if len(file_manifest) == 0:
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何python文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
yield from 注释源代码(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)

View File

@@ -1,36 +0,0 @@
from toolbox import get_conf, update_ui
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
from crazy_functions.SourceCode_Comment import 注释Python项目
class SourceCodeComment_Wrap(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
"""
gui_definition = {
"main_input":
ArgProperty(title="路径", description="程序路径(上传文件后自动填写)", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"use_chinese":
ArgProperty(title="注释语言", options=["英文", "中文"], default_value="英文", description="", type="dropdown").model_dump_json(),
# "use_emoji":
# ArgProperty(title="在注释中使用emoji", options=["禁止", "允许"], default_value="禁止", description="无", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
if plugin_kwargs["use_chinese"] == "中文":
plugin_kwargs["use_chinese"] = True
else:
plugin_kwargs["use_chinese"] = False
yield from 注释Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)

View File

@@ -1,5 +1,4 @@
from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom
from loguru import logger
class EchoDemo(PluginMultiprocessManager):
def subprocess_worker(self, child_conn):
@@ -17,4 +16,4 @@ class EchoDemo(PluginMultiprocessManager):
elif msg.cmd == "terminate":
self.child_conn.send(PipeCom("done", ""))
break
logger.info('[debug] subprocess_worker terminated')
print('[debug] subprocess_worker terminated')

View File

@@ -1,6 +1,5 @@
from toolbox import get_log_folder, update_ui, gen_time_str, get_conf, promote_file_to_downloadzone
from crazy_functions.agent_fns.watchdog import WatchDog
from loguru import logger
import time, os
class PipeCom:
@@ -48,7 +47,7 @@ class PluginMultiprocessManager:
def terminate(self):
self.p.terminate()
self.alive = False
logger.info("[debug] instance terminated")
print("[debug] instance terminated")
def subprocess_worker(self, child_conn):
# ⭐⭐ run in subprocess

View File

@@ -1,457 +0,0 @@
import datetime
import re
import os
from loguru import logger
from textwrap import dedent
from toolbox import CatchException, update_ui
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
# TODO: 解决缩进问题
find_function_end_prompt = '''
Below is a page of code that you need to read. This page may not yet complete, you job is to split this page to sperate functions, class functions etc.
- Provide the line number where the first visible function ends.
- Provide the line number where the next visible function begins.
- If there are no other functions in this page, you should simply return the line number of the last line.
- Only focus on functions declared by `def` keyword. Ignore inline functions. Ignore function calls.
------------------ Example ------------------
INPUT:
```
L0000 |import sys
L0001 |import re
L0002 |
L0003 |def trimmed_format_exc():
L0004 | import os
L0005 | import traceback
L0006 | str = traceback.format_exc()
L0007 | current_path = os.getcwd()
L0008 | replace_path = "."
L0009 | return str.replace(current_path, replace_path)
L0010 |
L0011 |
L0012 |def trimmed_format_exc_markdown():
L0013 | ...
L0014 | ...
```
OUTPUT:
```
<first_function_end_at>L0009</first_function_end_at>
<next_function_begin_from>L0012</next_function_begin_from>
```
------------------ End of Example ------------------
------------------ the real INPUT you need to process NOW ------------------
```
{THE_TAGGED_CODE}
```
'''
revise_funtion_prompt = '''
You need to read the following code, and revise the source code ({FILE_BASENAME}) according to following instructions:
1. You should analyze the purpose of the functions (if there are any).
2. You need to add docstring for the provided functions (if there are any).
Be aware:
1. You must NOT modify the indent of code.
2. You are NOT authorized to change or translate non-comment code, and you are NOT authorized to add empty lines either, toggle qu.
3. Use {LANG} to add comments and docstrings. Do NOT translate Chinese that is already in the code.
4. Besides adding a docstring, use the ⭐ symbol to annotate the most core and important line of code within the function, explaining its role.
------------------ Example ------------------
INPUT:
```
L0000 |
L0001 |def zip_result(folder):
L0002 | t = gen_time_str()
L0003 | zip_folder(folder, get_log_folder(), f"result.zip")
L0004 | return os.path.join(get_log_folder(), f"result.zip")
L0005 |
L0006 |
```
OUTPUT:
<instruction_1_purpose>
This function compresses a given folder, and return the path of the resulting `zip` file.
</instruction_1_purpose>
<instruction_2_revised_code>
```
def zip_result(folder):
"""
Compresses the specified folder into a zip file and stores it in the log folder.
Args:
folder (str): The path to the folder that needs to be compressed.
Returns:
str: The path to the created zip file in the log folder.
"""
t = gen_time_str()
zip_folder(folder, get_log_folder(), f"result.zip") # ⭐ Execute the zipping of folder
return os.path.join(get_log_folder(), f"result.zip")
```
</instruction_2_revised_code>
------------------ End of Example ------------------
------------------ the real INPUT you need to process NOW ({FILE_BASENAME}) ------------------
```
{THE_CODE}
```
{INDENT_REMINDER}
{BRIEF_REMINDER}
{HINT_REMINDER}
'''
revise_funtion_prompt_chinese = '''
您需要阅读以下代码,并根据以下说明修订源代码({FILE_BASENAME}):
1. 如果源代码中包含函数的话, 你应该分析给定函数实现了什么功能
2. 如果源代码中包含函数的话, 你需要为函数添加docstring, docstring必须使用中文
请注意:
1. 你不得修改代码的缩进
2. 你无权更改或翻译代码中的非注释部分,也不允许添加空行
3. 使用 {LANG} 添加注释和文档字符串。不要翻译代码中已有的中文
4. 除了添加docstring之外, 使用⭐符号给该函数中最核心、最重要的一行代码添加注释,并说明其作用
------------------ 示例 ------------------
INPUT:
```
L0000 |
L0001 |def zip_result(folder):
L0002 | t = gen_time_str()
L0003 | zip_folder(folder, get_log_folder(), f"result.zip")
L0004 | return os.path.join(get_log_folder(), f"result.zip")
L0005 |
L0006 |
```
OUTPUT:
<instruction_1_purpose>
该函数用于压缩指定文件夹,并返回生成的`zip`文件的路径。
</instruction_1_purpose>
<instruction_2_revised_code>
```
def zip_result(folder):
"""
该函数将指定的文件夹压缩成ZIP文件, 并将其存储在日志文件夹中。
输入参数:
folder (str): 需要压缩的文件夹的路径。
返回值:
str: 日志文件夹中创建的ZIP文件的路径。
"""
t = gen_time_str()
zip_folder(folder, get_log_folder(), f"result.zip") # ⭐ 执行文件夹的压缩
return os.path.join(get_log_folder(), f"result.zip")
```
</instruction_2_revised_code>
------------------ End of Example ------------------
------------------ the real INPUT you need to process NOW ({FILE_BASENAME}) ------------------
```
{THE_CODE}
```
{INDENT_REMINDER}
{BRIEF_REMINDER}
{HINT_REMINDER}
'''
class PythonCodeComment():
def __init__(self, llm_kwargs, plugin_kwargs, language, observe_window_update) -> None:
self.original_content = ""
self.full_context = []
self.full_context_with_line_no = []
self.current_page_start = 0
self.page_limit = 100 # 100 lines of code each page
self.ignore_limit = 20
self.llm_kwargs = llm_kwargs
self.plugin_kwargs = plugin_kwargs
self.language = language
self.observe_window_update = observe_window_update
if self.language == "chinese":
self.core_prompt = revise_funtion_prompt_chinese
else:
self.core_prompt = revise_funtion_prompt
self.path = None
self.file_basename = None
self.file_brief = ""
def generate_tagged_code_from_full_context(self):
for i, code in enumerate(self.full_context):
number = i
padded_number = f"{number:04}"
result = f"L{padded_number}"
self.full_context_with_line_no.append(f"{result} | {code}")
return self.full_context_with_line_no
def read_file(self, path, brief):
with open(path, 'r', encoding='utf8') as f:
self.full_context = f.readlines()
self.original_content = ''.join(self.full_context)
self.file_basename = os.path.basename(path)
self.file_brief = brief
self.full_context_with_line_no = self.generate_tagged_code_from_full_context()
self.path = path
def find_next_function_begin(self, tagged_code:list, begin_and_end):
begin, end = begin_and_end
THE_TAGGED_CODE = ''.join(tagged_code)
self.llm_kwargs['temperature'] = 0
result = predict_no_ui_long_connection(
inputs=find_function_end_prompt.format(THE_TAGGED_CODE=THE_TAGGED_CODE),
llm_kwargs=self.llm_kwargs,
history=[],
sys_prompt="",
observe_window=[],
console_slience=True
)
def extract_number(text):
# 使用正则表达式匹配模式
match = re.search(r'<next_function_begin_from>L(\d+)</next_function_begin_from>', text)
if match:
# 提取匹配的数字部分并转换为整数
return int(match.group(1))
return None
line_no = extract_number(result)
if line_no is not None:
return line_no
else:
return end
def _get_next_window(self):
#
current_page_start = self.current_page_start
if self.current_page_start == len(self.full_context) + 1:
raise StopIteration
# 如果剩余的行数非常少,一鼓作气处理掉
if len(self.full_context) - self.current_page_start < self.ignore_limit:
future_page_start = len(self.full_context) + 1
self.current_page_start = future_page_start
return current_page_start, future_page_start
tagged_code = self.full_context_with_line_no[ self.current_page_start: self.current_page_start + self.page_limit]
line_no = self.find_next_function_begin(tagged_code, [self.current_page_start, self.current_page_start + self.page_limit])
if line_no > len(self.full_context) - 5:
line_no = len(self.full_context) + 1
future_page_start = line_no
self.current_page_start = future_page_start
# ! consider eof
return current_page_start, future_page_start
def dedent(self, text):
"""Remove any common leading whitespace from every line in `text`.
"""
# Look for the longest leading string of spaces and tabs common to
# all lines.
margin = None
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
text = _whitespace_only_re.sub('', text)
indents = _leading_whitespace_re.findall(text)
for indent in indents:
if margin is None:
margin = indent
# Current line more deeply indented than previous winner:
# no change (previous winner is still on top).
elif indent.startswith(margin):
pass
# Current line consistent with and no deeper than previous winner:
# it's the new winner.
elif margin.startswith(indent):
margin = indent
# Find the largest common whitespace between current line and previous
# winner.
else:
for i, (x, y) in enumerate(zip(margin, indent)):
if x != y:
margin = margin[:i]
break
# sanity check (testing/debugging only)
if 0 and margin:
for line in text.split("\n"):
assert not line or line.startswith(margin), \
"line = %r, margin = %r" % (line, margin)
if margin:
text = re.sub(r'(?m)^' + margin, '', text)
return text, len(margin)
else:
return text, 0
def get_next_batch(self):
current_page_start, future_page_start = self._get_next_window()
return ''.join(self.full_context[current_page_start: future_page_start]), current_page_start, future_page_start
def tag_code(self, fn, hint):
code = fn
_, n_indent = self.dedent(code)
indent_reminder = "" if n_indent == 0 else "(Reminder: as you can see, this piece of code has indent made up with {n_indent} whitespace, please preseve them in the OUTPUT.)"
brief_reminder = "" if self.file_brief == "" else f"({self.file_basename} abstract: {self.file_brief})"
hint_reminder = "" if hint is None else f"(Reminder: do not ignore or modify code such as `{hint}`, provide complete code in the OUTPUT.)"
self.llm_kwargs['temperature'] = 0
result = predict_no_ui_long_connection(
inputs=self.core_prompt.format(
LANG=self.language,
FILE_BASENAME=self.file_basename,
THE_CODE=code,
INDENT_REMINDER=indent_reminder,
BRIEF_REMINDER=brief_reminder,
HINT_REMINDER=hint_reminder
),
llm_kwargs=self.llm_kwargs,
history=[],
sys_prompt="",
observe_window=[],
console_slience=True
)
def get_code_block(reply):
import re
pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks
matches = re.findall(pattern, reply) # find all code blocks in text
if len(matches) == 1:
return matches[0].strip('python') # code block
return None
code_block = get_code_block(result)
if code_block is not None:
code_block = self.sync_and_patch(original=code, revised=code_block)
return code_block
else:
return code
def get_markdown_block_in_html(self, html):
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
found_list = soup.find_all("div", class_="markdown-body")
if found_list:
res = found_list[0]
return res.prettify()
else:
return None
def sync_and_patch(self, original, revised):
"""Ensure the number of pre-string empty lines in revised matches those in original."""
def count_leading_empty_lines(s, reverse=False):
"""Count the number of leading empty lines in a string."""
lines = s.split('\n')
if reverse: lines = list(reversed(lines))
count = 0
for line in lines:
if line.strip() == '':
count += 1
else:
break
return count
original_empty_lines = count_leading_empty_lines(original)
revised_empty_lines = count_leading_empty_lines(revised)
if original_empty_lines > revised_empty_lines:
additional_lines = '\n' * (original_empty_lines - revised_empty_lines)
revised = additional_lines + revised
elif original_empty_lines < revised_empty_lines:
lines = revised.split('\n')
revised = '\n'.join(lines[revised_empty_lines - original_empty_lines:])
original_empty_lines = count_leading_empty_lines(original, reverse=True)
revised_empty_lines = count_leading_empty_lines(revised, reverse=True)
if original_empty_lines > revised_empty_lines:
additional_lines = '\n' * (original_empty_lines - revised_empty_lines)
revised = revised + additional_lines
elif original_empty_lines < revised_empty_lines:
lines = revised.split('\n')
revised = '\n'.join(lines[:-(revised_empty_lines - original_empty_lines)])
return revised
def begin_comment_source_code(self, chatbot=None, history=None):
# from toolbox import update_ui_lastest_msg
assert self.path is not None
assert '.py' in self.path # must be python source code
# write_target = self.path + '.revised.py'
write_content = ""
# with open(self.path + '.revised.py', 'w+', encoding='utf8') as f:
while True:
try:
# yield from update_ui_lastest_msg(f"({self.file_basename}) 正在读取下一段代码片段:\n", chatbot=chatbot, history=history, delay=0)
next_batch, line_no_start, line_no_end = self.get_next_batch()
self.observe_window_update(f"正在处理{self.file_basename} - {line_no_start}/{len(self.full_context)}\n")
# yield from update_ui_lastest_msg(f"({self.file_basename}) 处理代码片段:\n\n{next_batch}", chatbot=chatbot, history=history, delay=0)
hint = None
MAX_ATTEMPT = 2
for attempt in range(MAX_ATTEMPT):
result = self.tag_code(next_batch, hint)
try:
successful, hint = self.verify_successful(next_batch, result)
except Exception as e:
logger.error('ignored exception:\n' + str(e))
break
if successful:
break
if attempt == MAX_ATTEMPT - 1:
# cannot deal with this, give up
result = next_batch
break
# f.write(result)
write_content += result
except StopIteration:
next_batch, line_no_start, line_no_end = [], -1, -1
return None, write_content
def verify_successful(self, original, revised):
""" Determine whether the revised code contains every line that already exists
"""
from crazy_functions.ast_fns.comment_remove import remove_python_comments
original = remove_python_comments(original)
original_lines = original.split('\n')
revised_lines = revised.split('\n')
for l in original_lines:
l = l.strip()
if '\'' in l or '\"' in l: continue # ast sometimes toggle " to '
found = False
for lt in revised_lines:
if l in lt:
found = True
break
if not found:
return False, l
return True, None

View File

@@ -1,45 +0,0 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<style>ADVANCED_CSS</style>
<meta charset="UTF-8">
<title>源文件对比</title>
<style>
body {
font-family: Arial, sans-serif;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
margin: 0;
}
.container {
display: flex;
width: 95%;
height: -webkit-fill-available;
}
.code-container {
flex: 1;
margin: 0px;
padding: 0px;
border: 1px solid #ccc;
background-color: #f9f9f9;
overflow: auto;
}
pre {
white-space: pre-wrap;
word-wrap: break-word;
}
</style>
</head>
<body>
<div class="container">
<div class="code-container">
REPLACE_CODE_FILE_LEFT
</div>
<div class="code-container">
REPLACE_CODE_FILE_RIGHT
</div>
</div>
</body>
</html>

View File

@@ -1,5 +1,4 @@
import threading, time
from loguru import logger
class WatchDog():
def __init__(self, timeout, bark_fn, interval=3, msg="") -> None:
@@ -14,7 +13,7 @@ class WatchDog():
while True:
if self.kill_dog: break
if time.time() - self.last_feed > self.timeout:
if len(self.msg) > 0: logger.info(self.msg)
if len(self.msg) > 0: print(self.msg)
self.bark_fn()
break
time.sleep(self.interval)

View File

@@ -1,54 +0,0 @@
import token
import tokenize
import copy
import io
def remove_python_comments(input_source: str) -> str:
source_flag = copy.copy(input_source)
source = io.StringIO(input_source)
ls = input_source.split('\n')
prev_toktype = token.INDENT
readline = source.readline
def get_char_index(lineno, col):
# find the index of the char in the source code
if lineno == 1:
return len('\n'.join(ls[:(lineno-1)])) + col
else:
return len('\n'.join(ls[:(lineno-1)])) + col + 1
def replace_char_between(start_lineno, start_col, end_lineno, end_col, source, replace_char, ls):
# replace char between start_lineno, start_col and end_lineno, end_col with replace_char, but keep '\n' and ' '
b = get_char_index(start_lineno, start_col)
e = get_char_index(end_lineno, end_col)
for i in range(b, e):
if source[i] == '\n':
source = source[:i] + '\n' + source[i+1:]
elif source[i] == ' ':
source = source[:i] + ' ' + source[i+1:]
else:
source = source[:i] + replace_char + source[i+1:]
return source
tokgen = tokenize.generate_tokens(readline)
for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
if toktype == token.STRING and (prev_toktype == token.INDENT):
source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
elif toktype == token.STRING and (prev_toktype == token.NEWLINE):
source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
elif toktype == tokenize.COMMENT:
source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
prev_toktype = toktype
return source_flag
# 示例使用
if __name__ == "__main__":
with open("source.py", "r", encoding="utf-8") as f:
source_code = f.read()
cleaned_code = remove_python_comments(source_code)
with open("cleaned_source.py", "w", encoding="utf-8") as f:
f.write(cleaned_code)

View File

@@ -1,5 +1,5 @@
from toolbox import CatchException, update_ui, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
import datetime, json
def fetch_items(list_of_items, batch_size):

View File

@@ -1,39 +1,25 @@
import os
import threading
from loguru import logger
from shared_utils.char_visual_effect import scolling_visual_effect
from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
import threading
import os
import logging
def input_clipping(inputs, history, max_token_limit, return_clip_flags=False):
"""
当输入文本 + 历史文本超出最大限制时,采取措施丢弃一部分文本。
输入:
- inputs 本次请求
- history 历史上下文
- max_token_limit 最大token限制
输出:
- inputs 本次请求经过clip
- history 历史上下文经过clip
"""
def input_clipping(inputs, history, max_token_limit):
import numpy as np
from request_llms.bridge_all import model_info
enc = model_info["gpt-3.5-turbo"]['tokenizer']
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
mode = 'input-and-history'
# 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
input_token_num = get_token_num(inputs)
original_input_len = len(inputs)
if input_token_num < max_token_limit//2:
mode = 'only-history'
max_token_limit = max_token_limit - input_token_num
everything = [inputs] if mode == 'input-and-history' else ['']
everything.extend(history)
full_token_num = n_token = get_token_num('\n'.join(everything))
n_token = get_token_num('\n'.join(everything))
everything_token = [get_token_num(e) for e in everything]
everything_token_num = sum(everything_token)
delta = max(everything_token) // 16 # 截断时的颗粒度
while n_token > max_token_limit:
@@ -46,24 +32,10 @@ def input_clipping(inputs, history, max_token_limit, return_clip_flags=False):
if mode == 'input-and-history':
inputs = everything[0]
full_token_num = everything_token_num
else:
full_token_num = everything_token_num + input_token_num
pass
history = everything[1:]
flags = {
"mode": mode,
"original_input_token_num": input_token_num,
"original_full_token_num": full_token_num,
"original_input_len": original_input_len,
"clipped_input_len": len(inputs),
}
if not return_clip_flags:
return inputs, history
else:
return inputs, history, flags
def request_gpt_model_in_new_thread_with_ui_alive(
inputs, inputs_show_user, llm_kwargs,
@@ -133,7 +105,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
except:
# 【第三种情况】:其他错误:重试几次
tb_str = '```\n' + trimmed_format_exc() + '```'
logger.error(tb_str)
print(tb_str)
mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n"
if retry_op > 0:
retry_op -= 1
@@ -163,11 +135,7 @@ def request_gpt_model_in_new_thread_with_ui_alive(
yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息
return final_result
def can_multi_process(llm) -> bool:
from request_llms.bridge_all import model_info
def default_condition(llm) -> bool:
# legacy condition
def can_multi_process(llm):
if llm.startswith('gpt-'): return True
if llm.startswith('api2d-'): return True
if llm.startswith('azure-'): return True
@@ -175,18 +143,10 @@ def can_multi_process(llm) -> bool:
if llm.startswith('zhipuai') or llm.startswith('glm-'): return True
return False
if llm in model_info:
if 'can_multi_thread' in model_info[llm]:
return model_info[llm]['can_multi_thread']
else:
return default_condition(llm)
else:
return default_condition(llm)
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array, inputs_show_user_array, llm_kwargs,
chatbot, history_array, sys_prompt_array,
refresh_interval=0.2, max_workers=-1, scroller_max_len=75,
refresh_interval=0.2, max_workers=-1, scroller_max_len=30,
handle_token_exceed=True, show_user_at_complete=False,
retry_times_at_unknown_error=2,
):
@@ -283,7 +243,7 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
# 【第三种情况】:其他错误
if detect_timeout(): raise RuntimeError("检测到程序终止。")
tb_str = '```\n' + trimmed_format_exc() + '```'
logger.error(tb_str)
print(tb_str)
gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback\n\n{tb_str}\n\n"
if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
if retry_op > 0:
@@ -311,8 +271,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip(
range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
cnt = 0
while True:
# yield一次以刷新前端页面
time.sleep(refresh_interval)
@@ -325,7 +283,8 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
mutable[thread_index][1] = time.time()
# 在前端打印些好玩的东西
for thread_index, _ in enumerate(worker_done):
print_something_really_funny = f"[ ...`{scolling_visual_effect(mutable[thread_index][0], scroller_max_len)}`... ]"
print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\
replace('\n', '').replace('`', '.').replace(' ', '.').replace('<br/>', '.....').replace('$', '.')+"`... ]"
observe_win.append(print_something_really_funny)
# 在前端打印些好玩的东西
stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n'
@@ -378,7 +337,7 @@ def read_and_clean_pdf_text(fp):
import fitz, copy
import re
import numpy as np
# from shared_utils.colorful import print亮黄, print亮绿
from colorful import print亮黄, print亮绿
fc = 0 # Index 0 文本
fs = 1 # Index 1 字体
fb = 2 # Index 2 框框
@@ -595,15 +554,15 @@ class nougat_interface():
def nougat_with_timeout(self, command, cwd, timeout=3600):
import subprocess
from toolbox import ProxyNetworkActivate
logger.info(f'正在执行命令 {command}')
logging.info(f'正在执行命令 {command}')
with ProxyNetworkActivate("Nougat_Download"):
process = subprocess.Popen(command, shell=False, cwd=cwd, env=os.environ)
process = subprocess.Popen(command, shell=True, cwd=cwd, env=os.environ)
try:
stdout, stderr = process.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
stdout, stderr = process.communicate()
logger.error("Process timed out!")
print("Process timed out!")
return False
return True
@@ -621,8 +580,7 @@ class nougat_interface():
yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度正在加载NOUGAT... 提示首次运行需要花费较长时间下载NOUGAT参数",
chatbot=chatbot, history=history, delay=0)
command = ['nougat', '--out', os.path.abspath(dst), os.path.abspath(fp)]
self.nougat_with_timeout(command, cwd=os.getcwd(), timeout=3600)
self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
res = glob.glob(os.path.join(dst,'*.mmd'))
if len(res) == 0:
self.threadLock.release()

View File

@@ -1,9 +1,8 @@
import os
from textwrap import indent
from loguru import logger
class FileNode:
def __init__(self, name, build_manifest=False):
def __init__(self, name):
self.name = name
self.children = []
self.is_leaf = False
@@ -11,8 +10,6 @@ class FileNode:
self.parenting_ship = []
self.comment = ""
self.comment_maxlen_show = 50
self.build_manifest = build_manifest
self.manifest = {}
@staticmethod
def add_linebreaks_at_spaces(string, interval=10):
@@ -32,7 +29,6 @@ class FileNode:
level = 1
if directory_names == "":
new_node = FileNode(file_name)
self.manifest[file_path] = new_node
current_node.children.append(new_node)
new_node.is_leaf = True
new_node.comment = self.sanitize_comment(file_comment)
@@ -54,14 +50,13 @@ class FileNode:
new_node.level = level - 1
current_node = new_node
term = FileNode(file_name)
self.manifest[file_path] = term
term.level = level
term.comment = self.sanitize_comment(file_comment)
term.is_leaf = True
current_node.children.append(term)
def print_files_recursively(self, level=0, code="R0"):
logger.info(' '*level + self.name + ' ' + str(self.is_leaf) + ' ' + str(self.level))
print(' '*level + self.name + ' ' + str(self.is_leaf) + ' ' + str(self.level))
for j, child in enumerate(self.children):
child.print_files_recursively(level=level+1, code=code+str(j))
self.parenting_ship.extend(child.parenting_ship)
@@ -124,4 +119,4 @@ if __name__ == "__main__":
"用于加载和分割文件中的文本的通用文件加载器用于加载和分割文件中的文本的通用文件加载器用于加载和分割文件中的文本的通用文件加载器",
"包含了用于构建和管理向量数据库的函数和类包含了用于构建和管理向量数据库的函数和类包含了用于构建和管理向量数据库的函数和类",
]
logger.info(build_file_tree_mermaid_diagram(file_manifest, file_comments, "项目文件树"))
print(build_file_tree_mermaid_diagram(file_manifest, file_comments, "项目文件树"))

View File

@@ -1,450 +0,0 @@
import os
import time
from abc import ABC, abstractmethod
from datetime import datetime
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
from docx.oxml.ns import qn
from docx.shared import Inches, Cm
from docx.shared import Pt, RGBColor, Inches
from typing import Dict, List, Tuple
class DocumentFormatter(ABC):
"""文档格式化基类,定义文档格式化的基本接口"""
def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]):
self.final_summary = final_summary
self.file_summaries_map = file_summaries_map
self.failed_files = failed_files
@abstractmethod
def format_failed_files(self) -> str:
"""格式化失败文件列表"""
pass
@abstractmethod
def format_file_summaries(self) -> str:
"""格式化文件总结内容"""
pass
@abstractmethod
def create_document(self) -> str:
"""创建完整文档"""
pass
class WordFormatter(DocumentFormatter):
"""Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.doc = Document()
self._setup_document()
self._create_styles()
# 初始化三级标题编号系统
self.numbers = {
1: 0, # 一级标题编号
2: 0, # 二级标题编号
3: 0 # 三级标题编号
}
def _setup_document(self):
"""设置文档基本格式,包括页面设置和页眉"""
sections = self.doc.sections
for section in sections:
# 设置页面大小为A4
section.page_width = Cm(21)
section.page_height = Cm(29.7)
# 设置页边距
section.top_margin = Cm(3.7) # 上边距37mm
section.bottom_margin = Cm(3.5) # 下边距35mm
section.left_margin = Cm(2.8) # 左边距28mm
section.right_margin = Cm(2.6) # 右边距26mm
# 设置页眉页脚距离
section.header_distance = Cm(2.0)
section.footer_distance = Cm(2.0)
# 添加页眉
header = section.header
header_para = header.paragraphs[0]
header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
header_run = header_para.add_run("该文档由GPT-academic生成")
header_run.font.name = '仿宋'
header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
header_run.font.size = Pt(9)
def _create_styles(self):
"""创建文档样式"""
# 创建正文样式
style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
style.font.name = '仿宋'
style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
style.font.size = Pt(14)
style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
style.paragraph_format.space_after = Pt(0)
style.paragraph_format.first_line_indent = Pt(28)
# 创建各级标题样式
self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER)
self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT)
self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT)
self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT)
def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment):
"""创建标题样式"""
style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)
style.font.name = font_name
style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
style.font.size = Pt(font_size)
style.font.bold = True
style.paragraph_format.alignment = alignment
style.paragraph_format.space_before = Pt(12)
style.paragraph_format.space_after = Pt(12)
style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
return style
def _get_heading_number(self, level: int) -> str:
"""
生成标题编号
Args:
level: 标题级别 (0-3)
Returns:
str: 格式化的标题编号
"""
if level == 0: # 主标题不需要编号
return ""
self.numbers[level] += 1 # 增加当前级别的编号
# 重置下级标题编号
for i in range(level + 1, 4):
self.numbers[i] = 0
# 根据级别返回不同格式的编号
if level == 1:
return f"{self.numbers[1]}. "
elif level == 2:
return f"{self.numbers[1]}.{self.numbers[2]} "
elif level == 3:
return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} "
return ""
def _add_heading(self, text: str, level: int):
"""
添加带编号的标题
Args:
text: 标题文本
level: 标题级别 (0-3)
"""
style_map = {
0: 'Title_Custom',
1: 'Heading1_Custom',
2: 'Heading2_Custom',
3: 'Heading3_Custom'
}
number = self._get_heading_number(level)
paragraph = self.doc.add_paragraph(style=style_map[level])
if number:
number_run = paragraph.add_run(number)
font_size = 22 if level == 1 else (18 if level == 2 else 16)
self._get_run_style(number_run, '黑体', font_size, True)
text_run = paragraph.add_run(text)
font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16))
self._get_run_style(text_run, '黑体', font_size, True)
# 主标题添加日期
if level == 0:
date_paragraph = self.doc.add_paragraph()
date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d'))
self._get_run_style(date_run, '仿宋', 16, False)
return paragraph
def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False):
"""设置文本运行对象的样式"""
run.font.name = font_name
run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
run.font.size = Pt(font_size)
run.font.bold = bold
def format_failed_files(self) -> str:
"""格式化失败文件列表"""
result = []
if not self.failed_files:
return "\n".join(result)
result.append("处理失败文件:")
for fp, reason in self.failed_files:
result.append(f"{os.path.basename(fp)}: {reason}")
self._add_heading("处理失败文件", 1)
for fp, reason in self.failed_files:
self._add_content(f"{os.path.basename(fp)}: {reason}", indent=False)
self.doc.add_paragraph()
return "\n".join(result)
def _add_content(self, text: str, indent: bool = True):
"""添加正文内容"""
paragraph = self.doc.add_paragraph(text, style='Normal_Custom')
if not indent:
paragraph.paragraph_format.first_line_indent = Pt(0)
return paragraph
def format_file_summaries(self) -> str:
"""
格式化文件总结内容,确保正确的标题层级
返回:
str: 格式化后的文件总结字符串
标题层级规则:
1. 一级标题为"各文件详细总结"
2. 如果文件有目录路径:
- 目录路径作为二级标题 (2.1, 2.2 等)
- 该目录下所有文件作为三级标题 (2.1.1, 2.1.2 等)
3. 如果文件没有目录路径:
- 文件直接作为二级标题 (2.1, 2.2 等)
"""
result = []
# 首先对文件路径进行分组整理
file_groups = {}
for path in sorted(self.file_summaries_map.keys()):
dir_path = os.path.dirname(path)
if dir_path not in file_groups:
file_groups[dir_path] = []
file_groups[dir_path].append(path)
# 处理没有目录的文件
root_files = file_groups.get("", [])
if root_files:
for path in sorted(root_files):
file_name = os.path.basename(path)
result.append(f"\n📄 {file_name}")
result.append(self.file_summaries_map[path])
# 无目录的文件作为二级标题
self._add_heading(f"📄 {file_name}", 2)
self._add_content(self.file_summaries_map[path])
self.doc.add_paragraph()
# 处理有目录的文件
for dir_path in sorted(file_groups.keys()):
if dir_path == "": # 跳过已处理的根目录文件
continue
# 添加目录作为二级标题
result.append(f"\n📁 {dir_path}")
self._add_heading(f"📁 {dir_path}", 2)
# 该目录下的所有文件作为三级标题
for path in sorted(file_groups[dir_path]):
file_name = os.path.basename(path)
result.append(f"\n📄 {file_name}")
result.append(self.file_summaries_map[path])
# 添加文件名作为三级标题
self._add_heading(f"📄 {file_name}", 3)
self._add_content(self.file_summaries_map[path])
self.doc.add_paragraph()
return "\n".join(result)
def create_document(self):
"""创建完整Word文档并返回文档对象"""
# 重置所有编号
for level in self.numbers:
self.numbers[level] = 0
# 添加主标题
self._add_heading("文档总结报告", 0)
self.doc.add_paragraph()
# 添加总体摘要
self._add_heading("总体摘要", 1)
self._add_content(self.final_summary)
self.doc.add_paragraph()
# 添加失败文件列表(如果有)
if self.failed_files:
self.format_failed_files()
# 添加文件详细总结
self._add_heading("各文件详细总结", 1)
self.format_file_summaries()
return self.doc
class MarkdownFormatter(DocumentFormatter):
"""Markdown格式文档生成器"""
def format_failed_files(self) -> str:
if not self.failed_files:
return ""
formatted_text = ["\n## ⚠️ 处理失败的文件"]
for fp, reason in self.failed_files:
formatted_text.append(f"- {os.path.basename(fp)}: {reason}")
formatted_text.append("\n---")
return "\n".join(formatted_text)
def format_file_summaries(self) -> str:
formatted_text = []
sorted_paths = sorted(self.file_summaries_map.keys())
current_dir = ""
for path in sorted_paths:
dir_path = os.path.dirname(path)
if dir_path != current_dir:
if dir_path:
formatted_text.append(f"\n## 📁 {dir_path}")
current_dir = dir_path
file_name = os.path.basename(path)
formatted_text.append(f"\n### 📄 {file_name}")
formatted_text.append(self.file_summaries_map[path])
formatted_text.append("\n---")
return "\n".join(formatted_text)
def create_document(self) -> str:
document = [
"# 📑 文档总结报告",
"\n## 总体摘要",
self.final_summary
]
if self.failed_files:
document.append(self.format_failed_files())
document.extend([
"\n# 📚 各文件详细总结",
self.format_file_summaries()
])
return "\n".join(document)
class HtmlFormatter(DocumentFormatter):
"""HTML格式文档生成器"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.css_styles = """
body {
font-family: "Microsoft YaHei", Arial, sans-serif;
line-height: 1.6;
max-width: 1000px;
margin: 0 auto;
padding: 20px;
color: #333;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #eee;
padding-bottom: 10px;
font-size: 24px;
text-align: center;
}
h2 {
color: #34495e;
margin-top: 30px;
font-size: 20px;
border-left: 4px solid #3498db;
padding-left: 10px;
}
h3 {
color: #2c3e50;
font-size: 18px;
margin-top: 20px;
}
.summary {
background-color: #f8f9fa;
padding: 20px;
border-radius: 5px;
margin: 20px 0;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.details {
margin-top: 40px;
}
.failed-files {
background-color: #fff3f3;
padding: 15px;
border-left: 4px solid #e74c3c;
margin: 20px 0;
}
.file-summary {
background-color: #fff;
padding: 15px;
margin: 15px 0;
border-radius: 4px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
"""
def format_failed_files(self) -> str:
if not self.failed_files:
return ""
failed_files_html = ['<div class="failed-files">']
failed_files_html.append("<h2>⚠️ 处理失败的文件</h2>")
failed_files_html.append("<ul>")
for fp, reason in self.failed_files:
failed_files_html.append(f"<li><strong>{os.path.basename(fp)}:</strong> {reason}</li>")
failed_files_html.append("</ul></div>")
return "\n".join(failed_files_html)
def format_file_summaries(self) -> str:
formatted_html = []
sorted_paths = sorted(self.file_summaries_map.keys())
current_dir = ""
for path in sorted_paths:
dir_path = os.path.dirname(path)
if dir_path != current_dir:
if dir_path:
formatted_html.append(f'<h2>📁 {dir_path}</h2>')
current_dir = dir_path
file_name = os.path.basename(path)
formatted_html.append('<div class="file-summary">')
formatted_html.append(f'<h3>📄 {file_name}</h3>')
formatted_html.append(f'<p>{self.file_summaries_map[path]}</p>')
formatted_html.append('</div>')
return "\n".join(formatted_html)
def create_document(self) -> str:
return f"""
<!DOCTYPE html>
<html>
<head>
<meta charset='utf-8'>
<title>文档总结报告</title>
<style>{self.css_styles}</style>
</head>
<body>
<h1>📑 文档总结报告</h1>
<h2>总体摘要</h2>
<div class="summary">{self.final_summary}</div>
{self.format_failed_files()}
<div class="details">
<h2>📚 各文件详细总结</h2>
{self.format_file_summaries()}
</div>
</body>
</html>
"""

View File

@@ -1,387 +0,0 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, Type, TypeVar, Generic, Union
from dataclasses import dataclass
from enum import Enum, auto
import logging
from datetime import datetime
from crazy_functions.rag_fns.arxiv_fns.section_fragment import SectionFragment
# 设置日志
logger = logging.getLogger(__name__)
# 自定义异常类定义
class FoldingError(Exception):
"""折叠相关的自定义异常基类"""
pass
class FormattingError(FoldingError):
"""格式化过程中的错误"""
pass
class MetadataError(FoldingError):
"""元数据相关的错误"""
pass
class ValidationError(FoldingError):
"""验证错误"""
pass
class FoldingStyle(Enum):
"""折叠样式枚举"""
SIMPLE = auto() # 简单折叠
DETAILED = auto() # 详细折叠(带有额外信息)
NESTED = auto() # 嵌套折叠
@dataclass
class FoldingOptions:
"""折叠选项配置"""
style: FoldingStyle = FoldingStyle.DETAILED
code_language: Optional[str] = None # 代码块的语言
show_timestamp: bool = False # 是否显示时间戳
indent_level: int = 0 # 缩进级别
custom_css: Optional[str] = None # 自定义CSS类
T = TypeVar('T') # 用于泛型类型
class BaseMetadata(ABC):
"""元数据基类"""
@abstractmethod
def validate(self) -> bool:
"""验证元数据的有效性"""
pass
def _validate_non_empty_str(self, value: Optional[str]) -> bool:
"""验证字符串非空"""
return bool(value and value.strip())
@dataclass
class FileMetadata(BaseMetadata):
"""文件元数据"""
rel_path: str
size: float
last_modified: Optional[datetime] = None
mime_type: Optional[str] = None
encoding: str = 'utf-8'
def validate(self) -> bool:
"""验证文件元数据的有效性"""
try:
if not self._validate_non_empty_str(self.rel_path):
return False
if self.size < 0:
return False
return True
except Exception as e:
logger.error(f"File metadata validation error: {str(e)}")
return False
class ContentFormatter(ABC, Generic[T]):
"""内容格式化抽象基类
支持泛型类型参数,可以指定具体的元数据类型。
"""
@abstractmethod
def format(self,
content: str,
metadata: T,
options: Optional[FoldingOptions] = None) -> str:
"""格式化内容
Args:
content: 需要格式化的内容
metadata: 类型化的元数据
options: 折叠选项
Returns:
str: 格式化后的内容
Raises:
FormattingError: 格式化过程中的错误
"""
pass
def _create_summary(self, metadata: T) -> str:
"""创建折叠摘要,可被子类重写"""
return str(metadata)
def _format_content_block(self,
content: str,
options: Optional[FoldingOptions]) -> str:
"""格式化内容块,处理代码块等特殊格式"""
if not options:
return content
if options.code_language:
return f"```{options.code_language}\n{content}\n```"
return content
def _add_indent(self, text: str, level: int) -> str:
"""添加缩进"""
if level <= 0:
return text
indent = " " * level
return "\n".join(indent + line for line in text.splitlines())
class FileContentFormatter(ContentFormatter[FileMetadata]):
"""文件内容格式化器"""
def format(self,
content: str,
metadata: FileMetadata,
options: Optional[FoldingOptions] = None) -> str:
"""格式化文件内容"""
if not metadata.validate():
raise MetadataError("Invalid file metadata")
try:
options = options or FoldingOptions()
# 构建摘要信息
summary_parts = [
f"{metadata.rel_path} ({metadata.size:.2f}MB)",
f"Type: {metadata.mime_type}" if metadata.mime_type else None,
(f"Modified: {metadata.last_modified.strftime('%Y-%m-%d %H:%M:%S')}"
if metadata.last_modified and options.show_timestamp else None)
]
summary = " | ".join(filter(None, summary_parts))
# 构建HTML类
css_class = f' class="{options.custom_css}"' if options.custom_css else ''
# 格式化内容
formatted_content = self._format_content_block(content, options)
# 组装最终结果
result = (
f'<details{css_class}><summary>{summary}</summary>\n\n'
f'{formatted_content}\n\n'
f'</details>\n\n'
)
return self._add_indent(result, options.indent_level)
except Exception as e:
logger.error(f"Error formatting file content: {str(e)}")
raise FormattingError(f"Failed to format file content: {str(e)}")
class ContentFoldingManager:
"""内容折叠管理器"""
def __init__(self):
"""初始化折叠管理器"""
self._formatters: Dict[str, ContentFormatter] = {}
self._register_default_formatters()
def _register_default_formatters(self) -> None:
"""注册默认的格式化器"""
self.register_formatter('file', FileContentFormatter())
def register_formatter(self, name: str, formatter: ContentFormatter) -> None:
"""注册新的格式化器"""
if not isinstance(formatter, ContentFormatter):
raise TypeError("Formatter must implement ContentFormatter interface")
self._formatters[name] = formatter
def _guess_language(self, extension: str) -> Optional[str]:
"""根据文件扩展名猜测编程语言"""
extension = extension.lower().lstrip('.')
language_map = {
'py': 'python',
'js': 'javascript',
'java': 'java',
'cpp': 'cpp',
'cs': 'csharp',
'html': 'html',
'css': 'css',
'md': 'markdown',
'json': 'json',
'xml': 'xml',
'sql': 'sql',
'sh': 'bash',
'yaml': 'yaml',
'yml': 'yaml',
'txt': None # 纯文本不需要语言标识
}
return language_map.get(extension)
def format_content(self,
content: str,
formatter_type: str,
metadata: Union[FileMetadata],
options: Optional[FoldingOptions] = None) -> str:
"""格式化内容"""
formatter = self._formatters.get(formatter_type)
if not formatter:
raise KeyError(f"No formatter registered for type: {formatter_type}")
if not isinstance(metadata, FileMetadata):
raise TypeError("Invalid metadata type")
return formatter.format(content, metadata, options)
@dataclass
class PaperMetadata(BaseMetadata):
"""论文元数据"""
title: str
authors: str
abstract: str
catalogs: str
arxiv_id: str = ""
def validate(self) -> bool:
"""验证论文元数据的有效性"""
try:
if not self._validate_non_empty_str(self.title):
return False
if not self._validate_non_empty_str(self.authors):
return False
if not self._validate_non_empty_str(self.abstract):
return False
if not self._validate_non_empty_str(self.catalogs):
return False
return True
except Exception as e:
logger.error(f"Paper metadata validation error: {str(e)}")
return False
class PaperContentFormatter(ContentFormatter[PaperMetadata]):
"""论文内容格式化器"""
def format(self,
fragments: list[SectionFragment],
metadata: PaperMetadata,
options: Optional[FoldingOptions] = None) -> str:
"""格式化论文内容
Args:
fragments: 论文片段列表
metadata: 论文元数据
options: 折叠选项
Returns:
str: 格式化后的论文内容
"""
if not metadata.validate():
raise MetadataError("Invalid paper metadata")
try:
options = options or FoldingOptions()
# 1. 生成标题部分(不折叠)
result = [f"# {metadata.title}\n"]
# 2. 生成作者信息(折叠)
result.append(self._create_folded_section(
"Authors",
metadata.authors,
options
))
# 3. 生成摘要(折叠)
result.append(self._create_folded_section(
"Abstract",
metadata.abstract,
options
))
# 4. 生成目录树(折叠)
result.append(self._create_folded_section(
"Table of Contents",
f"```\n{metadata.catalogs}\n```",
options
))
# 5. 按章节组织并生成内容
sections = self._organize_sections(fragments)
for section, section_fragments in sections.items():
# 拼接该章节的所有内容
section_content = "\n\n".join(
fragment.content for fragment in section_fragments
)
result.append(self._create_folded_section(
section,
section_content,
options
))
# 6. 生成参考文献(折叠)
# 收集所有非空的参考文献
all_refs = "\n".join(filter(None,
(fragment.bibliography for fragment in fragments)
))
if all_refs:
result.append(self._create_folded_section(
"Bibliography",
f"```bibtex\n{all_refs}\n```",
options
))
return "\n\n".join(result)
except Exception as e:
logger.error(f"Error formatting paper content: {str(e)}")
raise FormattingError(f"Failed to format paper content: {str(e)}")
def _create_folded_section(self,
title: str,
content: str,
options: FoldingOptions) -> str:
"""创建折叠区块
Args:
title: 区块标题
content: 区块内容
options: 折叠选项
Returns:
str: 格式化后的折叠区块
"""
css_class = f' class="{options.custom_css}"' if options.custom_css else ''
result = (
f'<details{css_class}><summary>{title}</summary>\n\n'
f'{content}\n\n'
f'</details>'
)
return self._add_indent(result, options.indent_level)
def _organize_sections(self,
fragments: list[SectionFragment]
) -> Dict[str, list[SectionFragment]]:
"""将片段按章节分组
Args:
fragments: 论文片段列表
Returns:
Dict[str, list[SectionFragment]]: 按章节分组的片段字典
"""
sections: Dict[str, list[SectionFragment]] = {}
for fragment in fragments:
section = fragment.current_section or "Uncategorized"
if section not in sections:
sections[section] = []
sections[section].append(fragment)
return sections

View File

@@ -1,354 +0,0 @@
from pathlib import Path
from typing import List, Dict
from dataclasses import dataclass
from datetime import datetime
import os
import re
@dataclass
class SectionFragment:
"""Arxiv论文片段数据类"""
title: str
authors: str
abstract: str
catalogs: str
arxiv_id: str = ""
current_section: str = "Introduction"
content: str = ''
bibliography: str = ''
class PaperHtmlFormatter:
"""HTML格式论文文档生成器"""
def __init__(self, fragments: List[SectionFragment], output_dir: Path):
self.fragments = fragments
self.output_dir = output_dir
self.css_styles = """
:root {
--primary-color: #1a73e8;
--secondary-color: #34495e;
--background-color: #f8f9fa;
--text-color: #2c3e50;
--border-color: #e0e0e0;
--code-bg-color: #f6f8fa;
}
body {
font-family: "Source Serif Pro", "Times New Roman", serif;
line-height: 1.8;
max-width: 1000px;
margin: 0 auto;
padding: 2rem;
color: var(--text-color);
background-color: var(--background-color);
font-size: 16px;
}
.container {
background: white;
padding: 2rem;
border-radius: 8px;
box-shadow: 0 2px 12px rgba(0,0,0,0.1);
}
h1 {
color: var(--primary-color);
font-size: 2.2em;
text-align: center;
margin: 1.5rem 0;
padding-bottom: 1rem;
border-bottom: 3px solid var(--primary-color);
}
h2 {
color: var(--secondary-color);
font-size: 1.8em;
margin-top: 2rem;
padding-left: 1rem;
border-left: 4px solid var(--primary-color);
}
h3 {
color: var(--text-color);
font-size: 1.5em;
margin-top: 1.5rem;
border-bottom: 2px solid var(--border-color);
padding-bottom: 0.5rem;
}
.authors {
text-align: center;
color: var(--secondary-color);
font-size: 1.1em;
margin: 1rem 0 2rem;
}
.abstract-container {
background: var(--background-color);
padding: 1.5rem;
border-radius: 6px;
margin: 2rem 0;
}
.abstract-title {
font-weight: bold;
color: var(--primary-color);
margin-bottom: 1rem;
}
.abstract-content {
font-style: italic;
line-height: 1.7;
}
.toc {
background: white;
padding: 1.5rem;
border-radius: 6px;
margin: 2rem 0;
box-shadow: 0 2px 8px rgba(0,0,0,0.05);
}
.toc-title {
color: var(--primary-color);
font-size: 1.4em;
margin-bottom: 1rem;
}
.section-content {
background: white;
padding: 1.5rem;
border-radius: 6px;
margin: 1.5rem 0;
box-shadow: 0 1px 3px rgba(0,0,0,0.05);
}
.fragment {
margin: 2rem 0;
padding-left: 1rem;
border-left: 3px solid var(--border-color);
}
.fragment:hover {
border-left-color: var(--primary-color);
}
.bibliography {
background: var(--code-bg-color);
padding: 1rem;
border-radius: 4px;
font-family: "Source Code Pro", monospace;
font-size: 0.9em;
white-space: pre-wrap;
margin-top: 1rem;
}
pre {
background: var(--code-bg-color);
padding: 1rem;
border-radius: 4px;
overflow-x: auto;
font-family: "Source Code Pro", monospace;
}
.paper-info {
background: white;
padding: 2rem;
border-radius: 8px;
margin: 2rem 0;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.arxiv-id {
text-align: center;
color: #666;
font-size: 0.9em;
margin: 1rem 0;
}
.section-title {
display: flex;
align-items: center;
gap: 0.5rem;
color: var(--secondary-color);
}
.section-icon {
color: var(--primary-color);
}
@media print {
body {
background: white;
}
.container {
box-shadow: none;
}
}
"""
def _sanitize_html(self, text: str) -> str:
"""清理HTML特殊字符"""
if not text:
return ""
replacements = {
"&": "&amp;",
"<": "&lt;",
">": "&gt;",
'"': "&quot;",
"'": "&#39;"
}
for old, new in replacements.items():
text = text.replace(old, new)
return text
def _create_section_id(self, section: str) -> str:
"""创建section的ID"""
section = section.strip() or "uncategorized"
# 移除特殊字符,转换为小写并用连字符替换空格
section_id = re.sub(r'[^\w\s-]', '', section.lower())
return section_id.replace(' ', '-')
def format_paper_info(self) -> str:
"""格式化论文基本信息"""
if not self.fragments:
return ""
first_fragment = self.fragments[0]
paper_info = ['<div class="paper-info">']
# 添加标题
if first_fragment.title:
paper_info.append(f'<h1>{self._sanitize_html(first_fragment.title)}</h1>')
# 添加arXiv ID
if first_fragment.arxiv_id:
paper_info.append(f'<div class="arxiv-id">arXiv: {self._sanitize_html(first_fragment.arxiv_id)}</div>')
# 添加作者
if first_fragment.authors:
paper_info.append(f'<div class="authors">{self._sanitize_html(first_fragment.authors)}</div>')
# 添加摘要
if first_fragment.abstract:
paper_info.append('<div class="abstract-container">')
paper_info.append('<div class="abstract-title">Abstract</div>')
paper_info.append(f'<div class="abstract-content">{self._sanitize_html(first_fragment.abstract)}</div>')
paper_info.append('</div>')
# 添加目录结构
if first_fragment.catalogs:
paper_info.append('<h2>Document Structure</h2>')
paper_info.append('<pre>')
paper_info.append(self._sanitize_html(first_fragment.catalogs))
paper_info.append('</pre>')
paper_info.append('</div>')
return '\n'.join(paper_info)
def format_table_of_contents(self, sections: Dict[str, List[SectionFragment]]) -> str:
"""生成目录"""
toc = ['<div class="toc">']
toc.append('<div class="toc-title">Table of Contents</div>')
toc.append('<nav>')
for section in sections:
section_id = self._create_section_id(section)
clean_section = section.strip() or "Uncategorized"
toc.append(f'<div><a href="#{section_id}">{self._sanitize_html(clean_section)} '
f'</a></div>')
toc.append('</nav>')
toc.append('</div>')
return '\n'.join(toc)
def format_sections(self) -> str:
"""格式化论文各部分内容"""
sections = {}
for fragment in self.fragments:
section = fragment.current_section or "Uncategorized"
if section not in sections:
sections[section] = []
sections[section].append(fragment)
formatted_html = ['<div class="content">']
formatted_html.append(self.format_table_of_contents(sections))
# 生成各部分内容
for section, fragments in sections.items():
section_id = self._create_section_id(section)
formatted_html.append(f'<h2 id="{section_id}">')
formatted_html.append(f'<span class="section-title">')
formatted_html.append(f'<span class="section-icon">§</span>')
formatted_html.append(f'{self._sanitize_html(section)}')
formatted_html.append('</span>')
formatted_html.append('</h2>')
formatted_html.append('<div class="section-content">')
for i, fragment in enumerate(fragments, 1):
formatted_html.append('<div class="fragment">')
# 添加内容
if fragment.content:
formatted_html.append(
f'<div class="fragment-content">{self._sanitize_html(fragment.content)}</div>'
)
# 添加参考文献
if fragment.bibliography:
formatted_html.append('<div class="bibliography">')
formatted_html.append(f'{self._sanitize_html(fragment.bibliography)}')
formatted_html.append('</div>')
formatted_html.append('</div>')
formatted_html.append('</div>')
formatted_html.append('</div>')
return '\n'.join(formatted_html)
def save_html(self) -> Path:
"""保存HTML文档"""
try:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"paper_content_{timestamp}.html"
file_path = self.output_dir / filename
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{self._sanitize_html(self.fragments[0].title if self.fragments else 'Paper Content')}</title>
<style>
{self.css_styles}
</style>
</head>
<body>
<div class="container">
{self.format_paper_info()}
{self.format_sections()}
</div>
</body>
</html>
"""
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"HTML document saved to: {file_path}")
return file_path
except Exception as e:
print(f"Error saving HTML document: {str(e)}")
raise
# 使用示例:
# formatter = PaperHtmlFormatter(fragments, output_dir)
# output_path = formatter.save_html()

View File

@@ -92,7 +92,7 @@ class MiniGame_ResumeStory(GptAcademicGameBaseState):
def generate_story_image(self, story_paragraph):
try:
from crazy_functions.Image_Generate import gen_image
from crazy_functions.图片生成 import gen_image
prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落进行简短的环境描写。要求80字以内。')
image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural')
return f'<br/><div align="center"><img src="file={image_path}"></div>'

View File

@@ -24,8 +24,8 @@ class Actor(BaseModel):
film_names: List[str] = Field(description="list of names of films they starred in")
"""
import json, re
from loguru import logger as logging
import json, re, logging
PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below.
@@ -62,8 +62,8 @@ class GptJsonIO():
if "type" in reduced_schema:
del reduced_schema["type"]
# Ensure json in context is well-formed with double quotes.
schema_str = json.dumps(reduced_schema)
if self.example_instruction:
schema_str = json.dumps(reduced_schema)
return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)
else:
return PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE.format(schema=schema_str)

View File

@@ -1,26 +0,0 @@
from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
def structure_output(txt, prompt, err_msg, run_gpt_fn, pydantic_cls):
gpt_json_io = GptJsonIO(pydantic_cls)
analyze_res = run_gpt_fn(
txt,
sys_prompt=prompt + gpt_json_io.format_instructions
)
try:
friend = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn)
except JsonStringError as e:
return None, err_msg
err_msg = ""
return friend, err_msg
def select_tool(prompt, run_gpt_fn, pydantic_cls):
pydantic_cls_instance, err_msg = structure_output(
txt=prompt,
prompt="根据提示, 分析应该调用哪个工具函数\n\n",
err_msg=f"不能理解该联系人",
run_gpt_fn=run_gpt_fn,
pydantic_cls=pydantic_cls
)
return pydantic_cls_instance, err_msg

View File

@@ -1,17 +1,14 @@
import os
import re
import shutil
import numpy as np
from loguru import logger
from toolbox import update_ui, update_ui_lastest_msg, get_log_folder, gen_time_str
from toolbox import get_conf, promote_file_to_downloadzone
from crazy_functions.latex_fns.latex_toolbox import PRESERVE, TRANSFORM
from crazy_functions.latex_fns.latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace
from crazy_functions.latex_fns.latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process
from crazy_functions.latex_fns.latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout
from crazy_functions.latex_fns.latex_toolbox import find_title_and_abs
from crazy_functions.latex_fns.latex_pickle_io import objdump, objload
from toolbox import update_ui, update_ui_lastest_msg, get_log_folder
from toolbox import get_conf, objdump, objload, promote_file_to_downloadzone
from .latex_toolbox import PRESERVE, TRANSFORM
from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace
from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process
from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout
from .latex_toolbox import find_title_and_abs
import os, shutil
import re
import numpy as np
pj = os.path.join
@@ -325,7 +322,7 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
buggy_lines = [int(l) for l in buggy_lines]
buggy_lines = sorted(buggy_lines)
buggy_line = buggy_lines[0]-1
logger.warning("reversing tex line that has errors", buggy_line)
print("reversing tex line that has errors", buggy_line)
# 重组,逆转出错的段落
if buggy_line not in fixed_line:
@@ -339,7 +336,7 @@ def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work
return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
except:
logger.error("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
return False, -1, [-1]
@@ -382,7 +379,7 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
if mode!='translate_zh':
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
logger.info( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
@@ -421,7 +418,7 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
shutil.copyfile(concat_pdf, pj(work_folder, '..', 'translation', 'comparison.pdf'))
promote_file_to_downloadzone(concat_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
except Exception as e:
logger.error(e)
print(e)
pass
return True # 成功啦
else:
@@ -467,71 +464,4 @@ def write_html(sp_file_contents, sp_file_result, chatbot, project_folder):
promote_file_to_downloadzone(file=res, chatbot=chatbot)
except:
from toolbox import trimmed_format_exc
logger.error('writing html result failed:', trimmed_format_exc())
def upload_to_gptac_cloud_if_user_allow(chatbot, arxiv_id):
try:
# 如果用户允许我们将arxiv论文PDF上传到GPTAC学术云
from toolbox import map_file_to_sha256
# 检查是否顺利,如果没有生成预期的文件,则跳过
is_result_good = False
for file_path in chatbot._cookies.get("files_to_promote", []):
if file_path.endswith('translate_zh.pdf'):
is_result_good = True
if not is_result_good:
return
# 上传文件
for file_path in chatbot._cookies.get("files_to_promote", []):
align_name = None
# normalized name
for name in ['translate_zh.pdf', 'comparison.pdf']:
if file_path.endswith(name): align_name = name
# if match any align name
if align_name:
logger.info(f'Uploading to GPTAC cloud as the user has set `allow_cloud_io`: {file_path}')
with open(file_path, 'rb') as f:
import requests
url = 'https://cloud-2.agent-matrix.com/arxiv_tf_paper_normal_upload'
files = {'file': (align_name, f, 'application/octet-stream')}
data = {
'arxiv_id': arxiv_id,
'file_hash': map_file_to_sha256(file_path),
'language': 'zh',
'trans_prompt': 'to_be_implemented',
'llm_model': 'to_be_implemented',
'llm_model_param': 'to_be_implemented',
}
resp = requests.post(url=url, files=files, data=data, timeout=30)
logger.info(f'Uploading terminate ({resp.status_code})`: {file_path}')
except:
# 如果上传失败,不会中断程序,因为这是次要功能
pass
def check_gptac_cloud(arxiv_id, chatbot):
import requests
success = False
downloaded = []
try:
for pdf_target in ['translate_zh.pdf', 'comparison.pdf']:
url = 'https://cloud-2.agent-matrix.com/arxiv_tf_paper_normal_exist'
data = {
'arxiv_id': arxiv_id,
'name': pdf_target,
}
resp = requests.post(url=url, data=data)
cache_hit_result = resp.text.strip('"')
if cache_hit_result.startswith("http"):
url = cache_hit_result
logger.info(f'Downloading from GPTAC cloud: {url}')
resp = requests.get(url=url, timeout=30)
target = os.path.join(get_log_folder(plugin_name='gptac_cloud'), gen_time_str(), pdf_target)
os.makedirs(os.path.dirname(target), exist_ok=True)
with open(target, 'wb') as f:
f.write(resp.content)
new_path = promote_file_to_downloadzone(target, chatbot=chatbot)
success = True
downloaded.append(new_path)
except:
pass
return success, downloaded
print('writing html result failed:', trimmed_format_exc())

View File

@@ -1,48 +0,0 @@
import pickle
class SafeUnpickler(pickle.Unpickler):
def get_safe_classes(self):
from crazy_functions.latex_fns.latex_actions import LatexPaperFileGroup, LatexPaperSplit
from crazy_functions.latex_fns.latex_toolbox import LinkedListNode
from numpy.core.multiarray import scalar
from numpy import dtype
# 定义允许的安全类
safe_classes = {
# 在这里添加其他安全的类
'LatexPaperFileGroup': LatexPaperFileGroup,
'LatexPaperSplit': LatexPaperSplit,
'LinkedListNode': LinkedListNode,
'scalar': scalar,
'dtype': dtype,
}
return safe_classes
def find_class(self, module, name):
# 只允许特定的类进行反序列化
self.safe_classes = self.get_safe_classes()
match_class_name = None
for class_name in self.safe_classes.keys():
if (class_name in f'{module}.{name}'):
match_class_name = class_name
if match_class_name is not None:
return self.safe_classes[match_class_name]
# 如果尝试加载未授权的类,则抛出异常
raise pickle.UnpicklingError(f"Attempted to deserialize unauthorized class '{name}' from module '{module}'")
def objdump(obj, file="objdump.tmp"):
with open(file, "wb+") as f:
pickle.dump(obj, f)
return
def objload(file="objdump.tmp"):
import os
if not os.path.exists(file):
return
with open(file, "rb") as f:
unpickler = SafeUnpickler(f)
return unpickler.load()

View File

@@ -1,8 +1,6 @@
import os
import os, shutil
import re
import shutil
import numpy as np
from loguru import logger
PRESERVE = 0
TRANSFORM = 1
@@ -57,7 +55,7 @@ def post_process(root):
str_stack.append("{")
elif c == "}":
if len(str_stack) == 1:
logger.warning("fixing brace error")
print("stack fix")
return i
str_stack.pop(-1)
else:
@@ -603,7 +601,7 @@ def compile_latex_with_timeout(command, cwd, timeout=60):
except subprocess.TimeoutExpired:
process.kill()
stdout, stderr = process.communicate()
logger.error("Process timed out (compile_latex_with_timeout)!")
print("Process timed out!")
return False
return True
@@ -644,216 +642,6 @@ def run_in_subprocess(func):
def _merge_pdfs(pdf1_path, pdf2_path, output_path):
try:
logger.info("Merging PDFs using _merge_pdfs_ng")
_merge_pdfs_ng(pdf1_path, pdf2_path, output_path)
except:
logger.info("Merging PDFs using _merge_pdfs_legacy")
_merge_pdfs_legacy(pdf1_path, pdf2_path, output_path)
def _merge_pdfs_ng(pdf1_path, pdf2_path, output_path):
import PyPDF2 # PyPDF2这个库有严重的内存泄露问题把它放到子进程中运行从而方便内存的释放
from PyPDF2.generic import NameObject, TextStringObject, ArrayObject, FloatObject, NumberObject
Percent = 1
# raise RuntimeError('PyPDF2 has a serious memory leak problem, please use other tools to merge PDF files.')
# Open the first PDF file
with open(pdf1_path, "rb") as pdf1_file:
pdf1_reader = PyPDF2.PdfFileReader(pdf1_file)
# Open the second PDF file
with open(pdf2_path, "rb") as pdf2_file:
pdf2_reader = PyPDF2.PdfFileReader(pdf2_file)
# Create a new PDF file to store the merged pages
output_writer = PyPDF2.PdfFileWriter()
# Determine the number of pages in each PDF file
num_pages = max(pdf1_reader.numPages, pdf2_reader.numPages)
# Merge the pages from the two PDF files
for page_num in range(num_pages):
# Add the page from the first PDF file
if page_num < pdf1_reader.numPages:
page1 = pdf1_reader.getPage(page_num)
else:
page1 = PyPDF2.PageObject.createBlankPage(pdf1_reader)
# Add the page from the second PDF file
if page_num < pdf2_reader.numPages:
page2 = pdf2_reader.getPage(page_num)
else:
page2 = PyPDF2.PageObject.createBlankPage(pdf1_reader)
# Create a new empty page with double width
new_page = PyPDF2.PageObject.createBlankPage(
width=int(
int(page1.mediaBox.getWidth())
+ int(page2.mediaBox.getWidth()) * Percent
),
height=max(page1.mediaBox.getHeight(), page2.mediaBox.getHeight()),
)
new_page.mergeTranslatedPage(page1, 0, 0)
new_page.mergeTranslatedPage(
page2,
int(
int(page1.mediaBox.getWidth())
- int(page2.mediaBox.getWidth()) * (1 - Percent)
),
0,
)
if "/Annots" in new_page:
annotations = new_page["/Annots"]
for i, annot in enumerate(annotations):
annot_obj = annot.get_object()
# 检查注释类型是否是链接(/Link
if annot_obj.get("/Subtype") == "/Link":
# 检查是否为内部链接跳转(/GoTo或外部URI链接/URI
action = annot_obj.get("/A")
if action:
if "/S" in action and action["/S"] == "/GoTo":
# 内部链接:跳转到文档中的某个页面
dest = action.get("/D") # 目标页或目标位置
# if dest and annot.idnum in page2_annot_id:
# if dest in pdf2_reader.named_destinations:
if dest and page2.annotations:
if annot in page2.annotations:
# 获取原始文件中跳转信息,包括跳转页面
destination = pdf2_reader.named_destinations[
dest
]
page_number = (
pdf2_reader.get_destination_page_number(
destination
)
)
# 更新跳转信息,跳转到对应的页面和,指定坐标 (100, 150),缩放比例为 100%
# “/D”:[10,'/XYZ',100,100,0]
if destination.dest_array[1] == "/XYZ":
annot_obj["/A"].update(
{
NameObject("/D"): ArrayObject(
[
NumberObject(page_number),
destination.dest_array[1],
FloatObject(
destination.dest_array[
2
]
+ int(
page1.mediaBox.getWidth()
)
),
destination.dest_array[3],
destination.dest_array[4],
]
) # 确保键和值是 PdfObject
}
)
else:
annot_obj["/A"].update(
{
NameObject("/D"): ArrayObject(
[
NumberObject(page_number),
destination.dest_array[1],
]
) # 确保键和值是 PdfObject
}
)
rect = annot_obj.get("/Rect")
# 更新点击坐标
rect = ArrayObject(
[
FloatObject(
rect[0]
+ int(page1.mediaBox.getWidth())
),
rect[1],
FloatObject(
rect[2]
+ int(page1.mediaBox.getWidth())
),
rect[3],
]
)
annot_obj.update(
{
NameObject(
"/Rect"
): rect # 确保键和值是 PdfObject
}
)
# if dest and annot.idnum in page1_annot_id:
# if dest in pdf1_reader.named_destinations:
if dest and page1.annotations:
if annot in page1.annotations:
# 获取原始文件中跳转信息,包括跳转页面
destination = pdf1_reader.named_destinations[
dest
]
page_number = (
pdf1_reader.get_destination_page_number(
destination
)
)
# 更新跳转信息,跳转到对应的页面和,指定坐标 (100, 150),缩放比例为 100%
# “/D”:[10,'/XYZ',100,100,0]
if destination.dest_array[1] == "/XYZ":
annot_obj["/A"].update(
{
NameObject("/D"): ArrayObject(
[
NumberObject(page_number),
destination.dest_array[1],
FloatObject(
destination.dest_array[
2
]
),
destination.dest_array[3],
destination.dest_array[4],
]
) # 确保键和值是 PdfObject
}
)
else:
annot_obj["/A"].update(
{
NameObject("/D"): ArrayObject(
[
NumberObject(page_number),
destination.dest_array[1],
]
) # 确保键和值是 PdfObject
}
)
rect = annot_obj.get("/Rect")
rect = ArrayObject(
[
FloatObject(rect[0]),
rect[1],
FloatObject(rect[2]),
rect[3],
]
)
annot_obj.update(
{
NameObject(
"/Rect"
): rect # 确保键和值是 PdfObject
}
)
elif "/S" in action and action["/S"] == "/URI":
# 外部链接跳转到某个URI
uri = action.get("/URI")
output_writer.addPage(new_page)
# Save the merged PDF file
with open(output_path, "wb") as output_file:
output_writer.write(output_file)
def _merge_pdfs_legacy(pdf1_path, pdf2_path, output_path):
import PyPDF2 # PyPDF2这个库有严重的内存泄露问题把它放到子进程中运行从而方便内存的释放
Percent = 0.95

View File

@@ -1,6 +1,5 @@
import time, json, sys, struct
import time, logging, json, sys, struct
import numpy as np
from loguru import logger as logging
from scipy.io.wavfile import WAVE_FORMAT
def write_numpy_to_wave(filename, rate, data, add_header=False):
@@ -107,14 +106,18 @@ def is_speaker_speaking(vad, data, sample_rate):
class AliyunASR():
def test_on_sentence_begin(self, message, *args):
# print("test_on_sentence_begin:{}".format(message))
pass
def test_on_sentence_end(self, message, *args):
# print("test_on_sentence_end:{}".format(message))
message = json.loads(message)
self.parsed_sentence = message['payload']['result']
self.event_on_entence_end.set()
# print(self.parsed_sentence)
def test_on_start(self, message, *args):
# print("test_on_start:{}".format(message))
pass
def test_on_error(self, message, *args):
@@ -126,11 +129,13 @@ class AliyunASR():
pass
def test_on_result_chg(self, message, *args):
# print("test_on_chg:{}".format(message))
message = json.loads(message)
self.parsed_text = message['payload']['result']
self.event_on_result_chg.set()
def test_on_completed(self, message, *args):
# print("on_completed:args=>{} message=>{}".format(args, message))
pass
def audio_convertion_thread(self, uuid):
@@ -243,14 +248,14 @@ class AliyunASR():
try:
response = client.do_action_with_exception(request)
logging.info(response)
print(response)
jss = json.loads(response)
if 'Token' in jss and 'Id' in jss['Token']:
token = jss['Token']['Id']
expireTime = jss['Token']['ExpireTime']
logging.info("token = " + token)
logging.info("expireTime = " + str(expireTime))
print("token = " + token)
print("expireTime = " + str(expireTime))
except Exception as e:
logging.error(e)
print(e)
return token

View File

@@ -1,5 +1,4 @@
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
from loguru import logger
def force_breakdown(txt, limit, get_token_fn):
""" 当无法用标点、空行分割时,我们用最暴力的方法切割
@@ -77,7 +76,7 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
remain_txt_to_cut = post
remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
process = fin_len/total_len
logger.info(f'正在文本切分 {int(process*100)}%')
print(f'正在文本切分 {int(process*100)}%')
if len(remain_txt_to_cut.strip()) == 0:
break
return res
@@ -120,7 +119,7 @@ if __name__ == '__main__':
for i in range(5):
file_content += file_content
logger.info(len(file_content))
print(len(file_content))
TOKEN_LIMIT_PER_FRAGMENT = 2500
res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)

View File

@@ -4,7 +4,7 @@ from toolbox import promote_file_to_downloadzone
from toolbox import write_history_to_file, promote_file_to_downloadzone
from toolbox import get_conf
from toolbox import ProxyNetworkActivate
from shared_utils.colorful import *
from colorful import *
import requests
import random
import copy
@@ -72,7 +72,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat
generated_conclusion_files.append(res_path)
return res_path
def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG, plugin_kwargs={}):
def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
from crazy_functions.pdf_fns.report_gen_html import construct_html
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
@@ -138,7 +138,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi
chatbot=chatbot,
history_array=[meta for _ in inputs_array],
sys_prompt_array=[
"请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" + plugin_kwargs.get("additional_prompt", "") for _ in inputs_array],
"请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array],
)
# -=-=-=-=-=-=-=-= 写出Markdown文件 -=-=-=-=-=-=-=-=
produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files)

View File

@@ -1,26 +0,0 @@
import os
from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages
from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive
from crazy_functions.pdf_fns.parse_pdf import parse_pdf, translate_pdf
def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url):
import copy, json
TOKEN_LIMIT_PER_FRAGMENT = 1024
generated_conclusion_files = []
generated_html_files = []
DST_LANG = "中文"
from crazy_functions.pdf_fns.report_gen_html import construct_html
for index, fp in enumerate(file_manifest):
chatbot.append(["当前进度:", f"正在连接GROBID服务请稍候: {grobid_url}\n如果等待时间过长请修改config中的GROBID_URL可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
article_dict = parse_pdf(fp, grobid_url)
grobid_json_res = os.path.join(get_log_folder(), gen_time_str() + "grobid.json")
with open(grobid_json_res, 'w+', encoding='utf8') as f:
f.write(json.dumps(article_dict, indent=4, ensure_ascii=False))
promote_file_to_downloadzone(grobid_json_res, chatbot=chatbot)
if article_dict is None: raise RuntimeError("解析PDF失败请检查PDF是否损坏。")
yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG, plugin_kwargs=plugin_kwargs)
chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files)))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,250 +0,0 @@
from toolbox import get_log_folder, gen_time_str, get_conf
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import promote_file_to_downloadzone, extract_archive
from toolbox import generate_file_link, zip_folder
from crazy_functions.crazy_utils import get_files_from_everything
from shared_utils.colorful import *
from loguru import logger
import os
import time
def refresh_key(doc2x_api_key):
import requests, json
url = "https://api.doc2x.noedgeai.com/api/token/refresh"
res = requests.post(
url,
headers={"Authorization": "Bearer " + doc2x_api_key}
)
res_json = []
if res.status_code == 200:
decoded = res.content.decode("utf-8")
res_json = json.loads(decoded)
doc2x_api_key = res_json['data']['token']
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
return doc2x_api_key
def 解析PDF_DOC2X_转Latex(pdf_file_path):
zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format='tex')
return unzipped_folder
def 解析PDF_DOC2X(pdf_file_path, format='tex'):
"""
format: 'tex', 'md', 'docx'
"""
import requests, json, os
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
doc2x_api_key = DOC2X_API_KEY
# < ------ 第1步上传 ------ >
logger.info("Doc2x 第1步上传")
with open(pdf_file_path, 'rb') as file:
res = requests.post(
"https://v2.doc2x.noedgeai.com/api/v2/parse/pdf",
headers={"Authorization": "Bearer " + doc2x_api_key},
data=file
)
# res_json = []
if res.status_code == 200:
res_json = res.json()
else:
raise RuntimeError(f"Doc2x return an error: {res.json()}")
uuid = res_json['data']['uid']
# < ------ 第2步轮询等待 ------ >
logger.info("Doc2x 第2步轮询等待")
params = {'uid': uuid}
while True:
res = requests.get(
'https://v2.doc2x.noedgeai.com/api/v2/parse/status',
headers={"Authorization": "Bearer " + doc2x_api_key},
params=params
)
res_json = res.json()
if res_json['data']['status'] == "success":
break
elif res_json['data']['status'] == "processing":
time.sleep(3)
logger.info(f"Doc2x is processing at {res_json['data']['progress']}%")
elif res_json['data']['status'] == "failed":
raise RuntimeError(f"Doc2x return an error: {res_json}")
# < ------ 第3步提交转化 ------ >
logger.info("Doc2x 第3步提交转化")
data = {
"uid": uuid,
"to": format,
"formula_mode": "dollar",
"filename": "output"
}
res = requests.post(
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse',
headers={"Authorization": "Bearer " + doc2x_api_key},
json=data
)
if res.status_code == 200:
res_json = res.json()
else:
raise RuntimeError(f"Doc2x return an error: {res.json()}")
# < ------ 第4步等待结果 ------ >
logger.info("Doc2x 第4步等待结果")
params = {'uid': uuid}
while True:
res = requests.get(
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result',
headers={"Authorization": "Bearer " + doc2x_api_key},
params=params
)
res_json = res.json()
if res_json['data']['status'] == "success":
break
elif res_json['data']['status'] == "processing":
time.sleep(3)
logger.info(f"Doc2x still processing")
elif res_json['data']['status'] == "failed":
raise RuntimeError(f"Doc2x return an error: {res_json}")
# < ------ 第5步最后的处理 ------ >
logger.info("Doc2x 第5步最后的处理")
if format=='tex':
target_path = latex_dir
if format=='md':
target_path = markdown_dir
os.makedirs(target_path, exist_ok=True)
max_attempt = 3
# < ------ 下载 ------ >
for attempt in range(max_attempt):
try:
result_url = res_json['data']['url']
res = requests.get(result_url)
zip_path = os.path.join(target_path, gen_time_str() + '.zip')
unzip_path = os.path.join(target_path, gen_time_str())
if res.status_code == 200:
with open(zip_path, "wb") as f: f.write(res.content)
else:
raise RuntimeError(f"Doc2x return an error: {res.json()}")
except Exception as e:
if attempt < max_attempt - 1:
logger.error(f"Failed to download latex file, retrying... {e}")
time.sleep(3)
continue
else:
raise e
# < ------ 解压 ------ >
import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(unzip_path)
return zip_path, unzip_path
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
def pdf2markdown(filepath):
chatbot.append((None, f"Doc2x 解析中"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md')
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return md_zip_path
def deliver_to_markdown_plugin(md_zip_path, user_request):
from crazy_functions.Markdown_Translate import Markdown英译中
import shutil, re
time_tag = gen_time_str()
target_path_base = get_log_folder(chatbot.get_user())
file_origin_name = os.path.basename(md_zip_path)
this_file_path = os.path.join(target_path_base, file_origin_name)
os.makedirs(target_path_base, exist_ok=True)
shutil.copyfile(md_zip_path, this_file_path)
ex_folder = this_file_path + ".extract"
extract_archive(
file_path=this_file_path, dest_dir=ex_folder
)
# edit markdown files
success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md')
for generated_fp in file_manifest:
# 修正一些公式问题
with open(generated_fp, 'r', encoding='utf8') as f:
content = f.read()
# 将公式中的\[ \]替换成$$
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
# 将公式中的\( \)替换成$
content = content.replace(r'\(', r'$').replace(r'\)', r'$')
content = content.replace('```markdown', '\n').replace('```', '\n')
with open(generated_fp, 'w', encoding='utf8') as f:
f.write(content)
promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 生成在线预览html
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
preview_fp = os.path.join(ex_folder, file_name)
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
with open(generated_fp, "r", encoding="utf-8") as f:
md = f.read()
# # Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
html = markdown_convertion_for_file(md)
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
plugin_kwargs['markdown_expected_output_dir'] = ex_folder
translated_f_name = 'translated_markdown.md'
generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
if os.path.exists(generated_fp):
# 修正一些公式问题
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
content = content.replace('```markdown', '\n').replace('```', '\n')
# Markdown中使用不标准的表格需要在表格前加上一个emoji以便公式渲染
# content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
# 生成在线预览html
file_name = '在线预览翻译' + gen_time_str() + '.html'
preview_fp = os.path.join(ex_folder, file_name)
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
with open(generated_fp, "r", encoding="utf-8") as f:
md = f.read()
html = markdown_convertion_for_file(md)
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
# 生成包含图片的压缩包
dest_folder = get_log_folder(chatbot.get_user())
zip_name = '翻译后的带图文档.zip'
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
zip_fp = os.path.join(dest_folder, zip_name)
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
md_zip_path = yield from pdf2markdown(fp)
yield from deliver_to_markdown_plugin(md_zip_path, user_request)
def 解析PDF_基于DOC2X(file_manifest, *args):
for index, fp in enumerate(file_manifest):
yield from 解析PDF_DOC2X_单文件(fp, *args)
return

View File

@@ -1,73 +0,0 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>GPT-Academic 翻译报告书</title>
<style>
.centered-a {
color: red;
text-align: center;
margin-bottom: 2%;
font-size: 1.5em;
}
.centered-b {
color: red;
text-align: center;
margin-top: 10%;
margin-bottom: 20%;
font-size: 1.5em;
}
.centered-c {
color: rgba(255, 0, 0, 0);
text-align: center;
margin-top: 2%;
margin-bottom: 20%;
font-size: 7em;
}
</style>
<script>
// Configure MathJax settings
MathJax = {
tex: {
inlineMath: [
['$', '$'],
['\(', '\)']
]
}
}
addEventListener('zero-md-rendered', () => {MathJax.typeset(); console.log('MathJax typeset!');})
</script>
<!-- Load MathJax library -->
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
<script
type="module"
src="https://cdn.jsdelivr.net/gh/zerodevx/zero-md@2/dist/zero-md.min.js"
></script>
</head>
<body>
<div class="test_temp1" style="width:10%; height: 500px; float:left;">
</div>
<div class="test_temp2" style="width:80%; height: 500px; float:left;">
<!-- Simply set the `src` attribute to your MD file and win -->
<div class="centered-a">
请按Ctrl+S保存此页面否则该页面可能在几分钟后失效。
</div>
<zero-md src="translated_markdown.md" no-shadow>
</zero-md>
<div class="centered-b">
本报告由GPT-Academic开源项目生成地址https://github.com/binary-husky/gpt_academic。
</div>
<div class="centered-c">
本报告由GPT-Academic开源项目生成地址https://github.com/binary-husky/gpt_academic。
</div>
</div>
<div class="test_temp3" style="width:10%; height: 500px; float:left;">
</div>
</body>
</html>

View File

@@ -1,52 +0,0 @@
import os, json, base64
from pydantic import BaseModel, Field
from textwrap import dedent
from typing import List
class ArgProperty(BaseModel): # PLUGIN_ARG_MENU
title: str = Field(description="The title", default="")
description: str = Field(description="The description", default="")
default_value: str = Field(description="The default value", default="")
type: str = Field(description="The type", default="") # currently we support ['string', 'dropdown']
options: List[str] = Field(default=[], description="List of options available for the argument") # only used when type is 'dropdown'
class GptAcademicPluginTemplate():
def __init__(self):
# please note that `execute` method may run in different threads,
# thus you should not store any state in the plugin instance,
# which may be accessed by multiple threads
pass
def define_arg_selection_menu(self):
"""
An example as below:
```
def define_arg_selection_menu(self):
gui_definition = {
"main_input":
ArgProperty(title="main input", description="description", default_value="default_value", type="string").model_dump_json(),
"advanced_arg":
ArgProperty(title="advanced arguments", description="description", default_value="default_value", type="string").model_dump_json(),
"additional_arg_01":
ArgProperty(title="additional", description="description", default_value="default_value", type="string").model_dump_json(),
}
return gui_definition
```
"""
raise NotImplementedError("You need to implement this method in your plugin class")
def get_js_code_for_generating_menu(self, btnName):
define_arg_selection = self.define_arg_selection_menu()
if len(define_arg_selection.keys()) > 8:
raise ValueError("You can only have up to 8 arguments in the define_arg_selection")
# if "main_input" not in define_arg_selection:
# raise ValueError("You must have a 'main_input' in the define_arg_selection")
DEFINE_ARG_INPUT_INTERFACE = json.dumps(define_arg_selection)
return base64.b64encode(DEFINE_ARG_INPUT_INTERFACE.encode('utf-8')).decode('utf-8')
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
raise NotImplementedError("You need to implement this method in your plugin class")

View File

@@ -1,87 +0,0 @@
SearchOptimizerPrompt="""作为一个网页搜索助手,你的任务是结合历史记录,从不同角度,为“原问题”生成个不同版本的“检索词”,从而提高网页检索的精度。生成的问题要求指向对象清晰明确,并与“原问题语言相同”。例如:
历史记录:
"
Q: 对话背景。
A: 当前对话是关于 Nginx 的介绍和在Ubuntu上的使用等。
"
原问题: 怎么下载
检索词: ["Nginx 下载","Ubuntu Nginx","Ubuntu安装Nginx"]
----------------
历史记录:
"
Q: 对话背景。
A: 当前对话是关于 Nginx 的介绍和使用等。
Q: 报错 "no connection"
A: 报错"no connection"可能是因为……
"
原问题: 怎么解决
检索词: ["Nginx报错"no connection" 解决","Nginx'no connection'报错 原因","Nginx提示'no connection'"]
----------------
历史记录:
"
"
原问题: 你知道 Python 么?
检索词: ["Python","Python 使用教程。","Python 特点和优势"]
----------------
历史记录:
"
Q: 列出Java的三种特点
A: 1. Java 是一种编译型语言。
2. Java 是一种面向对象的编程语言。
3. Java 是一种跨平台的编程语言。
"
原问题: 介绍下第2点。
检索词: ["Java 面向对象特点","Java 面向对象编程优势。","Java 面向对象编程"]
----------------
现在有历史记录:
"
{history}
"
有其原问题: {query}
直接给出最多{num}个检索词必须以json形式给出不得有多余字符:
"""
SearchAcademicOptimizerPrompt="""作为一个学术论文搜索助手,你的任务是结合历史记录,从不同角度,为“原问题”生成个不同版本的“检索词”,从而提高学术论文检索的精度。生成的问题要求指向对象清晰明确,并与“原问题语言相同”。例如:
历史记录:
"
Q: 对话背景。
A: 当前对话是关于深度学习的介绍和在图像识别中的应用等。
"
原问题: 怎么下载相关论文
检索词: ["深度学习 图像识别 论文下载","图像识别 深度学习 研究论文","深度学习 图像识别 论文资源","Deep Learning Image Recognition Paper Download","Image Recognition Deep Learning Research Paper"]
----------------
历史记录:
"
Q: 对话背景。
A: 当前对话是关于深度学习的介绍和应用等。
Q: 报错 "模型不收敛"
A: 报错"模型不收敛"可能是因为……
"
原问题: 怎么解决
检索词: ["深度学习 模型不收敛 解决方案 论文","深度学习 模型不收敛 原因 研究","深度学习 模型不收敛 论文","Deep Learning Model Convergence Issue Solution Paper","Deep Learning Model Convergence Problem Research"]
----------------
历史记录:
"
"
原问题: 你知道 GAN 么?
检索词: ["生成对抗网络 论文","GAN 使用教程 论文","GAN 特点和优势 研究","Generative Adversarial Network Paper","GAN Usage Tutorial Paper"]
----------------
历史记录:
"
Q: 列出机器学习的三种应用?
A: 1. 机器学习在图像识别中的应用。
2. 机器学习在自然语言处理中的应用。
3. 机器学习在推荐系统中的应用。
"
原问题: 介绍下第2点。
检索词: ["机器学习 自然语言处理 应用 论文","机器学习 自然语言处理 研究","机器学习 NLP 应用 论文","Machine Learning Natural Language Processing Application Paper","Machine Learning NLP Research"]
----------------
现在有历史记录:
"
{history}
"
有其原问题: {query}
直接给出最多{num}个检索词必须以json形式给出不得有多余字符:
"""

View File

@@ -1,115 +0,0 @@
import logging
import tarfile
from pathlib import Path
from typing import Optional, Dict
import requests
class ArxivDownloader:
"""用于下载arXiv论文源码的下载器"""
def __init__(self, root_dir: str = "./papers", proxies: Optional[Dict[str, str]] = None):
"""
初始化下载器
Args:
root_dir: 保存下载文件的根目录
proxies: 代理服务器设置,例如 {"http": "http://proxy:port", "https": "https://proxy:port"}
"""
self.root_dir = Path(root_dir)
self.root_dir.mkdir(exist_ok=True)
self.proxies = proxies
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
def _download_and_extract(self, arxiv_id: str) -> str:
"""
下载并解压arxiv论文源码
Args:
arxiv_id: arXiv论文ID例如"2103.00020"
Returns:
str: 解压后的文件目录路径
Raises:
RuntimeError: 当下载失败时抛出
"""
paper_dir = self.root_dir / arxiv_id
tar_path = paper_dir / f"{arxiv_id}.tar.gz"
# 检查缓存
if paper_dir.exists() and any(paper_dir.iterdir()):
logging.info(f"Using cached version for {arxiv_id}")
return str(paper_dir)
paper_dir.mkdir(exist_ok=True)
urls = [
f"https://arxiv.org/src/{arxiv_id}",
f"https://arxiv.org/e-print/{arxiv_id}"
]
for url in urls:
try:
logging.info(f"Downloading from {url}")
response = requests.get(url, proxies=self.proxies)
if response.status_code == 200:
tar_path.write_bytes(response.content)
with tarfile.open(tar_path, 'r:gz') as tar:
tar.extractall(path=paper_dir)
return str(paper_dir)
except Exception as e:
logging.warning(f"Download failed for {url}: {e}")
continue
raise RuntimeError(f"Failed to download paper {arxiv_id}")
def download_paper(self, arxiv_id: str) -> str:
"""
下载指定的arXiv论文
Args:
arxiv_id: arXiv论文ID
Returns:
str: 论文文件所在的目录路径
"""
return self._download_and_extract(arxiv_id)
def main():
"""测试下载功能"""
# 配置代理(如果需要)
proxies = {
"http": "http://your-proxy:port",
"https": "https://your-proxy:port"
}
# 创建下载器实例如果不需要代理可以不传入proxies参数
downloader = ArxivDownloader(root_dir="./downloaded_papers", proxies=None)
# 测试下载一篇论文这里使用一个示例ID
try:
paper_id = "2103.00020" # 这是一个示例ID
paper_dir = downloader.download_paper(paper_id)
print(f"Successfully downloaded paper to: {paper_dir}")
# 检查下载的文件
paper_path = Path(paper_dir)
if paper_path.exists():
print("Downloaded files:")
for file in paper_path.rglob("*"):
if file.is_file():
print(f"- {file.relative_to(paper_path)}")
except Exception as e:
print(f"Error downloading paper: {e}")
if __name__ == "__main__":
main()

View File

@@ -1,836 +0,0 @@
import asyncio
import logging
import re
import tarfile
import time
from copy import deepcopy
from pathlib import Path
from typing import List, Optional, Dict, Set
import aiohttp
from crazy_functions.rag_fns.arxiv_fns.author_extractor import LatexAuthorExtractor
from crazy_functions.rag_fns.arxiv_fns.essay_structure import EssayStructureParser, DocumentStructure, read_tex_file
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section
from crazy_functions.rag_fns.arxiv_fns.section_fragment import SectionFragment
from crazy_functions.rag_fns.arxiv_fns.tex_utils import TexUtils
from crazy_functions.doc_fns.content_folder import PaperContentFormatter, PaperMetadata
def save_fragments_to_file(fragments: List[SectionFragment], output_dir: Path ) -> Path:
"""
Save all fragments to a single structured markdown file.
Args:
fragments: List of SectionFragment objects
output_dir: Output directory path
Returns:
Path: Path to the generated markdown file
"""
from datetime import datetime
from pathlib import Path
# Create output directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Generate filename
filename = f"paper_latex_content_{timestamp}.md"
file_path = output_path/ filename
# Group fragments by section
sections = {}
for fragment in fragments:
section = fragment.current_section or "Uncategorized"
if section not in sections:
sections[section] = []
sections[section].append(fragment)
with open(file_path, "w", encoding="utf-8") as f:
# Write document header
f.write("# Document Fragments Analysis\n\n")
f.write("## Overview\n")
f.write(f"- Total Fragments: {len(fragments)}\n")
f.write(f"- Generated Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
# Add paper information if available
if fragments and (fragments[0].title or fragments[0].abstract):
f.write("\n## Paper Information\n")
if fragments[0].title:
f.write(f"### Title\n{fragments[0].title}\n")
if fragments[0].authors:
f.write(f"\n### Authors\n{fragments[0].authors}\n")
if fragments[0].abstract:
f.write(f"\n### Abstract\n{fragments[0].abstract}\n")
# Write section tree if available
if fragments and fragments[0].catalogs:
f.write("\n## Section Tree\n")
f.write("```\n") # 添加代码块开始标记
f.write(fragments[0].catalogs)
f.write("\n```") # 添加代码块结束标记
# Generate table of contents
f.write("\n## Table of Contents\n")
for section in sections:
clean_section = section.strip() or "Uncategorized"
fragment_count = len(sections[section])
f.write(f"- [{clean_section}](#{clean_section.lower().replace(' ', '-')}) "
f"({fragment_count} fragments)\n")
# Write content sections
f.write("\n## Content\n")
for section, section_fragments in sections.items():
clean_section = section.strip() or "Uncategorized"
f.write(f"\n### {clean_section}\n")
# Write each fragment
for i, fragment in enumerate(section_fragments, 1):
f.write(f"\n#### Fragment {i}\n")
# Metadata
f.write("**Metadata:**\n")
metadata = [
f"- Section: {fragment.current_section}",
f"- Length: {len(fragment.content)} chars",
f"- ArXiv ID: {fragment.arxiv_id}" if fragment.arxiv_id else None
]
f.write("\n".join(filter(None, metadata)) + "\n")
# Content
f.write("\n**Content:**\n")
f.write("\n")
f.write(fragment.content)
f.write("\n")
# Bibliography if exists
if fragment.bibliography:
f.write("\n**Bibliography:**\n")
f.write("```bibtex\n")
f.write(fragment.bibliography)
f.write("\n```\n")
# Add separator
if i < len(section_fragments):
f.write("\n---\n")
# Add statistics
f.write("\n## Statistics\n")
# Length distribution
lengths = [len(f.content) for f in fragments]
f.write("\n### Length Distribution\n")
f.write(f"- Minimum: {min(lengths)} chars\n")
f.write(f"- Maximum: {max(lengths)} chars\n")
f.write(f"- Average: {sum(lengths) / len(lengths):.1f} chars\n")
# Section distribution
f.write("\n### Section Distribution\n")
for section, section_fragments in sections.items():
percentage = (len(section_fragments) / len(fragments)) * 100
f.write(f"- {section}: {len(section_fragments)} ({percentage:.1f}%)\n")
print(f"Fragments saved to: {file_path}")
return file_path
# 定义各种引用命令的模式
CITATION_PATTERNS = [
# 基本的 \cite{} 格式
r'\\cite(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
# natbib 格式
r'\\citep(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\citet(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\citeauthor(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\citeyear(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\citealt(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\citealp(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
# biblatex 格式
r'\\textcite(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\parencite(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
r'\\autocite(?:\*)?(?:\[[^\]]*\])?{([^}]+)}',
# 自定义 [cite:...] 格式
r'\[cite:([^\]]+)\]',
]
# 编译所有模式
COMPILED_PATTERNS = [re.compile(pattern) for pattern in CITATION_PATTERNS]
class ArxivSplitter:
"""Arxiv论文智能分割器"""
def __init__(self,
root_dir: str = "gpt_log/arxiv_cache",
proxies: Optional[Dict[str, str]] = None,
cache_ttl: int = 7 * 24 * 60 * 60):
"""
初始化分割器
Args:
char_range: 字符数范围(最小值, 最大值)
root_dir: 缓存根目录
proxies: 代理设置
cache_ttl: 缓存过期时间(秒)
"""
self.root_dir = Path(root_dir)
self.root_dir.mkdir(parents=True, exist_ok=True)
self.proxies = proxies or {}
self.cache_ttl = cache_ttl
# 动态计算最优线程数
import multiprocessing
cpu_count = multiprocessing.cpu_count()
# 根据CPU核心数动态设置但设置上限防止过度并发
self.document_structure = DocumentStructure()
self.document_parser = EssayStructureParser()
self.max_workers = min(32, cpu_count * 2)
# 初始化TeX处理器
self.tex_processor = TexUtils()
# 配置日志
self._setup_logging()
def _setup_logging(self):
"""配置日志"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
self.logger = logging.getLogger(__name__)
def _normalize_arxiv_id(self, input_str: str) -> str:
"""规范化ArXiv ID"""
if 'arxiv.org/' in input_str.lower():
# 处理URL格式
if '/pdf/' in input_str:
arxiv_id = input_str.split('/pdf/')[-1]
else:
arxiv_id = input_str.split('/abs/')[-1]
# 移除版本号和其他后缀
return arxiv_id.split('v')[0].strip()
return input_str.split('v')[0].strip()
def _check_cache(self, paper_dir: Path) -> bool:
"""
检查缓存是否有效,包括文件完整性检查
Args:
paper_dir: 论文目录路径
Returns:
bool: 如果缓存有效返回True否则返回False
"""
if not paper_dir.exists():
return False
# 检查目录中是否存在必要文件
has_tex_files = False
has_main_tex = False
for file_path in paper_dir.rglob("*"):
if file_path.suffix == '.tex':
has_tex_files = True
content = self.tex_processor.read_file(str(file_path))
if content and r'\documentclass' in content:
has_main_tex = True
break
if not (has_tex_files and has_main_tex):
return False
# 检查缓存时间
cache_time = paper_dir.stat().st_mtime
if (time.time() - cache_time) < self.cache_ttl:
self.logger.info(f"Using valid cache for {paper_dir.name}")
return True
return False
async def download_paper(self, arxiv_id: str, paper_dir: Path) -> bool:
"""
异步下载论文,包含重试机制和临时文件处理
Args:
arxiv_id: ArXiv论文ID
paper_dir: 目标目录路径
Returns:
bool: 下载成功返回True否则返回False
"""
from crazy_functions.rag_fns.arxiv_fns.arxiv_downloader import ArxivDownloader
temp_tar_path = paper_dir / f"{arxiv_id}_temp.tar.gz"
final_tar_path = paper_dir / f"{arxiv_id}.tar.gz"
# 确保目录存在
paper_dir.mkdir(parents=True, exist_ok=True)
# 尝试使用 ArxivDownloader 下载
try:
downloader = ArxivDownloader(root_dir=str(paper_dir), proxies=self.proxies)
downloaded_dir = downloader.download_paper(arxiv_id)
if downloaded_dir:
self.logger.info(f"Successfully downloaded using ArxivDownloader to {downloaded_dir}")
return True
except Exception as e:
self.logger.warning(f"ArxivDownloader failed: {str(e)}. Falling back to direct download.")
# 如果 ArxivDownloader 失败,使用原有的下载方式作为备选
urls = [
f"https://arxiv.org/src/{arxiv_id}",
f"https://arxiv.org/e-print/{arxiv_id}"
]
max_retries = 3
retry_delay = 1 # 初始重试延迟(秒)
for url in urls:
for attempt in range(max_retries):
try:
self.logger.info(f"Downloading from {url} (attempt {attempt + 1}/{max_retries})")
async with aiohttp.ClientSession() as session:
async with session.get(url, proxy=self.proxies.get('http')) as response:
if response.status == 200:
content = await response.read()
# 写入临时文件
temp_tar_path.write_bytes(content)
try:
# 验证tar文件完整性并解压
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, self._process_tar_file, temp_tar_path, paper_dir)
# 下载成功后移动临时文件到最终位置
temp_tar_path.rename(final_tar_path)
return True
except Exception as e:
self.logger.warning(f"Invalid tar file: {str(e)}")
if temp_tar_path.exists():
temp_tar_path.unlink()
except Exception as e:
self.logger.warning(f"Download attempt {attempt + 1} failed from {url}: {str(e)}")
await asyncio.sleep(retry_delay * (attempt + 1)) # 指数退避
continue
return False
def _process_tar_file(self, tar_path: Path, extract_path: Path):
"""处理tar文件的同步操作"""
with tarfile.open(tar_path, 'r:gz') as tar:
tar.testall() # 验证文件完整性
tar.extractall(path=extract_path) # 解压文件
def process_references(self, doc_structure: DocumentStructure, ref_bib: str) -> DocumentStructure:
"""
Process citations in document structure and add referenced literature for each section
Args:
doc_structure: DocumentStructure object
ref_bib: String containing references separated by newlines
Returns:
Updated DocumentStructure object
"""
try:
# Create a copy to avoid modifying the original
doc = deepcopy(doc_structure)
# Parse references into a mapping
ref_map = self._parse_references(ref_bib)
if not ref_map:
self.logger.warning("No valid references found in ref_bib")
return doc
# Process all sections recursively
self._process_section_references(doc.toc, ref_map)
return doc
except Exception as e:
self.logger.error(f"Error processing references: {str(e)}")
return doc_structure # Return original if processing fails
def _process_section_references(self, sections: List[Section], ref_map: Dict[str, str]) -> None:
"""
Recursively process sections to add references
Args:
sections: List of Section objects
ref_map: Mapping of citation keys to full references
"""
for section in sections:
if section.content:
# Find citations in current section
cited_refs = self.find_citations(section.content)
if cited_refs:
# Get full references for citations
full_refs = []
for ref_key in cited_refs:
ref_text = ref_map.get(ref_key)
if ref_text:
full_refs.append(ref_text)
else:
self.logger.warning(f"Reference not found for citation key: {ref_key}")
# Add references to section content
if full_refs:
section.bibliography = "\n\n".join(full_refs)
# Process subsections recursively
if section.subsections:
self._process_section_references(section.subsections, ref_map)
def _parse_references(self, ref_bib: str) -> Dict[str, str]:
"""
Parse reference string into a mapping of citation keys to full references
Args:
ref_bib: Reference string with references separated by newlines
Returns:
Dict mapping citation keys to full reference text
"""
ref_map = {}
current_ref = []
current_key = None
try:
for line in ref_bib.split('\n'):
line = line.strip()
if not line:
continue
# New reference entry
if line.startswith('@'):
# Save previous reference if exists
if current_key and current_ref:
ref_map[current_key] = '\n'.join(current_ref)
current_ref = []
# Extract key from new reference
key_match = re.search(r'{(.*?),', line)
if key_match:
current_key = key_match.group(1)
current_ref.append(line)
else:
if current_ref is not None:
current_ref.append(line)
# Save last reference
if current_key and current_ref:
ref_map[current_key] = '\n'.join(current_ref)
except Exception as e:
self.logger.error(f"Error parsing references: {str(e)}")
return ref_map
# 编译一次正则表达式以提高效率
@staticmethod
def _clean_citation_key(key: str) -> str:
"""Clean individual citation key."""
return key.strip().strip(',').strip()
def _extract_keys_from_group(self, keys_str: str) -> Set[str]:
"""Extract and clean individual citation keys from a group."""
try:
# 分割多个引用键(支持逗号和分号分隔)
separators = '[,;]'
keys = re.split(separators, keys_str)
# 清理并过滤空键
return {self._clean_citation_key(k) for k in keys if self._clean_citation_key(k)}
except Exception as e:
self.logger.warning(f"Error processing citation group '{keys_str}': {e}")
return set()
def find_citations(self, content: str) -> Set[str]:
"""
Find citation keys in text content in various formats.
Args:
content: Text content to search for citations
Returns:
Set of unique citation keys
Examples:
Supported formats include:
- \cite{key1,key2}
- \cite[p. 1]{key}
- \citep{key}
- \citet{key}
- [cite:key1, key2]
- And many other variants
"""
citations = set()
if not content:
return citations
try:
# 对每个编译好的模式进行搜索
for pattern in COMPILED_PATTERNS:
matches = pattern.finditer(content)
for match in matches:
# 获取捕获组中的引用键
keys_str = match.group(1)
if keys_str:
# 提取并添加所有引用键
new_keys = self._extract_keys_from_group(keys_str)
citations.update(new_keys)
except Exception as e:
self.logger.error(f"Error finding citations: {str(e)}")
# 移除明显无效的键
citations = {key for key in citations
if key and not key.startswith(('\\', '{', '}', '[', ']'))}
return citations
def get_citation_contexts(self, content: str, context_chars: int = 100) -> dict:
"""
Find citations and their surrounding context.
Args:
content: Text content to search for citations
context_chars: Number of characters of context to include before/after
Returns:
Dict mapping citation keys to lists of context strings
"""
contexts = {}
if not content:
return contexts
try:
for pattern in COMPILED_PATTERNS:
matches = pattern.finditer(content)
for match in matches:
# 获取匹配的位置
start = max(0, match.start() - context_chars)
end = min(len(content), match.end() + context_chars)
# 获取上下文
context = content[start:end]
# 获取并处理引用键
keys_str = match.group(1)
keys = self._extract_keys_from_group(keys_str)
# 为每个键添加上下文
for key in keys:
if key not in contexts:
contexts[key] = []
contexts[key].append(context)
except Exception as e:
self.logger.error(f"Error finding citation contexts: {str(e)}")
return contexts
async def process(self, arxiv_id_or_url: str) -> List[SectionFragment]:
"""
Process ArXiv paper and convert to list of SectionFragments.
Each fragment represents the smallest section unit.
Args:
arxiv_id_or_url: ArXiv paper ID or URL
Returns:
List[SectionFragment]: List of processed paper fragments
"""
try:
arxiv_id = self._normalize_arxiv_id(arxiv_id_or_url)
paper_dir = self.root_dir / arxiv_id
# Check if paper directory exists, if not, try to download
if not paper_dir.exists():
self.logger.info(f"Downloading paper {arxiv_id}")
await self.download_paper(arxiv_id, paper_dir)
# Find main TeX file
main_tex = self.tex_processor.find_main_tex_file(str(paper_dir))
if not main_tex:
raise RuntimeError(f"No main TeX file found in {paper_dir}")
# 读取主 TeX 文件内容
main_tex_content = read_tex_file(main_tex)
# Get all related TeX files and references
tex_files = self.tex_processor.resolve_includes(main_tex)
ref_bib = self.tex_processor.resolve_references(main_tex, paper_dir)
if not tex_files:
raise RuntimeError(f"No valid TeX files found for {arxiv_id}")
# Reset document structure for new processing
self.document_structure = DocumentStructure()
# 提取作者信息
author_extractor = LatexAuthorExtractor()
authors = author_extractor.extract_authors(main_tex_content)
self.document_structure.authors = authors # 保存到文档结构中
# Process each TeX file
for file_path in tex_files:
self.logger.info(f"Processing TeX file: {file_path}")
tex_content = read_tex_file(file_path)
if tex_content:
additional_doc = self.document_parser.parse(tex_content)
self.document_structure = self.document_structure.merge(additional_doc)
# Process references if available
if ref_bib:
self.document_structure = self.process_references(self.document_structure, ref_bib)
self.logger.info("Successfully processed references")
else:
self.logger.info("No references found to process")
# Generate table of contents once
section_tree = self.document_structure.generate_toc_tree()
# Convert DocumentStructure to SectionFragments
fragments = self._convert_to_fragments(
doc_structure=self.document_structure,
arxiv_id=arxiv_id,
section_tree=section_tree
)
return fragments
except Exception as e:
self.logger.error(f"Failed to process {arxiv_id_or_url}: {str(e)}")
raise
def _convert_to_fragments(self,
doc_structure: DocumentStructure,
arxiv_id: str,
section_tree: str) -> List[SectionFragment]:
"""
Convert DocumentStructure to list of SectionFragments.
Creates a fragment for each leaf section in the document hierarchy.
Args:
doc_structure: Source DocumentStructure
arxiv_id: ArXiv paper ID
section_tree: Pre-generated table of contents tree
Returns:
List[SectionFragment]: List of paper fragments
"""
fragments = []
# Create a base template for all fragments to avoid repetitive assignments
base_fragment_template = {
'title': doc_structure.title,
'authors': doc_structure.authors,
'abstract': doc_structure.abstract,
'catalogs': section_tree,
'arxiv_id': arxiv_id
}
def get_leaf_sections(section: Section, path: List[str] = None) -> None:
"""
Recursively find all leaf sections and create fragments.
A leaf section is one that has content but no subsections, or has neither.
Args:
section: Current section being processed
path: List of section titles forming the path to current section
"""
if path is None:
path = []
current_path = path + [section.title]
if not section.subsections:
# This is a leaf section, create a fragment if it has content
if section.content or section.bibliography:
fragment = SectionFragment(
**base_fragment_template,
current_section="/".join(current_path),
content=self._clean_content(section.content),
bibliography=section.bibliography
)
if self._validate_fragment(fragment):
fragments.append(fragment)
else:
# Process each subsection
for subsection in section.subsections:
get_leaf_sections(subsection, current_path)
# Process all top-level sections
for section in doc_structure.toc:
get_leaf_sections(section)
# Add a fragment for the abstract if it exists
if doc_structure.abstract:
abstract_fragment = SectionFragment(
**base_fragment_template,
current_section="Abstract",
content=self._clean_content(doc_structure.abstract)
)
if self._validate_fragment(abstract_fragment):
fragments.insert(0, abstract_fragment)
self.logger.info(f"Created {len(fragments)} fragments")
return fragments
def _validate_fragment(self, fragment: SectionFragment) -> bool:
"""
Validate if the fragment has all required fields with meaningful content.
Args:
fragment: SectionFragment to validate
Returns:
bool: True if fragment is valid, False otherwise
"""
try:
return all([
fragment.title.strip(),
fragment.catalogs.strip(),
fragment.current_section.strip(),
fragment.content.strip() or fragment.bibliography.strip()
])
except AttributeError:
return False
def _clean_content(self, content: str) -> str:
"""
Clean and normalize content text.
Args:
content: Raw content text
Returns:
str: Cleaned content text
"""
if not content:
return ""
# Remove excessive whitespace
content = re.sub(r'\s+', ' ', content)
# Remove remaining LaTeX artifacts
content = re.sub(r'\\item\s*', '', content) # Convert \item to bullet points
content = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', content) # Remove simple LaTeX commands
# Clean special characters
content = content.replace('\\\\', '\n') # Convert LaTeX newlines to actual newlines
content = re.sub(r'\s*\n\s*', '\n', content) # Clean up newlines
return content.strip()
def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, List[Path]]:
"""
同步处理 ArXiv 文档并返回分割后的片段
Args:
splitter: ArxivSplitter 实例
arxiv_id: ArXiv 文档ID
Returns:
list: 分割后的文档片段列表
"""
try:
from crazy_functions.doc_fns.tex_html_formatter import PaperHtmlFormatter
# 创建一个异步函数来执行异步操作
async def _process():
return await splitter.process(arxiv_id)
# 使用 asyncio.run() 运行异步函数
output_files=[]
fragments = asyncio.run(_process())
file_save_path = splitter.root_dir / "arxiv_fragments"
# 保存片段到文件
try:
md_output_dir = save_fragments_to_file(
fragments,
output_dir = file_save_path
)
output_files.append(md_output_dir)
except:
pass
# 创建论文格式化器
formatter = PaperContentFormatter()
# 准备元数据
# 创建格式化选项
metadata = PaperMetadata(
title=fragments[0].title,
authors=fragments[0].authors,
abstract=fragments[0].abstract,
catalogs=fragments[0].catalogs,
arxiv_id=fragments[0].arxiv_id
)
# 格式化内容
formatted_content = formatter.format(fragments, metadata)
try:
html_formatter = PaperHtmlFormatter(fragments, file_save_path)
html_output_dir = html_formatter.save_html()
output_files.append(html_output_dir)
except:
pass
return fragments, formatted_content, output_files
except Exception as e:
print(f"✗ Processing failed for {arxiv_id}: {str(e)}")
raise
def test_arxiv_splitter():
"""测试ArXiv分割器的功能"""
# 测试配置
test_cases = [
{
"arxiv_id": "2411.03663",
"expected_title": "Large Language Models and Simple Scripts",
"min_fragments": 10,
},
# {
# "arxiv_id": "1805.10988",
# "expected_title": "RAG vs Fine-tuning",
# "min_fragments": 15,
# }
]
# 创建分割器实例
splitter = ArxivSplitter(
root_dir="private_upload/default_user"
)
for case in test_cases:
print(f"\nTesting paper: {case['arxiv_id']}")
try:
# fragments = await splitter.process(case['arxiv_id'])
fragments, formatted_content, output_dir = process_arxiv_sync(splitter, case['arxiv_id'])
# 保存fragments
for fragment in fragments:
# 长度检查
print((fragment.content))
print(len(fragment.content))
# 类型检查
print(output_dir)
except Exception as e:
print(f"✗ Test failed for {case['arxiv_id']}: {str(e)}")
raise
if __name__ == "__main__":
test_arxiv_splitter()

View File

@@ -1,177 +0,0 @@
import re
from typing import Optional
class LatexAuthorExtractor:
def __init__(self):
# Patterns for matching author blocks with balanced braces
self.author_block_patterns = [
# Standard LaTeX patterns with optional arguments
r'\\author(?:\s*\[[^\]]*\])?\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\(?:title)?author[s]?\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\name[s]?\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\Author[s]?\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\AUTHOR[S]?\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
# Conference and journal specific patterns
r'\\addauthor\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\IEEEauthor\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\speaker\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\authorrunning\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
# Academic publisher specific patterns
r'\\alignauthor\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\spauthor\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
r'\\authors\s*\{((?:[^{}]|{(?:[^{}]|{[^{}]*})*})*)\}',
]
# Cleaning patterns for LaTeX commands and formatting
self.cleaning_patterns = [
# Text formatting commands - preserve content
(r'\\textbf\{([^}]+)\}', r'\1'),
(r'\\textit\{([^}]+)\}', r'\1'),
(r'\\emph\{([^}]+)\}', r'\1'),
(r'\\texttt\{([^}]+)\}', r'\1'),
(r'\\textrm\{([^}]+)\}', r'\1'),
(r'\\text\{([^}]+)\}', r'\1'),
# Affiliation and footnote markers
(r'\$\^{[^}]+}\$', ''),
(r'\^{[^}]+}', ''),
(r'\\thanks\{[^}]+\}', ''),
(r'\\footnote\{[^}]+\}', ''),
# Email and contact formatting
(r'\\email\{([^}]+)\}', r'\1'),
(r'\\href\{[^}]+\}\{([^}]+)\}', r'\1'),
# Institution formatting
(r'\\inst\{[^}]+\}', ''),
(r'\\affil\{[^}]+\}', ''),
# Special characters and symbols
(r'\\&', '&'),
(r'\\\\\s*', ' '),
(r'\\,', ' '),
(r'\\;', ' '),
(r'\\quad', ' '),
(r'\\qquad', ' '),
# Math mode content
(r'\$[^$]+\$', ''),
# Common symbols
(r'\\dagger', ''),
(r'\\ddagger', ''),
(r'\\ast', '*'),
(r'\\star', ''),
# Remove remaining LaTeX commands
(r'\\[a-zA-Z]+', ''),
# Clean up remaining special characters
(r'[\\{}]', '')
]
def extract_author_block(self, text: str) -> Optional[str]:
"""
Extract the complete author block from LaTeX text.
Args:
text (str): Input LaTeX text
Returns:
Optional[str]: Extracted author block or None if not found
"""
try:
if not text:
return None
for pattern in self.author_block_patterns:
match = re.search(pattern, text, re.DOTALL | re.MULTILINE)
if match:
return match.group(1).strip()
return None
except (AttributeError, IndexError) as e:
print(f"Error extracting author block: {e}")
return None
def clean_tex_commands(self, text: str) -> str:
"""
Remove LaTeX commands and formatting from text while preserving content.
Args:
text (str): Text containing LaTeX commands
Returns:
str: Cleaned text with commands removed
"""
if not text:
return ""
cleaned_text = text
# Apply cleaning patterns
for pattern, replacement in self.cleaning_patterns:
cleaned_text = re.sub(pattern, replacement, cleaned_text)
# Clean up whitespace
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
cleaned_text = cleaned_text.strip()
return cleaned_text
def extract_authors(self, text: str) -> Optional[str]:
"""
Extract and clean author information from LaTeX text.
Args:
text (str): Input LaTeX text
Returns:
Optional[str]: Cleaned author information or None if extraction fails
"""
try:
if not text:
return None
# Extract author block
author_block = self.extract_author_block(text)
if not author_block:
return None
# Clean LaTeX commands
cleaned_authors = self.clean_tex_commands(author_block)
return cleaned_authors or None
except Exception as e:
print(f"Error processing text: {e}")
return None
def test_author_extractor():
"""Test the LatexAuthorExtractor with sample inputs."""
test_cases = [
# Basic test case
(r"\author{John Doe}", "John Doe"),
# Test with multiple authors
(r"\author{Alice Smith \and Bob Jones}", "Alice Smith and Bob Jones"),
# Test with affiliations
(r"\author[1]{John Smith}\affil[1]{University}", "John Smith"),
]
extractor = LatexAuthorExtractor()
for i, (input_tex, expected) in enumerate(test_cases, 1):
result = extractor.extract_authors(input_tex)
print(f"\nTest case {i}:")
print(f"Input: {input_tex[:50]}...")
print(f"Expected: {expected[:50]}...")
print(f"Got: {result[:50]}...")
print(f"Pass: {bool(result and result.strip() == expected.strip())}")
if __name__ == "__main__":
test_author_extractor()

View File

@@ -1,290 +0,0 @@
"""
LaTeX Document Parser
This module provides functionality for parsing and extracting structured information from LaTeX documents,
including metadata, document structure, and content. It uses modular design and clean architecture principles.
"""
import logging
import re
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass, field
from typing import List, Dict
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, EnhancedSectionExtractor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def read_tex_file(file_path):
encodings = ['utf-8', 'latin1', 'gbk', 'gb2312', 'ascii']
for encoding in encodings:
try:
with open(file_path, 'r', encoding=encoding) as f:
return f.read()
except UnicodeDecodeError:
continue
@dataclass
class DocumentStructure:
title: str = ''
authors: str = ''
abstract: str = ''
toc: List[Section] = field(default_factory=list)
metadata: Dict[str, str] = field(default_factory=dict)
def merge(self, other: 'DocumentStructure', strategy: str = 'smart') -> 'DocumentStructure':
"""
Merge this document structure with another one.
Args:
other: Another DocumentStructure to merge with
strategy: Merge strategy - 'smart' (default) or 'append'
'smart' - Intelligently merge sections with same titles
'append' - Simply append sections from other document
"""
merged = deepcopy(self)
# Merge title if needed
if not merged.title and other.title:
merged.title = other.title
# Merge abstract
merged.abstract = self._merge_abstract(merged.abstract, other.abstract)
# Merge metadata
merged.metadata.update(other.metadata)
if strategy == 'append':
merged.toc.extend(deepcopy(other.toc))
else: # smart merge
# Create sections lookup for efficient merging
sections_map = {s.title: s for s in merged.toc}
for other_section in other.toc:
if other_section.title in sections_map:
# Merge existing section
idx = next(i for i, s in enumerate(merged.toc)
if s.title == other_section.title)
merged.toc[idx] = merged.toc[idx].merge(other_section)
else:
# Add new section
merged.toc.append(deepcopy(other_section))
return merged
@staticmethod
def _merge_abstract(abstract1: str, abstract2: str) -> str:
"""Merge abstracts intelligently."""
if not abstract1:
return abstract2
if not abstract2:
return abstract1
# Combine non-empty abstracts with a separator
return f"{abstract1}\n\n{abstract2}"
def generate_toc_tree(self, indent_char: str = " ", abstract_preview_length: int = 0) -> str:
"""
Generate a tree-like string representation of the table of contents including abstract.
Args:
indent_char: Character(s) used for indentation. Default is two spaces.
abstract_preview_length: Maximum length of abstract preview. Default is 200 characters.
Returns:
str: A formatted string showing the hierarchical document structure with abstract
"""
def _format_section(section: Section, level: int = 0) -> str:
# Create the current section line with proper indentation
current_line = f"{indent_char * level}{'' if level > 0 else ''} {section.title}\n"
# Recursively process subsections
subsections = ""
if section.subsections:
subsections = "".join(_format_section(subsec, level + 1)
for subsec in section.subsections)
return current_line + subsections
result = []
# Add document title if it exists
if self.title:
result.append(f"{self.title}\n")
# Add abstract if it exists
if self.abstract:
result.append("\n□ Abstract:")
# Format abstract content with word wrap
abstract_preview = self.abstract[:abstract_preview_length]
if len(self.abstract) > abstract_preview_length:
abstract_preview += "..."
# Split abstract into lines and indent them
wrapped_lines = []
current_line = ""
for word in abstract_preview.split():
if len(current_line) + len(word) + 1 <= 80: # 80 characters per line
current_line = (current_line + " " + word).strip()
else:
wrapped_lines.append(current_line)
current_line = word
if current_line:
wrapped_lines.append(current_line)
# Add formatted abstract lines
for line in wrapped_lines:
result.append(f"\n{indent_char}{line}")
result.append("\n") # Add extra newline after abstract
# Add table of contents header if there are sections
if self.toc:
result.append("\n◈ Table of Contents:\n")
# Add all top-level sections and their subsections
result.extend(_format_section(section, 0) for section in self.toc)
return "".join(result)
class BaseExtractor(ABC):
"""Base class for LaTeX content extractors."""
@abstractmethod
def extract(self, content: str) -> str:
"""Extract specific content from LaTeX document."""
pass
class TitleExtractor(BaseExtractor):
"""Extracts title from LaTeX document."""
PATTERNS = [
r'\\title{(.+?)}',
r'\\title\[.*?\]{(.+?)}',
r'\\Title{(.+?)}',
r'\\TITLE{(.+?)}',
r'\\begin{document}\s*\\section[*]?{(.+?)}',
r'\\maketitle\s*\\section[*]?{(.+?)}',
r'\\chapter[*]?{(.+?)}'
]
def extract(self, content: str) -> str:
"""Extract title using defined patterns."""
for pattern in self.PATTERNS:
matches = list(re.finditer(pattern, content, re.IGNORECASE | re.DOTALL))
for match in matches:
title = match.group(1).strip()
if title:
return clean_latex_commands(title)
return ''
class AbstractExtractor(BaseExtractor):
"""Extracts abstract from LaTeX document."""
PATTERNS = [
r'\\begin{abstract}(.*?)\\end{abstract}',
r'\\abstract{(.*?)}',
r'\\ABSTRACT{(.*?)}',
r'\\Abstract{(.*?)}',
r'\\begin{Abstract}(.*?)\\end{Abstract}',
r'\\section[*]?{(?:Abstract|ABSTRACT)}\s*(.*?)(?:\\section|\Z)',
r'\\chapter[*]?{(?:Abstract|ABSTRACT)}\s*(.*?)(?:\\chapter|\Z)'
]
def extract(self, content: str) -> str:
"""Extract abstract using defined patterns."""
for pattern in self.PATTERNS:
matches = list(re.finditer(pattern, content, re.IGNORECASE | re.DOTALL))
for match in matches:
abstract = match.group(1).strip()
if abstract:
return clean_latex_commands(abstract)
return ''
class EssayStructureParser:
"""Main class for parsing LaTeX documents."""
def __init__(self):
self.title_extractor = TitleExtractor()
self.abstract_extractor = AbstractExtractor()
self.section_extractor = EnhancedSectionExtractor() # Using the enhanced extractor
def parse(self, content: str) -> DocumentStructure:
"""Parse LaTeX document and extract structured information."""
try:
content = self._preprocess_content(content)
return DocumentStructure(
title=self.title_extractor.extract(content),
abstract=self.abstract_extractor.extract(content),
toc=self.section_extractor.extract(content)
)
except Exception as e:
logger.error(f"Error parsing LaTeX document: {str(e)}")
raise
def _preprocess_content(self, content: str) -> str:
"""Preprocess LaTeX content for parsing."""
# Remove comments
content = re.sub(r'(?<!\\)%.*$', '', content, flags=re.MULTILINE)
return content
def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100):
"""Print document structure in a readable format."""
print(f"Title: {doc.title}\n")
print(f"Abstract: {doc.abstract}\n")
print("Table of Contents:")
def print_section(section: Section, indent: int = 0):
print(" " * indent + f"- {section.title}")
if section.content:
preview = section.content[:max_content_length]
if len(section.content) > max_content_length:
preview += "..."
print(" " * (indent + 1) + f"Content: {preview}")
for subsection in section.subsections:
print_section(subsection, indent + 1)
for section in doc.toc:
print_section(section)
# Example usage:
if __name__ == "__main__":
# Test with a file
file_path = 'test_cache/2411.03663/neurips_2024.tex'
main_tex = read_tex_file(file_path)
# Parse main file
parser = EssayStructureParser()
main_doc = parser.parse(main_tex)
# Merge other documents
file_path_list = [
"test_cache/2411.03663/1_intro.tex",
"test_cache/2411.03663/0_abstract.tex",
"test_cache/2411.03663/2_pre.tex",
"test_cache/2411.03663/3_method.tex",
"test_cache/2411.03663/4_experiment.tex",
"test_cache/2411.03663/5_related_work.tex",
"test_cache/2411.03663/6_conclu.tex",
"test_cache/2411.03663/reference.bib"
]
for file_path in file_path_list:
tex_content = read_tex_file(file_path)
additional_doc = parser.parse(tex_content)
main_doc = main_doc.merge(additional_doc)
tree = main_doc.generate_toc_tree()
pretty_print_structure(main_doc)

View File

@@ -1,329 +0,0 @@
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from functools import lru_cache
from typing import Set, Dict, Pattern, Optional, List, Tuple
class EnvType(Enum):
"""Environment classification types."""
PRESERVE = "preserve" # Preserve complete environment including commands
REMOVE = "remove" # Remove environment completely
EXTRACT = "extract" # Extract and clean content
@dataclass
class LatexConfig:
"""Configuration for LaTeX processing."""
preserve_envs: Set[str] = field(default_factory=lambda: {
# Math environments - preserve complete content
'equation', 'equation*', 'align', 'align*', 'displaymath',
'math', 'eqnarray', 'eqnarray*', 'gather', 'gather*',
'multline', 'multline*', 'flalign', 'flalign*',
'alignat', 'alignat*', 'cases', 'split', 'aligned',
# Tables and figures - preserve structure and content
'table', 'table*', 'tabular', 'tabularx', 'array', 'matrix',
'figure', 'figure*', 'subfigure', 'wrapfigure',
'minipage', 'tabbing', 'verbatim', 'longtable',
'sidewaystable', 'sidewaysfigure', 'floatrow',
# Arrays and matrices
'pmatrix', 'bmatrix', 'Bmatrix', 'vmatrix', 'Vmatrix',
'smallmatrix', 'array', 'matrix*', 'pmatrix*', 'bmatrix*',
# Algorithms and code
'algorithm', 'algorithmic', 'lstlisting', 'verbatim',
'minted', 'listing', 'algorithmic*', 'algorithm2e',
# Theorems and proofs
'theorem', 'proof', 'definition', 'lemma', 'corollary',
'proposition', 'example', 'remark', 'note', 'claim',
'axiom', 'property', 'assumption', 'conjecture', 'observation',
# Bibliography
'thebibliography', 'bibliography', 'references'
})
# 引用类命令的特殊处理配置
citation_commands: Set[str] = field(default_factory=lambda: {
# Basic citations
'cite', 'citep', 'citet', 'citeyear', 'citeauthor',
'citeyearpar', 'citetext', 'citenum',
# Natbib citations
'citefullauthor', 'citealp', 'citealt', 'citename',
'citepalias', 'citetalias', 'citetext',
# Cross-references
'ref', 'eqref', 'pageref', 'autoref', 'nameref', 'cref',
'Cref', 'vref', 'Vref', 'fref', 'pref',
# Hyperref
'hyperref', 'href', 'url',
# Labels
'label', 'tag'
})
preserve_commands: Set[str] = field(default_factory=lambda: {
# Text formatting
'emph', 'textbf', 'textit', 'underline', 'texttt', 'footnote',
'section', 'subsection', 'subsubsection', 'paragraph', 'part',
'chapter', 'title', 'author', 'date', 'thanks',
# Math operators and symbols
'frac', 'sum', 'int', 'prod', 'lim', 'sup', 'inf',
'partial', 'nabla', 'implies', 'iff', 'therefore',
'exists', 'forall', 'in', 'subset', 'subseteq',
# Greek letters and math symbols
'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta',
'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu',
'nu', 'xi', 'pi', 'rho', 'sigma', 'tau',
'upsilon', 'phi', 'chi', 'psi', 'omega',
'Gamma', 'Delta', 'Theta', 'Lambda', 'Xi', 'Pi',
'Sigma', 'Upsilon', 'Phi', 'Psi', 'Omega',
# Math commands
'left', 'right', 'big', 'Big', 'bigg', 'Bigg',
'mathbf', 'mathit', 'mathsf', 'mathtt', 'mathbb',
'mathcal', 'mathfrak', 'mathscr', 'mathrm', 'mathop',
'operatorname', 'overline', 'underline', 'overbrace',
'underbrace', 'overset', 'underset', 'stackrel',
# Spacing and alignment
'quad', 'qquad', 'hspace', 'vspace', 'medskip',
'bigskip', 'smallskip', 'hfill', 'vfill', 'centering',
'raggedright', 'raggedleft'
})
remove_commands: Set[str] = field(default_factory=lambda: {
# Document setup
'documentclass', 'usepackage', 'input', 'include', 'includeonly',
'bibliographystyle', 'frontmatter', 'mainmatter',
'newtheorem', 'theoremstyle', 'proofname',
'newcommand', 'renewcommand', 'providecommand', 'DeclareMathOperator',
'newenvironment',
# Layout and spacing
'pagestyle', 'thispagestyle', 'newpage', 'clearpage',
'pagebreak', 'linebreak', 'newline', 'setlength',
'setcounter', 'addtocounter', 'makeatletter',
'makeatother', 'pagenumbering'
})
latex_chars: Dict[str, str] = field(default_factory=lambda: {
'~': ' ', '\\&': '&', '\\%': '%', '\\_': '_', '\\$': '$',
'\\#': '#', '\\{': '{', '\\}': '}', '``': '"', "''": '"',
'\\textbackslash': '\\', '\\ldots': '...', '\\dots': '...',
'\\textasciitilde': '~', '\\textasciicircum': '^'
})
# 保留原始格式的特殊命令模式
special_command_patterns: List[Tuple[str, str]] = field(default_factory=lambda: [
(r'\\cite\*?(?:\[[^\]]*\])?{([^}]+)}', r'\\cite{\1}'),
(r'\\ref\*?{([^}]+)}', r'\\ref{\1}'),
(r'\\label{([^}]+)}', r'\\label{\1}'),
(r'\\eqref{([^}]+)}', r'\\eqref{\1}'),
(r'\\autoref{([^}]+)}', r'\\autoref{\1}'),
(r'\\url{([^}]+)}', r'\\url{\1}'),
(r'\\href{([^}]+)}{([^}]+)}', r'\\href{\1}{\2}')
])
class LatexCleaner:
"""Enhanced LaTeX text cleaner that preserves mathematical content and citations."""
def __init__(self, config: Optional[LatexConfig] = None):
self.config = config or LatexConfig()
self.logger = logging.getLogger(__name__)
# 初始化正则表达式缓存
self._regex_cache = {}
@lru_cache(maxsize=128)
def _get_env_pattern(self, env_name: str) -> Pattern:
"""Get cached regex pattern for environment matching."""
return re.compile(fr'\\begin{{{env_name}}}(.*?)\\end{{{env_name}}}', re.DOTALL)
def _get_env_type(self, env_name: str) -> EnvType:
"""Determine environment processing type."""
if env_name.rstrip('*') in {name.rstrip('*') for name in self.config.preserve_envs}:
return EnvType.PRESERVE
elif env_name in {'comment'}:
return EnvType.REMOVE
return EnvType.EXTRACT
def _preserve_special_commands(self, text: str) -> str:
"""Preserve special commands like citations and references with their complete structure."""
for pattern, replacement in self.config.special_command_patterns:
if pattern not in self._regex_cache:
self._regex_cache[pattern] = re.compile(pattern)
def replace_func(match):
# 保持原始命令格式
return match.group(0)
text = self._regex_cache[pattern].sub(replace_func, text)
return text
def _process_environment(self, match: re.Match) -> str:
"""Process LaTeX environments while preserving complete content for special environments."""
try:
env_name = match.group(1)
content = match.group(2)
env_type = self._get_env_type(env_name)
if env_type == EnvType.PRESERVE:
# 完整保留环境内容
complete_env = match.group(0)
return f"\n[BEGIN_{env_name}]\n{complete_env}\n[END_{env_name}]\n"
elif env_type == EnvType.REMOVE:
return ' '
else:
# 处理嵌套环境
return self._clean_nested_environments(content)
except Exception as e:
self.logger.error(f"Error processing environment {match.group(1) if match else 'unknown'}: {e}")
return match.group(0)
def _preserve_inline_math(self, text: str) -> str:
"""Preserve complete inline math content."""
def preserve_math(match):
return f" {match.group(0)} "
patterns = [
(r'\$[^$]+\$', preserve_math),
(r'\\[\(\[].*?\\[\)\]]', preserve_math),
(r'\\begin{math}.*?\\end{math}', preserve_math)
]
for pattern, handler in patterns:
if pattern not in self._regex_cache:
self._regex_cache[pattern] = re.compile(pattern, re.DOTALL)
text = self._regex_cache[pattern].sub(handler, text)
return text
def _clean_nested_environments(self, text: str) -> str:
"""Process nested environments recursively."""
pattern = r'\\begin{(\w+)}(.*?)\\end{\1}'
if pattern not in self._regex_cache:
self._regex_cache[pattern] = re.compile(pattern, re.DOTALL)
return self._regex_cache[pattern].sub(self._process_environment, text)
def _clean_commands(self, text: str) -> str:
"""Clean LaTeX commands while preserving important content."""
# 首先处理特殊命令
text = self._preserve_special_commands(text)
# 保留内联数学
text = self._preserve_inline_math(text)
# 移除指定的命令
for cmd in self.config.remove_commands:
if cmd not in self._regex_cache:
self._regex_cache[cmd] = re.compile(
fr'\\{cmd}\*?(?:\[.*?\])?(?:{{.*?}})*'
)
text = self._regex_cache[cmd].sub('', text)
# 处理带内容的命令
def handle_command(match: re.Match) -> str:
cmd = match.group(1).rstrip('*')
if cmd in self.config.preserve_commands or cmd in self.config.citation_commands:
return match.group(0) # 完整保留命令和内容
return ' '
if 'command_pattern' not in self._regex_cache:
self._regex_cache['command_pattern'] = re.compile(
r'\\(\w+)\*?(?:\[.*?\])?{(.*?)}'
)
text = self._regex_cache['command_pattern'].sub(handle_command, text)
return text
def _normalize_text(self, text: str) -> str:
"""Normalize text while preserving special content markers."""
# 替换特殊字符
for char, replacement in self.config.latex_chars.items():
text = text.replace(char, replacement)
# 清理空白字符,同时保留环境标记
text = re.sub(r'\s+', ' ', text)
text = re.sub(r'\s*\[BEGIN_(\w+)\]\s*', r'\n[BEGIN_\1]\n', text)
text = re.sub(r'\s*\[END_(\w+)\]\s*', r'\n[END_\1]\n', text)
# 保持块级环境之间的分隔
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
def clean_text(self, text: str) -> str:
"""Clean LaTeX text while preserving mathematical content, citations, and special environments."""
if not text:
return ""
try:
# 移除注释
text = re.sub(r'(?<!\\)%.*?(?=\n|$)', '', text, flags=re.MULTILINE)
# 处理环境
text = self._clean_nested_environments(text)
# 清理命令并规范化
text = self._clean_commands(text)
text = self._normalize_text(text)
return text
except Exception as e:
self.logger.error(f"Error cleaning text: {e}")
return text # 发生错误时返回原始文本
def clean_latex_commands(text: str) -> str:
"""Convenience function for quick text cleaning with default config."""
cleaner = LatexCleaner()
return cleaner.clean_text(text)
# Example usage:
if __name__ == "__main__":
text = r"""
\documentclass{article}
\begin{document}
\section{Introduction}
This is a reference to \cite{smith2020} and equation \eqref{eq:main}.
\begin{equation}\label{eq:main}
E = mc^2 \times \sum_{i=1}^{n} x_i
\end{equation}
See Figure \ref{fig:example} for details.
\begin{figure}
\includegraphics{image.png}
\caption{Example figure\label
\textbf{Important} result: $E=mc^2$ and
\begin{equation}
F = ma
\end{equation}
\label{sec:intro}
"""
# Custom configuration
config = LatexConfig(
preserve_envs={},
preserve_commands={'textbf', 'emph'},
latex_chars={'~': ' ', '\\&': '&'}
)
def read_tex_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
return content
except FileNotFoundError:
return "文件未找到,请检查路径是否正确。"
except Exception as e:
return f"读取文件时发生错误: {e}"
# 使用函数
file_path = 'test_cache/2411.03663/neurips_2024.tex'
content = read_tex_file(file_path)
cleaner = LatexCleaner(config)
text = cleaner.clean_text(text)
print(text)

View File

@@ -1,396 +0,0 @@
from dataclasses import dataclass
@dataclass
class LaTeXPatterns:
"""LaTeX模式存储类用于集中管理所有LaTeX相关的正则表达式模式"""
special_envs = {
'math': [
# 基础数学环境
r'\\begin{(equation|align|gather|eqnarray|multline|flalign|alignat)\*?}.*?\\end{\1\*?}',
r'\$\$.*?\$\$',
r'\$[^$]+\$',
# 矩阵环境
r'\\begin{(matrix|pmatrix|bmatrix|Bmatrix|vmatrix|Vmatrix|smallmatrix)\*?}.*?\\end{\1\*?}',
# 数组环境
r'\\begin{(array|cases|aligned|gathered|split)\*?}.*?\\end{\1\*?}',
# 其他数学环境
r'\\begin{(subequations|math|displaymath)\*?}.*?\\end{\1\*?}'
],
'table': [
# 基础表格环境
r'\\begin{(table|tabular|tabularx|tabulary|longtable)\*?}.*?\\end{\1\*?}',
# 复杂表格环境
r'\\begin{(tabu|supertabular|xtabular|mpsupertabular)\*?}.*?\\end{\1\*?}',
# 自定义表格环境
r'\\begin{(threeparttable|tablefootnote)\*?}.*?\\end{\1\*?}',
# 表格注释环境
r'\\begin{(tablenotes)\*?}.*?\\end{\1\*?}'
],
'figure': [
# 图片环境
r'\\begin{figure\*?}.*?\\end{figure\*?}',
r'\\begin{(subfigure|wrapfigure)\*?}.*?\\end{\1\*?}',
# 图片插入命令
r'\\includegraphics(\[.*?\])?\{.*?\}',
# tikz 图形环境
r'\\begin{(tikzpicture|pgfpicture)\*?}.*?\\end{\1\*?}',
# 其他图形环境
r'\\begin{(picture|pspicture)\*?}.*?\\end{\1\*?}'
],
'algorithm': [
# 算法环境
r'\\begin{(algorithm|algorithmic|algorithm2e|algorithmicx)\*?}.*?\\end{\1\*?}',
r'\\begin{(lstlisting|verbatim|minted|listing)\*?}.*?\\end{\1\*?}',
# 代码块环境
r'\\begin{(code|verbatimtab|verbatimwrite)\*?}.*?\\end{\1\*?}',
# 伪代码环境
r'\\begin{(pseudocode|procedure)\*?}.*?\\end{\1\*?}'
],
'list': [
# 列表环境
r'\\begin{(itemize|enumerate|description)\*?}.*?\\end{\1\*?}',
r'\\begin{(list|compactlist|bulletlist)\*?}.*?\\end{\1\*?}',
# 自定义列表环境
r'\\begin{(tasks|todolist)\*?}.*?\\end{\1\*?}'
],
'theorem': [
# 定理类环境
r'\\begin{(theorem|lemma|proposition|corollary)\*?}.*?\\end{\1\*?}',
r'\\begin{(definition|example|proof|remark)\*?}.*?\\end{\1\*?}',
# 其他证明环境
r'\\begin{(axiom|property|assumption|conjecture)\*?}.*?\\end{\1\*?}'
],
'box': [
# 文本框环境
r'\\begin{(tcolorbox|mdframed|framed|shaded)\*?}.*?\\end{\1\*?}',
r'\\begin{(boxedminipage|shadowbox)\*?}.*?\\end{\1\*?}',
# 强调环境
r'\\begin{(important|warning|info|note)\*?}.*?\\end{\1\*?}'
],
'quote': [
# 引用环境
r'\\begin{(quote|quotation|verse|abstract)\*?}.*?\\end{\1\*?}',
r'\\begin{(excerpt|epigraph)\*?}.*?\\end{\1\*?}'
],
'bibliography': [
# 参考文献环境
r'\\begin{(thebibliography|bibliography)\*?}.*?\\end{\1\*?}',
r'\\begin{(biblist|citelist)\*?}.*?\\end{\1\*?}'
],
'index': [
# 索引环境
r'\\begin{(theindex|printindex)\*?}.*?\\end{\1\*?}',
r'\\begin{(glossary|acronym)\*?}.*?\\end{\1\*?}'
]
}
# 章节模式
section_patterns = [
# 基础章节命令
r'\\chapter\{([^}]+)\}',
r'\\section\{([^}]+)\}',
r'\\subsection\{([^}]+)\}',
r'\\subsubsection\{([^}]+)\}',
r'\\paragraph\{([^}]+)\}',
r'\\subparagraph\{([^}]+)\}',
# 带星号的变体(不编号)
r'\\chapter\*\{([^}]+)\}',
r'\\section\*\{([^}]+)\}',
r'\\subsection\*\{([^}]+)\}',
r'\\subsubsection\*\{([^}]+)\}',
r'\\paragraph\*\{([^}]+)\}',
r'\\subparagraph\*\{([^}]+)\}',
# 特殊章节
r'\\part\{([^}]+)\}',
r'\\part\*\{([^}]+)\}',
r'\\appendix\{([^}]+)\}',
# 前言部分
r'\\frontmatter\{([^}]+)\}',
r'\\mainmatter\{([^}]+)\}',
r'\\backmatter\{([^}]+)\}',
# 目录相关
r'\\tableofcontents',
r'\\listoffigures',
r'\\listoftables',
# 自定义章节命令
r'\\addchap\{([^}]+)\}', # KOMA-Script类
r'\\addsec\{([^}]+)\}', # KOMA-Script类
r'\\minisec\{([^}]+)\}', # KOMA-Script类
# 带可选参数的章节命令
r'\\chapter\[([^]]+)\]\{([^}]+)\}',
r'\\section\[([^]]+)\]\{([^}]+)\}',
r'\\subsection\[([^]]+)\]\{([^}]+)\}'
]
# 包含模式
include_patterns = [
r'\\(input|include|subfile)\{([^}]+)\}'
]
metadata_patterns = {
# 标题相关
'title': [
r'\\title\{([^}]+)\}',
r'\\Title\{([^}]+)\}',
r'\\doctitle\{([^}]+)\}',
r'\\subtitle\{([^}]+)\}',
r'\\chapter\*?\{([^}]+)\}', # 第一章可能作为标题
r'\\maketitle\s*\\section\*?\{([^}]+)\}' # 第一节可能作为标题
],
# 摘要相关
'abstract': [
r'\\begin{abstract}(.*?)\\end{abstract}',
r'\\abstract\{([^}]+)\}',
r'\\begin{摘要}(.*?)\\end{摘要}',
r'\\begin{Summary}(.*?)\\end{Summary}',
r'\\begin{synopsis}(.*?)\\end{synopsis}',
r'\\begin{abstracten}(.*?)\\end{abstracten}' # 英文摘要
],
# 作者信息
'author': [
r'\\author\{([^}]+)\}',
r'\\Author\{([^}]+)\}',
r'\\authorinfo\{([^}]+)\}',
r'\\authors\{([^}]+)\}',
r'\\author\[([^]]+)\]\{([^}]+)\}', # 带附加信息的作者
r'\\begin{authors}(.*?)\\end{authors}'
],
# 日期相关
'date': [
r'\\date\{([^}]+)\}',
r'\\Date\{([^}]+)\}',
r'\\submitdate\{([^}]+)\}',
r'\\publishdate\{([^}]+)\}',
r'\\revisiondate\{([^}]+)\}'
],
# 关键词
'keywords': [
r'\\keywords\{([^}]+)\}',
r'\\Keywords\{([^}]+)\}',
r'\\begin{keywords}(.*?)\\end{keywords}',
r'\\key\{([^}]+)\}',
r'\\begin{关键词}(.*?)\\end{关键词}'
],
# 机构/单位
'institution': [
r'\\institute\{([^}]+)\}',
r'\\institution\{([^}]+)\}',
r'\\affiliation\{([^}]+)\}',
r'\\organization\{([^}]+)\}',
r'\\department\{([^}]+)\}'
],
# 学科/主题
'subject': [
r'\\subject\{([^}]+)\}',
r'\\Subject\{([^}]+)\}',
r'\\field\{([^}]+)\}',
r'\\discipline\{([^}]+)\}'
],
# 版本信息
'version': [
r'\\version\{([^}]+)\}',
r'\\revision\{([^}]+)\}',
r'\\release\{([^}]+)\}'
],
# 许可证/版权
'license': [
r'\\license\{([^}]+)\}',
r'\\copyright\{([^}]+)\}',
r'\\begin{license}(.*?)\\end{license}'
],
# 联系方式
'contact': [
r'\\email\{([^}]+)\}',
r'\\phone\{([^}]+)\}',
r'\\address\{([^}]+)\}',
r'\\contact\{([^}]+)\}'
],
# 致谢
'acknowledgments': [
r'\\begin{acknowledgments}(.*?)\\end{acknowledgments}',
r'\\acknowledgments\{([^}]+)\}',
r'\\thanks\{([^}]+)\}',
r'\\begin{致谢}(.*?)\\end{致谢}'
],
# 项目/基金
'funding': [
r'\\funding\{([^}]+)\}',
r'\\grant\{([^}]+)\}',
r'\\project\{([^}]+)\}',
r'\\support\{([^}]+)\}'
],
# 分类号/编号
'classification': [
r'\\classification\{([^}]+)\}',
r'\\serialnumber\{([^}]+)\}',
r'\\id\{([^}]+)\}',
r'\\doi\{([^}]+)\}'
],
# 语言
'language': [
r'\\documentlanguage\{([^}]+)\}',
r'\\lang\{([^}]+)\}',
r'\\language\{([^}]+)\}'
]
}
latex_only_patterns = {
# 文档类和包引入
r'\\documentclass(\[.*?\])?\{.*?\}',
r'\\usepackage(\[.*?\])?\{.*?\}',
# 常见的文档设置命令
r'\\setlength\{.*?\}\{.*?\}',
r'\\newcommand\{.*?\}(\[.*?\])?\{.*?\}',
r'\\renewcommand\{.*?\}(\[.*?\])?\{.*?\}',
r'\\definecolor\{.*?\}\{.*?\}\{.*?\}',
# 页面设置相关
r'\\pagestyle\{.*?\}',
r'\\thispagestyle\{.*?\}',
# 其他常见的设置命令
r'\\bibliographystyle\{.*?\}',
r'\\bibliography\{.*?\}',
r'\\setcounter\{.*?\}\{.*?\}',
# 字体和文本设置命令
r'\\makeFNbottom',
r'\\@setfontsize\\[A-Z]+\{.*?\}\{.*?\}', # 匹配字体大小设置
r'\\renewcommand\\[A-Z]+\{\\@setfontsize\\[A-Z]+\{.*?\}\{.*?\}\}',
r'\\renewcommand\{?\\thefootnote\}?\{\\fnsymbol\{footnote\}\}',
r'\\renewcommand\\footnoterule\{.*?\}',
r'\\color\{.*?\}',
# 页面和节标题设置
r'\\setcounter\{secnumdepth\}\{.*?\}',
r'\\renewcommand\\@biblabel\[.*?\]\{.*?\}',
r'\\renewcommand\\@makefntext\[.*?\](\{.*?\})*',
r'\\renewcommand\{?\\figurename\}?\{.*?\}',
# 字体样式设置
r'\\sectionfont\{.*?\}',
r'\\subsectionfont\{.*?\}',
r'\\subsubsectionfont\{.*?\}',
# 间距和布局设置
r'\\setstretch\{.*?\}',
r'\\setlength\{\\skip\\footins\}\{.*?\}',
r'\\setlength\{\\footnotesep\}\{.*?\}',
r'\\setlength\{\\jot\}\{.*?\}',
r'\\hrule\s+width\s+.*?\s+height\s+.*?',
# makeatletter 和 makeatother
r'\\makeatletter\s*',
r'\\makeatother\s*',
r'\\footnotetext\{[^}]*\$\^{[^}]*}\$[^}]*\}', # 带有上标的脚注
# r'\\footnotetext\{[^}]*\}', # 普通脚注
# r'\\footnotetext\{.*?(?:\$\^{.*?}\$)?.*?(?:email\s*:\s*[^}]*)?.*?\}', # 带有邮箱的脚注
# r'\\footnotetext\{.*?(?:ESI|DOI).*?\}', # 带有 DOI 或 ESI 引用的脚注
# 文档结构命令
r'\\begin\{document\}',
r'\\end\{document\}',
r'\\maketitle',
r'\\printbibliography',
r'\\newpage',
# 输入文件命令
r'\\input\{[^}]*\}',
r'\\input\{.*?\.tex\}', # 特别匹配 .tex 后缀的输入
# 脚注相关
# r'\\footnotetext\[\d+\]\{[^}]*\}', # 带编号的脚注
# 致谢环境
r'\\begin\{ack\}',
r'\\end\{ack\}',
r'\\begin\{ack\}[^\n]*(?:\n.*?)*?\\end\{ack\}', # 匹配整个致谢环境及其内容
# 其他文档控制命令
r'\\renewcommand\{\\thefootnote\}\{\\fnsymbol\{footnote\}\}',
}
math_envs = [
# 基础数学环境
(r'\\begin{equation\*?}.*?\\end{equation\*?}', 'equation'), # 单行公式
(r'\\begin{align\*?}.*?\\end{align\*?}', 'align'), # 多行对齐公式
(r'\\begin{gather\*?}.*?\\end{gather\*?}', 'gather'), # 多行居中公式
(r'\$\$.*?\$\$', 'display'), # 行间公式
(r'\$.*?\$', 'inline'), # 行内公式
# 矩阵环境
(r'\\begin{matrix}.*?\\end{matrix}', 'matrix'), # 基础矩阵
(r'\\begin{pmatrix}.*?\\end{pmatrix}', 'pmatrix'), # 圆括号矩阵
(r'\\begin{bmatrix}.*?\\end{bmatrix}', 'bmatrix'), # 方括号矩阵
(r'\\begin{vmatrix}.*?\\end{vmatrix}', 'vmatrix'), # 竖线矩阵
(r'\\begin{Vmatrix}.*?\\end{Vmatrix}', 'Vmatrix'), # 双竖线矩阵
(r'\\begin{smallmatrix}.*?\\end{smallmatrix}', 'smallmatrix'), # 小号矩阵
# 数组环境
(r'\\begin{array}.*?\\end{array}', 'array'), # 数组
(r'\\begin{cases}.*?\\end{cases}', 'cases'), # 分段函数
# 多行公式环境
(r'\\begin{multline\*?}.*?\\end{multline\*?}', 'multline'), # 多行单个公式
(r'\\begin{split}.*?\\end{split}', 'split'), # 拆分长公式
(r'\\begin{alignat\*?}.*?\\end{alignat\*?}', 'alignat'), # 对齐环境带间距控制
(r'\\begin{flalign\*?}.*?\\end{flalign\*?}', 'flalign'), # 完全左对齐
# 特殊数学环境
(r'\\begin{subequations}.*?\\end{subequations}', 'subequations'), # 子公式编号
(r'\\begin{gathered}.*?\\end{gathered}', 'gathered'), # 居中对齐组
(r'\\begin{aligned}.*?\\end{aligned}', 'aligned'), # 内部对齐组
# 定理类环境
(r'\\begin{theorem}.*?\\end{theorem}', 'theorem'), # 定理
(r'\\begin{lemma}.*?\\end{lemma}', 'lemma'), # 引理
(r'\\begin{proof}.*?\\end{proof}', 'proof'), # 证明
# 数学模式中的表格环境
(r'\\begin{tabular}.*?\\end{tabular}', 'tabular'), # 表格
(r'\\begin{array}.*?\\end{array}', 'array'), # 数组
# 其他专业数学环境
(r'\\begin{CD}.*?\\end{CD}', 'CD'), # 交换图
(r'\\begin{boxed}.*?\\end{boxed}', 'boxed'), # 带框公式
(r'\\begin{empheq}.*?\\end{empheq}', 'empheq'), # 强调公式
# 化学方程式环境 (需要加载 mhchem 包)
(r'\\begin{reaction}.*?\\end{reaction}', 'reaction'), # 化学反应式
(r'\\ce\{.*?\}', 'chemequation'), # 化学方程式
# 物理单位环境 (需要加载 siunitx 包)
(r'\\SI\{.*?\}\{.*?\}', 'SI'), # 物理单位
(r'\\si\{.*?\}', 'si'), # 单位
# 补充环境
(r'\\begin{equation\+}.*?\\end{equation\+}', 'equation+'), # breqn包的自动换行公式
(r'\\begin{dmath\*?}.*?\\end{dmath\*?}', 'dmath'), # breqn包的显示数学模式
(r'\\begin{dgroup\*?}.*?\\end{dgroup\*?}', 'dgroup'), # breqn包的公式组
]
# 示例使用函数
# 使用示例

View File

@@ -1,416 +0,0 @@
import logging
import re
from copy import deepcopy
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Tuple
# 配置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class SectionLevel(Enum):
CHAPTER = 0
SECTION = 1
SUBSECTION = 2
SUBSUBSECTION = 3
PARAGRAPH = 4
SUBPARAGRAPH = 5
def __lt__(self, other):
if not isinstance(other, SectionLevel):
return NotImplemented
return self.value < other.value
def __le__(self, other):
if not isinstance(other, SectionLevel):
return NotImplemented
return self.value <= other.value
def __gt__(self, other):
if not isinstance(other, SectionLevel):
return NotImplemented
return self.value > other.value
def __ge__(self, other):
if not isinstance(other, SectionLevel):
return NotImplemented
return self.value >= other.value
@dataclass
class Section:
level: SectionLevel
title: str
content: str = ''
bibliography: str = ''
subsections: List['Section'] = field(default_factory=list)
def merge(self, other: 'Section') -> 'Section':
"""Merge this section with another section."""
if self.title != other.title or self.level != other.level:
raise ValueError("Can only merge sections with same title and level")
merged = deepcopy(self)
merged.content = self._merge_content(self.content, other.content)
# Create subsections lookup for efficient merging
subsections_map = {s.title: s for s in merged.subsections}
for other_subsection in other.subsections:
if other_subsection.title in subsections_map:
# Merge existing subsection
idx = next(i for i, s in enumerate(merged.subsections)
if s.title == other_subsection.title)
merged.subsections[idx] = merged.subsections[idx].merge(other_subsection)
else:
# Add new subsection
merged.subsections.append(deepcopy(other_subsection))
return merged
@staticmethod
def _merge_content(content1: str, content2: str) -> str:
"""Merge content strings intelligently."""
if not content1:
return content2
if not content2:
return content1
# Combine non-empty contents with a separator
return f"{content1}\n\n{content2}"
@dataclass
class LatexEnvironment:
"""表示LaTeX环境的数据类"""
name: str
start: int
end: int
content: str
raw: str
class EnhancedSectionExtractor:
"""Enhanced section extractor with comprehensive content handling and hierarchy management."""
def __init__(self, preserve_environments: bool = True):
"""
初始化Section提取器
Args:
preserve_environments: 是否保留特定环境如equation, figure等的原始LaTeX代码
"""
self.preserve_environments = preserve_environments
# Section级别定义
self.section_levels = {
'chapter': SectionLevel.CHAPTER,
'section': SectionLevel.SECTION,
'subsection': SectionLevel.SUBSECTION,
'subsubsection': SectionLevel.SUBSUBSECTION,
'paragraph': SectionLevel.PARAGRAPH,
'subparagraph': SectionLevel.SUBPARAGRAPH
}
# 需要保留的环境类型
self.important_environments = {
'equation', 'equation*', 'align', 'align*',
'figure', 'table', 'algorithm', 'algorithmic',
'definition', 'theorem', 'lemma', 'proof',
'itemize', 'enumerate', 'description'
}
# 改进的section pattern
self.section_pattern = (
r'\\(?P<type>chapter|section|subsection|subsubsection|paragraph|subparagraph)'
r'\*?' # Optional star
r'(?:\[(?P<short>.*?)\])?' # Optional short title
r'{(?P<title>(?:[^{}]|\{[^{}]*\})*?)}' # Main title with nested braces support
)
# 环境匹配模式
self.environment_pattern = (
r'\\begin{(?P<env_name>[^}]+)}'
r'(?P<env_content>.*?)'
r'\\end{(?P=env_name)}'
)
def _find_environments(self, content: str) -> List[LatexEnvironment]:
"""
查找文档中的所有LaTeX环境。
支持嵌套环境的处理。
"""
environments = []
stack = []
# 使用正则表达式查找所有begin和end标记
begin_pattern = r'\\begin{([^}]+)}'
end_pattern = r'\\end{([^}]+)}'
# 组合模式来同时匹配begin和end
tokens = []
for match in re.finditer(fr'({begin_pattern})|({end_pattern})', content):
if match.group(1): # begin标记
tokens.append(('begin', match.group(1), match.start()))
else: # end标记
tokens.append(('end', match.group(2), match.start()))
# 处理环境嵌套
for token_type, env_name, pos in tokens:
if token_type == 'begin':
stack.append((env_name, pos))
elif token_type == 'end' and stack:
if stack[-1][0] == env_name:
start_env_name, start_pos = stack.pop()
env_content = content[start_pos:pos]
raw_content = content[start_pos:pos + len('\\end{' + env_name + '}')]
if start_env_name in self.important_environments:
environments.append(LatexEnvironment(
name=start_env_name,
start=start_pos,
end=pos + len('\\end{' + env_name + '}'),
content=env_content,
raw=raw_content
))
return sorted(environments, key=lambda x: x.start)
def _protect_environments(self, content: str) -> Tuple[str, Dict[str, str]]:
"""
保护重要的LaTeX环境用占位符替换它们。
返回处理后的内容和恢复映射。
"""
environments = self._find_environments(content)
replacements = {}
# 从后向前替换,避免位置改变的问题
for env in reversed(environments):
if env.name in self.important_environments:
placeholder = f'__ENV_{len(replacements)}__'
replacements[placeholder] = env.raw
content = content[:env.start] + placeholder + content[env.end:]
return content, replacements
def _restore_environments(self, content: str, replacements: Dict[str, str]) -> str:
"""
恢复之前保护的环境。
"""
for placeholder, original in replacements.items():
content = content.replace(placeholder, original)
return content
def extract(self, content: str) -> List[Section]:
"""
从LaTeX文档中提取sections及其内容。
Args:
content: LaTeX文档内容
Returns:
List[Section]: 提取的section列表包含层次结构
"""
try:
# 预处理:保护重要环境
if self.preserve_environments:
content, env_replacements = self._protect_environments(content)
# 查找所有sections
sections = self._find_all_sections(content)
if not sections:
return []
# 处理sections
root_sections = self._process_sections(content, sections)
# 如果需要,恢复环境
if self.preserve_environments:
for section in self._traverse_sections(root_sections):
section.content = self._restore_environments(section.content, env_replacements)
return root_sections
except Exception as e:
logger.error(f"Error extracting sections: {str(e)}")
raise
def _find_all_sections(self, content: str) -> List[dict]:
"""查找所有section命令及其位置。"""
sections = []
for match in re.finditer(self.section_pattern, content, re.DOTALL | re.MULTILINE):
section_type = match.group('type').lower()
if section_type not in self.section_levels:
continue
section = {
'type': section_type,
'level': self.section_levels[section_type],
'title': self._clean_title(match.group('title')),
'start': match.start(),
'command_end': match.end(),
}
sections.append(section)
return sorted(sections, key=lambda x: x['start'])
def _process_sections(self, content: str, sections: List[dict]) -> List[Section]:
"""处理sections以构建层次结构和提取内容。"""
# 计算content范围
self._calculate_content_ranges(content, sections)
# 构建层次结构
root_sections = []
section_stack = []
for section_info in sections:
new_section = Section(
level=section_info['level'],
title=section_info['title'],
content=self._extract_clean_content(content, section_info),
subsections=[]
)
# 调整堆栈以找到正确的父section
while section_stack and section_stack[-1].level.value >= new_section.level.value:
section_stack.pop()
if section_stack:
section_stack[-1].subsections.append(new_section)
else:
root_sections.append(new_section)
section_stack.append(new_section)
return root_sections
def _calculate_content_ranges(self, content: str, sections: List[dict]):
for i, current in enumerate(sections):
content_start = current['command_end']
# 找到下一个section无论什么级别
content_end = len(content)
for next_section in sections[i + 1:]:
content_end = next_section['start']
break
current['content_range'] = (content_start, content_end)
def _calculate_content_ranges_with_subsection_content(self, content: str, sections: List[dict]):
"""为每个section计算内容范围。"""
for i, current in enumerate(sections):
content_start = current['command_end']
# 找到下一个同级或更高级的section
content_end = len(content)
for next_section in sections[i + 1:]:
if next_section['level'] <= current['level']:
content_end = next_section['start']
break
current['content_range'] = (content_start, content_end)
def _extract_clean_content(self, content: str, section_info: dict) -> str:
"""提取并清理section内容。"""
start, end = section_info['content_range']
raw_content = content[start:end]
# 清理内容
clean_content = self._clean_content(raw_content)
return clean_content
def _clean_content(self, content: str) -> str:
"""清理LaTeX内容同时保留重要信息。"""
# 移除注释
content = re.sub(r'(?<!\\)%.*?\n', '\n', content)
# LaTeX命令处理规则
replacements = [
# 保留引用
(r'\\cite(?:\[.*?\])?{(.*?)}', r'[cite:\1]'),
# 保留脚注
(r'\\footnote{(.*?)}', r'[footnote:\1]'),
# 处理引用
(r'\\ref{(.*?)}', r'[ref:\1]'),
# 保留URL
(r'\\url{(.*?)}', r'[url:\1]'),
# 保留超链接
(r'\\href{(.*?)}{(.*?)}', r'[\2](\1)'),
# 处理文本格式命令
(r'\\(?:textbf|textit|emph){(.*?)}', r'\1'),
# 保留特殊字符
(r'\\([&%$#_{}])', r'\1'),
]
# 应用所有替换规则
for pattern, replacement in replacements:
content = re.sub(pattern, replacement, content, flags=re.DOTALL)
# 清理多余的空白
content = re.sub(r'\n\s*\n', '\n\n', content)
return content.strip()
def _clean_title(self, title: str) -> str:
"""清理section标题。"""
# 处理嵌套的花括号
while '{' in title:
title = re.sub(r'{([^{}]*)}', r'\1', title)
# 处理LaTeX命令
title = re.sub(r'\\[a-zA-Z]+(?:\[.*?\])?{(.*?)}', r'\1', title)
title = re.sub(r'\\([&%$#_{}])', r'\1', title)
return title.strip()
def _traverse_sections(self, sections: List[Section]) -> List[Section]:
"""遍历所有sections包括子sections"""
result = []
for section in sections:
result.append(section)
result.extend(self._traverse_sections(section.subsections))
return result
def test_enhanced_extractor():
"""使用复杂的测试用例测试提取器。"""
test_content = r"""
\section{Complex Examples}
Here's a complex section with various environments.
\begin{equation}
E = mc^2
\end{equation}
\subsection{Nested Environments}
This subsection has nested environments.
\begin{figure}
\begin{equation*}
f(x) = \int_0^x g(t) dt
\end{equation*}
\caption{A nested equation in a figure}
\end{figure}
"""
extractor = EnhancedSectionExtractor()
sections = extractor.extract(test_content)
def print_section(section, level=0):
print("\n" + " " * level + f"[{section.level.name}] {section.title}")
if section.content:
content_preview = section.content[:150] + "..." if len(section.content) > 150 else section.content
print(" " * (level + 1) + f"Content: {content_preview}")
for subsection in section.subsections:
print_section(subsection, level + 1)
print("\nExtracted Section Structure:")
for section in sections:
print_section(section)
if __name__ == "__main__":
test_enhanced_extractor()

View File

@@ -1,14 +0,0 @@
from dataclasses import dataclass
@dataclass
class SectionFragment:
"""Arxiv论文片段数据类"""
title: str # 论文标题
authors: str
abstract: str # 论文摘要
catalogs: str # 文章各章节的目录结构
arxiv_id: str = "" # 添加 arxiv_id 属性
current_section: str = "Introduction" # 当前片段所属的section或者subsection或者孙subsubsection名字
content: str = '' # 当前片段的内容
bibliography: str = '' # 当前片段的参考文献

View File

@@ -1,266 +0,0 @@
import logging
import os
import re
from pathlib import Path
from typing import List, Set, Optional
from crazy_functions.rag_fns.arxiv_fns.latex_patterns import LaTeXPatterns
class TexUtils:
"""TeX文档处理器类"""
def __init__(self, ):
"""
初始化TeX处理器
Args:
char_range: 字符数范围(最小值, 最大值)
"""
self.logger = logging.getLogger(__name__)
# 初始化LaTeX环境和命令模式
self._init_patterns()
self.latex_only_patterns = LaTeXPatterns.latex_only_patterns
def _init_patterns(self):
"""初始化LaTeX模式匹配规则"""
# 特殊环境模式
self.special_envs = LaTeXPatterns.special_envs
# 章节模式
self.section_patterns = LaTeXPatterns.section_patterns
# 包含模式
self.include_patterns = LaTeXPatterns.include_patterns
# 元数据模式
self.metadata_patterns = LaTeXPatterns.metadata_patterns
def read_file(self, file_path: str) -> Optional[str]:
"""
读取TeX文件内容支持多种编码
Args:
file_path: 文件路径
Returns:
Optional[str]: 文件内容或None
"""
encodings = ['utf-8', 'latin1', 'gbk', 'gb2312', 'ascii']
for encoding in encodings:
try:
with open(file_path, 'r', encoding=encoding) as f:
return f.read()
except UnicodeDecodeError:
continue
self.logger.warning(f"Failed to read {file_path} with all encodings")
return None
def find_main_tex_file(self, directory: str) -> Optional[str]:
"""
查找主TeX文件
Args:
directory: 目录路径
Returns:
Optional[str]: 主文件路径或None
"""
tex_files = list(Path(directory).rglob("*.tex"))
if not tex_files:
return None
# 按优先级查找
for tex_file in tex_files:
content = self.read_file(str(tex_file))
if content:
if r'\documentclass' in content:
return str(tex_file)
if tex_file.name.lower() == 'main.tex':
return str(tex_file)
# 返回最大的tex文件
return str(max(tex_files, key=lambda x: x.stat().st_size))
def resolve_includes(self, tex_file: str, processed: Set[str] = None) -> List[str]:
"""
解析TeX文件中的include引用
Args:
tex_file: TeX文件路径
processed: 已处理的文件集合
Returns:
List[str]: 相关文件路径列表
"""
if processed is None:
processed = set()
if tex_file in processed:
return []
processed.add(tex_file)
result = [tex_file]
content = self.read_file(tex_file)
if not content:
return result
base_dir = Path(tex_file).parent
for pattern in self.include_patterns:
for match in re.finditer(pattern, content):
included_file = match.group(2)
if not included_file.endswith('.tex'):
included_file += '.tex'
full_path = str(base_dir / included_file)
if os.path.exists(full_path) and full_path not in processed:
result.extend(self.resolve_includes(full_path, processed))
return result
def resolve_references(self, tex_file: str, path_dir: str = None) -> str:
"""
解析TeX文件中的参考文献引用返回所有引用文献的内容只保留title、author和journal字段。
如果在tex_file目录下没找到bib文件会在path_dir中查找。
Args:
tex_file: TeX文件路径
path_dir: 额外的参考文献搜索路径
Returns:
str: 所有参考文献内容的字符串,只包含特定字段,不同参考文献之间用空行分隔
"""
all_references = [] # 存储所有参考文献内容
content = self.read_file(tex_file)
if not content:
return ""
# 扩展参考文献引用的模式
bib_patterns = [
r'\\bibliography\{([^}]+)\}',
r'\\addbibresource\{([^}]+)\}',
r'\\bibliographyfile\{([^}]+)\}',
r'\\begin\{thebibliography\}',
r'\\bibinput\{([^}]+)\}',
r'\\newrefsection\{([^}]+)\}'
]
base_dir = Path(tex_file).parent
found_in_tex_dir = False
# 首先在tex文件目录下查找显式引用的bib文件
for pattern in bib_patterns:
for match in re.finditer(pattern, content):
if not match.groups():
continue
bib_files = match.group(1).split(',')
for bib_file in bib_files:
bib_file = bib_file.strip()
if not bib_file.endswith('.bib'):
bib_file += '.bib'
full_path = str(base_dir / bib_file)
if os.path.exists(full_path):
found_in_tex_dir = True
bib_content = self.read_file(full_path)
if bib_content:
processed_refs = self._process_bib_content(bib_content)
all_references.extend(processed_refs)
# 如果在tex文件目录下没找到bib文件且提供了额外搜索路径
if not found_in_tex_dir and path_dir:
search_dir = Path(path_dir)
try:
for bib_path in search_dir.glob('**/*.bib'):
bib_content = self.read_file(str(bib_path))
if bib_content:
processed_refs = self._process_bib_content(bib_content)
all_references.extend(processed_refs)
except Exception as e:
print(f"Error searching in path_dir: {e}")
# 合并所有参考文献内容,用空行分隔
return "\n\n".join(all_references)
def _process_bib_content(self, content: str) -> List[str]:
"""
处理bib文件内容提取每个参考文献的特定字段
Args:
content: bib文件内容
Returns:
List[str]: 处理后的参考文献列表
"""
processed_refs = []
# 匹配完整的参考文献条目
ref_pattern = r'@\w+\{[^@]*\}'
# 匹配参考文献类型和键值
entry_start_pattern = r'@(\w+)\{([^,]*?),'
# 匹配字段
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}'
# 查找所有参考文献条目
for ref_match in re.finditer(ref_pattern, content, re.DOTALL):
ref_content = ref_match.group(0)
# 获取参考文献类型和键值
entry_match = re.match(entry_start_pattern, ref_content)
if not entry_match:
continue
entry_type, cite_key = entry_match.groups()
# 提取需要的字段
needed_fields = {'title': None, 'author': None, 'journal': None}
for field_match in re.finditer(field_pattern, ref_content):
field_name, field_value = field_match.groups()
field_name = field_name.lower()
if field_name in needed_fields:
needed_fields[field_name] = field_value.strip()
# 构建新的参考文献条目
if any(needed_fields.values()): # 如果至少有一个需要的字段
ref_lines = [f"@{entry_type}{{{cite_key},"]
for field_name, field_value in needed_fields.items():
if field_value:
ref_lines.append(f" {field_name}={{{field_value}}},")
ref_lines[-1] = ref_lines[-1][:-1] # 移除最后一个逗号
ref_lines.append("}")
processed_refs.append("\n".join(ref_lines))
return processed_refs
def _extract_inline_references(self, content: str) -> str:
"""
从tex文件内容中提取直接写在文件中的参考文献
Args:
content: tex文件内容
Returns:
str: 提取的参考文献内容,如果没有找到则返回空字符串
"""
# 查找参考文献环境
bib_start = r'\\begin\{thebibliography\}'
bib_end = r'\\end\{thebibliography\}'
start_match = re.search(bib_start, content)
end_match = re.search(bib_end, content)
if start_match and end_match:
return content[start_match.start():end_match.end()]
return ""
def _preprocess_content(self, content: str) -> str:
"""预处理TeX内容"""
# 移除注释
content = re.sub(r'(?m)%.*$', '', content)
# 规范化空白字符
# content = re.sub(r'\s+', ' ', content)
content = re.sub(r'\n\s*\n', '\n\n', content)
return content.strip()

View File

@@ -1,165 +0,0 @@
import atexit
import os
from llama_index.core import Document
from llama_index.core.ingestion import run_transformations
from llama_index.core.schema import TextNode
from loguru import logger
from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
DEFAULT_QUERY_GENERATION_PROMPT = """\
Now, you have context information as below:
---------------------
{context_str}
---------------------
Answer the user request below (use the context information if necessary, otherwise you can ignore them):
---------------------
{query_str}
"""
QUESTION_ANSWER_RECORD = """\
{{
"type": "This is a previous conversation with the user",
"question": "{question}",
"answer": "{answer}",
}}
"""
class SaveLoad():
def does_checkpoint_exist(self, checkpoint_dir=None):
import os, glob
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if not os.path.exists(checkpoint_dir): return False
if len(glob.glob(os.path.join(checkpoint_dir, "*.json"))) == 0: return False
return True
def save_to_checkpoint(self, checkpoint_dir=None):
logger.info(f'saving vector store to: {checkpoint_dir}')
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
self.vs_index.storage_context.persist(persist_dir=checkpoint_dir)
def load_from_checkpoint(self, checkpoint_dir=None):
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if self.does_checkpoint_exist(checkpoint_dir=checkpoint_dir):
logger.info('loading checkpoint from disk')
from llama_index.core import StorageContext, load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir=checkpoint_dir)
self.vs_index = load_index_from_storage(storage_context, embed_model=self.embed_model)
return self.vs_index
else:
return self.create_new_vs()
def create_new_vs(self):
return GptacVectorStoreIndex.default_vector_store(embed_model=self.embed_model)
def purge(self):
import shutil
shutil.rmtree(self.checkpoint_dir, ignore_errors=True)
self.vs_index = self.create_new_vs()
class LlamaIndexRagWorker(SaveLoad):
def __init__(self, user_name, llm_kwargs, auto_load_checkpoint=True, checkpoint_dir=None) -> None:
self.debug_mode = True
self.embed_model = OpenAiEmbeddingModel(llm_kwargs)
self.user_name = user_name
self.checkpoint_dir = checkpoint_dir
# 确保checkpoint_dir存在
if checkpoint_dir:
os.makedirs(checkpoint_dir, exist_ok=True)
logger.info(f"Initializing LlamaIndexRagWorker with checkpoint_dir: {checkpoint_dir}")
# 初始化向量存储
if auto_load_checkpoint and self.does_checkpoint_exist():
logger.info("Loading existing vector store from checkpoint")
self.vs_index = self.load_from_checkpoint()
else:
logger.info("Creating new vector store")
self.vs_index = self.create_new_vs()
# 注册退出时保存
atexit.register(self.save_to_checkpoint)
def add_text_to_vector_store(self, text: str) -> None:
"""添加文本到向量存储"""
try:
logger.info(f"Adding text to vector store (first 100 chars): {text[:100]}...")
node = TextNode(text=text)
nodes = run_transformations(
[node],
self.vs_index._transformations,
show_progress=True
)
self.vs_index.insert_nodes(nodes)
# 立即保存
self.save_to_checkpoint()
if self.debug_mode:
self.inspect_vector_store()
except Exception as e:
logger.error(f"Error adding text to vector store: {str(e)}")
raise
def save_to_checkpoint(self, checkpoint_dir=None):
"""保存向量存储到检查点"""
try:
if checkpoint_dir is None:
checkpoint_dir = self.checkpoint_dir
logger.info(f'Saving vector store to: {checkpoint_dir}')
if checkpoint_dir:
self.vs_index.storage_context.persist(persist_dir=checkpoint_dir)
logger.info('Vector store saved successfully')
else:
logger.warning('No checkpoint directory specified, skipping save')
except Exception as e:
logger.error(f"Error saving checkpoint: {str(e)}")
raise
def assign_embedding_model(self):
pass
def inspect_vector_store(self):
# This function is for debugging
self.vs_index.storage_context.index_store.to_dict()
docstore = self.vs_index.storage_context.docstore.docs
vector_store_preview = "\n".join([f"{_id} | {tn.text}" for _id, tn in docstore.items()])
logger.info('\n++ --------inspect_vector_store begin--------')
logger.info(vector_store_preview)
logger.info('oo --------inspect_vector_store end--------')
return vector_store_preview
def add_documents_to_vector_store(self, document_list):
documents = [Document(text=t) for t in document_list]
documents_nodes = run_transformations(
documents, # type: ignore
self.vs_index._transformations,
show_progress=True
)
self.vs_index.insert_nodes(documents_nodes)
if self.debug_mode: self.inspect_vector_store()
def remember_qa(self, question, answer):
formatted_str = QUESTION_ANSWER_RECORD.format(question=question, answer=answer)
self.add_text_to_vector_store(formatted_str)
def retrieve_from_store_with_query(self, query):
if self.debug_mode: self.inspect_vector_store()
retriever = self.vs_index.as_retriever()
return retriever.retrieve(query)
def build_prompt(self, query, nodes):
context_str = self.generate_node_array_preview(nodes)
return DEFAULT_QUERY_GENERATION_PROMPT.format(context_str=context_str, query_str=query)
def generate_node_array_preview(self, nodes):
buf = "\n".join(([f"(No.{i + 1} | score {n.score:.3f}): {n.text}" for i, n in enumerate(nodes)]))
if self.debug_mode: logger.info(buf)
return buf

View File

@@ -1,104 +0,0 @@
import atexit
import os
from typing import List
from llama_index.core import StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from loguru import logger
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker
from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
DEFAULT_QUERY_GENERATION_PROMPT = """\
Now, you have context information as below:
---------------------
{context_str}
---------------------
Answer the user request below (use the context information if necessary, otherwise you can ignore them):
---------------------
{query_str}
"""
QUESTION_ANSWER_RECORD = """\
{{
"type": "This is a previous conversation with the user",
"question": "{question}",
"answer": "{answer}",
}}
"""
class MilvusSaveLoad():
def does_checkpoint_exist(self, checkpoint_dir=None):
import os, glob
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if not os.path.exists(checkpoint_dir): return False
if len(glob.glob(os.path.join(checkpoint_dir, "*.json"))) == 0: return False
return True
def save_to_checkpoint(self, checkpoint_dir=None):
logger.info(f'saving vector store to: {checkpoint_dir}')
# if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
# self.vs_index.storage_context.persist(persist_dir=checkpoint_dir)
def load_from_checkpoint(self, checkpoint_dir=None):
if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
if self.does_checkpoint_exist(checkpoint_dir=checkpoint_dir):
logger.info('loading checkpoint from disk')
from llama_index.core import StorageContext, load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir=checkpoint_dir)
try:
self.vs_index = load_index_from_storage(storage_context, embed_model=self.embed_model)
return self.vs_index
except:
return self.create_new_vs(checkpoint_dir)
else:
return self.create_new_vs(checkpoint_dir)
def create_new_vs(self, checkpoint_dir, overwrite=False):
vector_store = MilvusVectorStore(
uri=os.path.join(checkpoint_dir, "milvus_demo.db"),
dim=self.embed_model.embedding_dimension(),
overwrite=overwrite
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GptacVectorStoreIndex.default_vector_store(storage_context=storage_context,
embed_model=self.embed_model)
return index
def purge(self):
self.vs_index = self.create_new_vs(self.checkpoint_dir, overwrite=True)
class MilvusRagWorker(MilvusSaveLoad, LlamaIndexRagWorker):
def __init__(self, user_name, llm_kwargs, auto_load_checkpoint=True, checkpoint_dir=None) -> None:
self.debug_mode = True
self.embed_model = OpenAiEmbeddingModel(llm_kwargs)
self.user_name = user_name
self.checkpoint_dir = checkpoint_dir
if auto_load_checkpoint:
self.vs_index = self.load_from_checkpoint(checkpoint_dir)
else:
self.vs_index = self.create_new_vs(checkpoint_dir)
atexit.register(lambda: self.save_to_checkpoint(checkpoint_dir))
def inspect_vector_store(self):
# This function is for debugging
try:
self.vs_index.storage_context.index_store.to_dict()
docstore = self.vs_index.storage_context.docstore.docs
if not docstore.items():
raise ValueError("cannot inspect")
vector_store_preview = "\n".join([f"{_id} | {tn.text}" for _id, tn in docstore.items()])
except:
dummy_retrieve_res: List["NodeWithScore"] = self.vs_index.as_retriever().retrieve(' ')
vector_store_preview = "\n".join(
[f"{node.id_} | {node.text}" for node in dummy_retrieve_res]
)
logger.info('\n++ --------inspect_vector_store begin--------')
logger.info(vector_store_preview)
logger.info('oo --------inspect_vector_store end--------')
return vector_store_preview

View File

@@ -1,47 +0,0 @@
from llama_index.core import SimpleDirectoryReader
supports_format = ['.csv', '.docx', '.doc', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt', '.ppt',
'.pptm', '.pptx', '.py', '.xls', '.xlsx', '.html', '.json', '.xml', '.yaml', '.yml', '.m']
def read_docx_doc(file_path):
if file_path.split(".")[-1] == "docx":
from docx import Document
doc = Document(file_path)
file_content = "\n".join([para.text for para in doc.paragraphs])
else:
try:
import win32com.client
word = win32com.client.Dispatch("Word.Application")
word.visible = False
# 打开文件
doc = word.Documents.Open(os.getcwd() + '/' + file_path)
# file_content = doc.Content.Text
doc = word.ActiveDocument
file_content = doc.Range().Text
doc.Close()
word.Quit()
except:
raise RuntimeError('请先将.doc文档转换为.docx文档。')
return file_content
# 修改后的 extract_text 函数,结合 SimpleDirectoryReader 和自定义解析逻辑
import os
def extract_text(file_path):
_, ext = os.path.splitext(file_path.lower())
# 使用 SimpleDirectoryReader 处理它支持的文件格式
if ext in ['.docx', '.doc']:
return read_docx_doc(file_path)
try:
reader = SimpleDirectoryReader(input_files=[file_path])
documents = reader.load_data()
if len(documents) > 0:
return documents[0].text
except Exception as e:
pass
return None

View File

@@ -1,56 +0,0 @@
from typing import Any, List, Optional
from llama_index.core import VectorStoreIndex
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.schema import TransformComponent
from llama_index.core.service_context import ServiceContext
from llama_index.core.settings import (
Settings,
callback_manager_from_settings_or_context,
transformations_from_settings_or_context,
)
from llama_index.core.storage.storage_context import StorageContext
class GptacVectorStoreIndex(VectorStoreIndex):
@classmethod
def default_vector_store(
cls,
storage_context: Optional[StorageContext] = None,
show_progress: bool = False,
callback_manager: Optional[CallbackManager] = None,
transformations: Optional[List[TransformComponent]] = None,
# deprecated
service_context: Optional[ServiceContext] = None,
embed_model=None,
**kwargs: Any,
):
"""Create index from documents.
Args:
documents (Optional[Sequence[BaseDocument]]): List of documents to
build the index from.
"""
storage_context = storage_context or StorageContext.from_defaults()
docstore = storage_context.docstore
callback_manager = (
callback_manager
or callback_manager_from_settings_or_context(Settings, service_context)
)
transformations = transformations or transformations_from_settings_or_context(
Settings, service_context
)
with callback_manager.as_trace("index_construction"):
return cls(
nodes=[],
storage_context=storage_context,
callback_manager=callback_manager,
show_progress=show_progress,
transformations=transformations,
service_context=service_context,
embed_model=embed_model,
**kwargs,
)

View File

@@ -1,17 +1,16 @@
# From project chatglm-langchain
import threading
from toolbox import Singleton
import os
import shutil
import os
import uuid
import tqdm
import shutil
import threading
import numpy as np
from toolbox import Singleton
from loguru import logger
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from typing import List, Tuple
import numpy as np
from crazy_functions.vector_fns.general_file_loader import load_file
embedding_model_dict = {
@@ -151,17 +150,17 @@ class LocalDocQA:
failed_files = []
if isinstance(filepath, str):
if not os.path.exists(filepath):
logger.error("路径不存在")
print("路径不存在")
return None
elif os.path.isfile(filepath):
file = os.path.split(filepath)[-1]
try:
docs = load_file(filepath, SENTENCE_SIZE)
logger.info(f"{file} 已成功加载")
print(f"{file} 已成功加载")
loaded_files.append(filepath)
except Exception as e:
logger.error(e)
logger.error(f"{file} 未能成功加载")
print(e)
print(f"{file} 未能成功加载")
return None
elif os.path.isdir(filepath):
docs = []
@@ -171,23 +170,23 @@ class LocalDocQA:
docs += load_file(fullfilepath, SENTENCE_SIZE)
loaded_files.append(fullfilepath)
except Exception as e:
logger.error(e)
print(e)
failed_files.append(file)
if len(failed_files) > 0:
logger.error("以下文件未能成功加载:")
print("以下文件未能成功加载:")
for file in failed_files:
logger.error(f"{file}\n")
print(f"{file}\n")
else:
docs = []
for file in filepath:
docs += load_file(file, SENTENCE_SIZE)
logger.info(f"{file} 已成功加载")
print(f"{file} 已成功加载")
loaded_files.append(file)
if len(docs) > 0:
logger.info("文件加载完毕,正在生成向量库")
print("文件加载完毕,正在生成向量库")
if vs_path and os.path.isdir(vs_path):
try:
self.vector_store = FAISS.load_local(vs_path, text2vec)
@@ -234,7 +233,7 @@ class LocalDocQA:
prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)])
prompt += "\n\n---\n\n"
prompt = prompt.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
# logger.info(prompt)
# print(prompt)
response = {"query": query, "source_documents": related_docs_with_score}
return response, prompt
@@ -263,7 +262,7 @@ def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_co
else:
pass
# file_status = "文件未成功加载,请重新上传文件"
# logger.info(file_status)
# print(file_status)
return local_doc_qa, vs_path
@Singleton
@@ -279,7 +278,7 @@ class knowledge_archive_interface():
if self.text2vec_large_chinese is None:
# < -------------------预热文本向量化模组--------------- >
from toolbox import ProxyNetworkActivate
logger.info('Checking Text2vec ...')
print('Checking Text2vec ...')
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")

View File

@@ -10,7 +10,7 @@ def read_avail_plugin_enum():
from crazy_functional import get_crazy_functions
plugin_arr = get_crazy_functions()
# remove plugins with out explaination
plugin_arr = {k:v for k, v in plugin_arr.items() if ('Info' in v) and ('Function' in v)}
plugin_arr = {k:v for k, v in plugin_arr.items() if 'Info' in v}
plugin_arr_info = {"F_{:04d}".format(i):v["Info"] for i, v in enumerate(plugin_arr.values(), start=1)}
plugin_arr_dict = {"F_{:04d}".format(i):v for i, v in enumerate(plugin_arr.values(), start=1)}
plugin_arr_dict_parse = {"F_{:04d}".format(i):v for i, v in enumerate(plugin_arr.values(), start=1)}

View File

@@ -1,19 +1,17 @@
import re, requests, unicodedata, os
from toolbox import update_ui, get_log_folder
from toolbox import write_history_to_file, promote_file_to_downloadzone
from toolbox import CatchException, report_exception, get_conf
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from loguru import logger
import re, requests, unicodedata, os
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
def download_arxiv_(url_pdf):
if 'arxiv.org' not in url_pdf:
if ('.' in url_pdf) and ('/' not in url_pdf):
new_url = 'https://arxiv.org/abs/'+url_pdf
logger.info('下载编号:', url_pdf, '自动定位:', new_url)
print('下载编号:', url_pdf, '自动定位:', new_url)
# download_arxiv_(new_url)
return download_arxiv_(new_url)
else:
logger.info('不能识别的URL')
print('不能识别的URL')
return None
if 'abs' in url_pdf:
url_pdf = url_pdf.replace('abs', 'pdf')
@@ -44,12 +42,15 @@ def download_arxiv_(url_pdf):
requests_pdf_url = url_pdf
file_path = download_dir+title_str
logger.info('下载中')
print('下载中')
proxies = get_conf('proxies')
r = requests.get(requests_pdf_url, proxies=proxies)
with open(file_path, 'wb+') as f:
f.write(r.content)
logger.info('下载完成')
print('下载完成')
# print('输出下载命令:','aria2c -o \"%s\" %s'%(title_str,url_pdf))
# subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s'%(download_dir+title_str,url_pdf), shell=True)
x = "%s %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
x = x.replace('?', '')\
@@ -62,9 +63,19 @@ def download_arxiv_(url_pdf):
def get_name(_url_):
import os
from bs4 import BeautifulSoup
logger.info('正在获取文献名!')
logger.info(_url_)
print('正在获取文献名!')
print(_url_)
# arxiv_recall = {}
# if os.path.exists('./arxiv_recall.pkl'):
# with open('./arxiv_recall.pkl', 'rb') as f:
# arxiv_recall = pickle.load(f)
# if _url_ in arxiv_recall:
# print('在缓存中')
# return arxiv_recall[_url_]
proxies = get_conf('proxies')
res = requests.get(_url_, proxies=proxies)
@@ -81,7 +92,7 @@ def get_name(_url_):
other_details['abstract'] = abstract
except:
other_details['year'] = ''
logger.info('年份获取失败')
print('年份获取失败')
# get author
try:
@@ -90,7 +101,7 @@ def get_name(_url_):
other_details['authors'] = authors
except:
other_details['authors'] = ''
logger.info('authors获取失败')
print('authors获取失败')
# get comment
try:
@@ -105,11 +116,11 @@ def get_name(_url_):
other_details['comment'] = ''
except:
other_details['comment'] = ''
logger.info('年份获取失败')
print('年份获取失败')
title_str = BeautifulSoup(
res.text, 'html.parser').find('title').contents[0]
logger.info('获取成功:', title_str)
print('获取成功:', title_str)
# arxiv_recall[_url_] = (title_str+'.pdf', other_details)
# with open('./arxiv_recall.pkl', 'wb') as f:
# pickle.dump(arxiv_recall, f)

View File

@@ -1,5 +1,6 @@
from toolbox import CatchException, update_ui
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
@CatchException
def 交互功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):

View File

@@ -16,8 +16,8 @@ Testing:
from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, is_the_upload_folder
from toolbox import promote_file_to_downloadzone, get_log_folder, update_ui_lastest_msg
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_plugin_arg
from crazy_functions.crazy_utils import input_clipping, try_install_deps
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_plugin_arg
from .crazy_utils import input_clipping, try_install_deps
from crazy_functions.gen_fns.gen_fns_shared import is_function_successfully_generated
from crazy_functions.gen_fns.gen_fns_shared import get_class_name
from crazy_functions.gen_fns.gen_fns_shared import subprocess_worker

View File

@@ -1,6 +1,6 @@
from toolbox import CatchException, update_ui, gen_time_str
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import input_clipping
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import input_clipping
import copy, json
@CatchException

View File

@@ -30,7 +30,7 @@ def gen_image(llm_kwargs, prompt, resolution="1024x1024", model="dall-e-2", qual
if style is not None:
data['style'] = style
response = requests.post(url, headers=headers, json=data, proxies=proxies)
# logger.info(response.content)
print(response.content)
try:
image_url = json.loads(response.content.decode('utf8'))['data'][0]['url']
except:
@@ -76,7 +76,7 @@ def edit_image(llm_kwargs, prompt, image_path, resolution="1024x1024", model="da
}
response = requests.post(url, headers=headers, files=files, proxies=proxies)
# logger.info(response.content)
print(response.content)
try:
image_url = json.loads(response.content.decode('utf8'))['data'][0]['url']
except:
@@ -108,7 +108,7 @@ def 图片生成_DALLE2(prompt, llm_kwargs, plugin_kwargs, chatbot, history, sys
chatbot.append((prompt, "[Local Message] 图像生成提示为空白,请在“输入区”输入图像生成提示。"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 界面更新
return
chatbot.append(("您正在调用“图像生成”插件。", "[Local Message] 生成图像, 使用前请切换模型到GPT系列。如果中文Prompt效果不理想, 请尝试英文Prompt。正在处理中 ....."))
chatbot.append(("您正在调用“图像生成”插件。", "[Local Message] 生成图像, 请先把模型切换至gpt-*。如果中文Prompt效果不理想, 请尝试英文Prompt。正在处理中 ....."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 由于请求gpt需要一段时间,我们先及时地做一次界面更新
if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
resolution = plugin_kwargs.get("advanced_arg", '1024x1024')
@@ -129,7 +129,7 @@ def 图片生成_DALLE3(prompt, llm_kwargs, plugin_kwargs, chatbot, history, sys
chatbot.append((prompt, "[Local Message] 图像生成提示为空白,请在“输入区”输入图像生成提示。"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 界面更新
return
chatbot.append(("您正在调用“图像生成”插件。", "[Local Message] 生成图像, 使用前请切换模型到GPT系列。如果中文Prompt效果不理想, 请尝试英文Prompt。正在处理中 ....."))
chatbot.append(("您正在调用“图像生成”插件。", "[Local Message] 生成图像, 请先把模型切换至gpt-*。如果中文Prompt效果不理想, 请尝试英文Prompt。正在处理中 ....."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 由于请求gpt需要一段时间,我们先及时地做一次界面更新
if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
resolution_arg = plugin_kwargs.get("advanced_arg", '1024x1024-standard-vivid').lower()
@@ -166,7 +166,7 @@ class ImageEditState(GptAcademicState):
return confirm, file
def lock_plugin(self, chatbot):
chatbot._cookies['lock_plugin'] = 'crazy_functions.Image_Generate->图片修改_DALLE2'
chatbot._cookies['lock_plugin'] = 'crazy_functions.图片生成->图片修改_DALLE2'
self.dump_state(chatbot)
def unlock_plugin(self, chatbot):

View File

@@ -6,14 +6,13 @@
"""
import time
from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, ProxyNetworkActivate
from toolbox import get_conf, select_api_key, update_ui_lastest_msg, Singleton
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_plugin_arg
from crazy_functions.crazy_utils import input_clipping, try_install_deps
from crazy_functions.agent_fns.persistent import GradioMultiuserManagerForPersistentClasses
from crazy_functions.agent_fns.auto_agent import AutoGenMath
from loguru import logger
import time
def remove_model_prefix(llm):
if llm.startswith('api2d-'): llm = llm.replace('api2d-', '')
@@ -81,12 +80,12 @@ def 多智能体终端(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_
persistent_key = f"{user_uuid}->多智能体终端"
if persistent_class_multi_user_manager.already_alive(persistent_key):
# 当已经存在一个正在运行的多智能体终端时,直接将用户输入传递给它,而不是再次启动一个新的多智能体终端
logger.info('[debug] feed new user input')
print('[debug] feed new user input')
executor = persistent_class_multi_user_manager.get(persistent_key)
exit_reason = yield from executor.main_process_ui_control(txt, create_or_resume="resume")
else:
# 运行多智能体终端 (首次)
logger.info('[debug] create new executor instance')
print('[debug] create new executor instance')
history = []
chatbot.append(["正在启动: 多智能体终端", "插件动态生成, 执行开始, 作者 Microsoft & Binary-Husky."])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,5 +1,4 @@
from toolbox import CatchException, update_ui, promote_file_to_downloadzone, get_log_folder, get_user
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
import re
f_prefix = 'GPT-Academic对话存档'
@@ -10,61 +9,27 @@ def write_chat_to_file(chatbot, history=None, file_name=None):
"""
import os
import time
from themes.theme import advanced_css
if file_name is None:
file_name = f_prefix + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.html'
fp = os.path.join(get_log_folder(get_user(chatbot), plugin_name='chat_history'), file_name)
with open(fp, 'w', encoding='utf8') as f:
from textwrap import dedent
form = dedent("""
<!DOCTYPE html><head><meta charset="utf-8"><title>对话存档</title><style>{CSS}</style></head>
<body>
<div class="test_temp1" style="width:10%; height: 500px; float:left;"></div>
<div class="test_temp2" style="width:80%;padding: 40px;float:left;padding-left: 20px;padding-right: 20px;box-shadow: rgba(0, 0, 0, 0.2) 0px 0px 8px 8px;border-radius: 10px;">
<div class="chat-body" style="display: flex;justify-content: center;flex-direction: column;align-items: center;flex-wrap: nowrap;">
{CHAT_PREVIEW}
<div></div>
<div></div>
<div style="text-align: center;width:80%;padding: 0px;float:left;padding-left:20px;padding-right:20px;box-shadow: rgba(0, 0, 0, 0.05) 0px 0px 1px 2px;border-radius: 1px;">对话原始数据</div>
{HISTORY_PREVIEW}
</div>
</div>
<div class="test_temp3" style="width:10%; height: 500px; float:left;"></div>
</body>
""")
qa_from = dedent("""
<div class="QaBox" style="width:80%;padding: 20px;margin-bottom: 20px;box-shadow: rgb(0 255 159 / 50%) 0px 0px 1px 2px;border-radius: 4px;">
<div class="Question" style="border-radius: 2px;">{QUESTION}</div>
<hr color="blue" style="border-top: dotted 2px #ccc;">
<div class="Answer" style="border-radius: 2px;">{ANSWER}</div>
</div>
""")
history_from = dedent("""
<div class="historyBox" style="width:80%;padding: 0px;float:left;padding-left:20px;padding-right:20px;box-shadow: rgba(0, 0, 0, 0.05) 0px 0px 1px 2px;border-radius: 1px;">
<div class="entry" style="border-radius: 2px;">{ENTRY}</div>
</div>
""")
CHAT_PREVIEW_BUF = ""
from themes.theme import advanced_css
f.write(f'<!DOCTYPE html><head><meta charset="utf-8"><title>对话历史</title><style>{advanced_css}</style></head>')
for i, contents in enumerate(chatbot):
question, answer = contents[0], contents[1]
if question is None: question = ""
try: question = str(question)
except: question = ""
if answer is None: answer = ""
try: answer = str(answer)
except: answer = ""
CHAT_PREVIEW_BUF += qa_from.format(QUESTION=question, ANSWER=answer)
HISTORY_PREVIEW_BUF = ""
for j, content in enumerate(contents):
try: # 这个bug没找到触发条件暂时先这样顶一下
if type(content) != str: content = str(content)
except:
continue
f.write(content)
if j == 0:
f.write('<hr style="border-top: dotted 3px #ccc;">')
f.write('<hr color="red"> \n\n')
f.write('<hr color="blue"> \n\n raw chat context:\n')
f.write('<code>')
for h in history:
HISTORY_PREVIEW_BUF += history_from.format(ENTRY=h)
html_content = form.format(CHAT_PREVIEW=CHAT_PREVIEW_BUF, HISTORY_PREVIEW=HISTORY_PREVIEW_BUF, CSS=advanced_css)
f.write(html_content)
f.write("\n>>>" + h)
f.write('</code>')
promote_file_to_downloadzone(fp, rename_file=file_name, chatbot=chatbot)
return '对话历史写入:' + fp
@@ -75,7 +40,7 @@ def gen_file_preview(file_name):
# pattern to match the text between <head> and </head>
pattern = re.compile(r'<head>.*?</head>', flags=re.DOTALL)
file_content = re.sub(pattern, '', file_content)
html, history = file_content.split('<hr color="blue"> \n\n 对话数据 (无渲染):\n')
html, history = file_content.split('<hr color="blue"> \n\n raw chat context:\n')
history = history.strip('<code>')
history = history.strip('</code>')
history = history.split("\n>>>")
@@ -86,25 +51,21 @@ def gen_file_preview(file_name):
def read_file_to_chat(chatbot, history, file_name):
with open(file_name, 'r', encoding='utf8') as f:
file_content = f.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(file_content, 'lxml')
# 提取QaBox信息
# pattern to match the text between <head> and </head>
pattern = re.compile(r'<head>.*?</head>', flags=re.DOTALL)
file_content = re.sub(pattern, '', file_content)
html, history = file_content.split('<hr color="blue"> \n\n raw chat context:\n')
history = history.strip('<code>')
history = history.strip('</code>')
history = history.split("\n>>>")
history = list(filter(lambda x:x!="", history))
html = html.split('<hr color="red"> \n\n')
html = list(filter(lambda x:x!="", html))
chatbot.clear()
qa_box_list = []
qa_boxes = soup.find_all("div", class_="QaBox")
for box in qa_boxes:
question = box.find("div", class_="Question").get_text(strip=False)
answer = box.find("div", class_="Answer").get_text(strip=False)
qa_box_list.append({"Question": question, "Answer": answer})
chatbot.append([question, answer])
# 提取historyBox信息
history_box_list = []
history_boxes = soup.find_all("div", class_="historyBox")
for box in history_boxes:
entry = box.find("div", class_="entry").get_text(strip=False)
history_box_list.append(entry)
history = history_box_list
chatbot.append([None, f"[Local Message] 载入对话{len(qa_box_list)}条,上下文{len(history)}条。"])
for i, h in enumerate(html):
i_say, gpt_say = h.split('<hr style="border-top: dotted 3px #ccc;">')
chatbot.append([i_say, gpt_say])
chatbot.append([f"存档文件详情?", f"[Local Message] 载入对话{len(html)}条,上下文{len(history)}条。"])
return chatbot, history
@CatchException
@@ -118,40 +79,11 @@ def 对话历史存档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_
system_prompt 给gpt的静默提醒
user_request 当前用户的请求信息IP地址等
"""
file_name = plugin_kwargs.get("file_name", None)
if (file_name is not None) and (file_name != "") and (not file_name.endswith('.html')): file_name += '.html'
else: file_name = None
chatbot.append((None, f"[Local Message] {write_chat_to_file(chatbot, history, file_name)},您可以调用下拉菜单中的“载入对话历史存档”还原当下的对话"))
chatbot.append(("保存当前对话",
f"[Local Message] {write_chat_to_file(chatbot, history)},您可以调用下拉菜单中的“载入对话历史存档”还原当下的对话。"))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间我们先及时地做一次界面更新
class Conversation_To_File_Wrap(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中因此您在定义和使用类变量时应当慎之又慎
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
第一个参数名称`file_name`参数`type`声明这是一个文本框文本框上方显示`title`文本框内部显示`description``default_value`为默认值
"""
gui_definition = {
"file_name": ArgProperty(title="保存文件名", description="输入对话存档文件名,留空则使用时间作为文件名", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
yield from 对话历史存档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
def hide_cwd(str):
import os
current_path = os.getcwd()
@@ -169,7 +101,7 @@ def 载入对话历史存档(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
system_prompt 给gpt的静默提醒
user_request 当前用户的请求信息IP地址等
"""
from crazy_functions.crazy_utils import get_files_from_everything
from .crazy_utils import get_files_from_everything
success, file_manifest, _ = get_files_from_everything(txt, type='.html')
if not success:
@@ -216,3 +148,5 @@ def 删除所有本地对话历史记录(txt, llm_kwargs, plugin_kwargs, chatbot
chatbot.append([f"删除所有历史对话文件", f"已删除<br/>{local_history}"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return

View File

@@ -0,0 +1,127 @@
from toolbox import update_ui
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file, promote_file_to_downloadzone
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
fast_debug = False
def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import time, os
# pip install python-docx 用于docx格式跨平台
# pip install pywin32 用于doc格式仅支持Win平台
for index, fp in enumerate(file_manifest):
if fp.split(".")[-1] == "docx":
from docx import Document
doc = Document(fp)
file_content = "\n".join([para.text for para in doc.paragraphs])
else:
try:
import win32com.client
word = win32com.client.Dispatch("Word.Application")
word.visible = False
# 打开文件
doc = word.Documents.Open(os.getcwd() + '/' + fp)
# file_content = doc.Content.Text
doc = word.ActiveDocument
file_content = doc.Range().Text
doc.Close()
word.Quit()
except:
raise RuntimeError('请先将.doc文档转换为.docx文档。')
# private_upload里面的文件名在解压zip后容易出现乱码rar和7z格式正常故可以只分析文章内容不输入文件名
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from request_llms.bridge_all import model_info
max_token = model_info[llm_kwargs['llm_model']]['max_token']
TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
this_paper_history = []
for i, paper_frag in enumerate(paper_fragments):
i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'
i_say_show_user = f'请对下面的文章片段做概述: {os.path.abspath(fp)}的第{i+1}/{len(paper_fragments)}个片段。'
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say,
inputs_show_user=i_say_show_user,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=[],
sys_prompt="总结文章。"
)
chatbot[-1] = (i_say_show_user, gpt_say)
history.extend([i_say_show_user,gpt_say])
this_paper_history.extend([i_say_show_user,gpt_say])
# 已经对该文章的所有片段总结完毕,如果文章被切分了,
if len(paper_fragments) > 1:
i_say = f"根据以上的对话,总结文章{os.path.abspath(fp)}的主要内容。"
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say,
inputs_show_user=i_say,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=this_paper_history,
sys_prompt="总结文章。"
)
history.extend([i_say,gpt_say])
this_paper_history.extend([i_say,gpt_say])
res = write_history_to_file(history)
promote_file_to_downloadzone(res, chatbot=chatbot)
chatbot.append(("完成了吗?", res))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
res = write_history_to_file(history)
promote_file_to_downloadzone(res, chatbot=chatbot)
chatbot.append(("所有文件都总结完成了吗?", res))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
@CatchException
def 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
import glob, os
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"批量总结Word文档。函数插件贡献者: JasonGuo1。注意, 如果是.doc文件, 请先转化为.docx格式。"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
from docx import Document
except:
report_exception(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade python-docx pywin32```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 清空历史,以免输入溢出
history = []
# 检测输入参数,如没有给定输入参数,直接退出
if os.path.exists(txt):
project_folder = txt
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 搜索需要处理的文件清单
if txt.endswith('.docx') or txt.endswith('.doc'):
file_manifest = [txt]
else:
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.docx', recursive=True)] + \
[f for f in glob.glob(f'{project_folder}/**/*.doc', recursive=True)]
# 如果没找到任何文件
if len(file_manifest) == 0:
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.docx或doc文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 开始正式执行任务
yield from 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)

View File

@@ -1,5 +1,5 @@
from toolbox import CatchException, report_exception, select_api_key, update_ui, get_conf
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import write_history_to_file, promote_file_to_downloadzone, get_log_folder
def split_audio_file(filename, split_duration=1000):

View File

@@ -1,6 +1,5 @@
import glob, shutil, os, re
from loguru import logger
from toolbox import update_ui, trimmed_format_exc, gen_time_str
import glob, time, os, re, logging
from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
from toolbox import CatchException, report_exception, get_log_folder
from toolbox import write_history_to_file, promote_file_to_downloadzone
fast_debug = False
@@ -19,7 +18,7 @@ class PaperFileGroup():
def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
self.get_token_num = get_token_num
def run_file_split(self, max_token_limit=2048):
def run_file_split(self, max_token_limit=1900):
"""
将长文本分离开来
"""
@@ -35,7 +34,7 @@ class PaperFileGroup():
self.sp_file_contents.append(segment)
self.sp_file_index.append(index)
self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.md")
logger.info('Segmentation: done')
logging.info('Segmentation: done')
def merge_result(self):
self.file_result = ["" for _ in range(len(self.file_paths))]
@@ -52,7 +51,7 @@ class PaperFileGroup():
return manifest
def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
# <-------- 读取Markdown文件删除其中的所有注释 ---------->
pfg = PaperFileGroup()
@@ -65,25 +64,25 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
pfg.file_contents.append(file_content)
# <-------- 拆分过长的Markdown文件 ---------->
pfg.run_file_split(max_token_limit=1024)
pfg.run_file_split(max_token_limit=1500)
n_split = len(pfg.sp_file_contents)
# <-------- 多线程翻译开始 ---------->
if language == 'en->zh':
inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
inputs_array = ["This is a Markdown file, translate it into Chinese, do not modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
elif language == 'zh->en':
inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
inputs_array = [f"This is a Markdown file, translate it into English, do not modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
else:
inputs_array = [f"This is a Markdown file, translate it into {language}, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
inputs_array = [f"This is a Markdown file, translate it into {language}, do not modify any existing Markdown commands, only answer me with translated results:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array=inputs_array,
@@ -100,14 +99,9 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]):
pfg.sp_file_result.append(gpt_say)
pfg.merge_result()
output_file_arr = pfg.write_result(language)
for output_file in output_file_arr:
promote_file_to_downloadzone(output_file, chatbot=chatbot)
if 'markdown_expected_output_path' in plugin_kwargs:
expected_f_name = plugin_kwargs['markdown_expected_output_path']
shutil.copyfile(output_file, expected_f_name)
pfg.write_result(language)
except:
logger.error(trimmed_format_exc())
logging.error(trimmed_format_exc())
# <-------- 整理结果,退出 ---------->
create_report_file_name = gen_time_str() + f"-chatgpt.md"
@@ -127,7 +121,7 @@ def get_files_from_everything(txt, preference=''):
proxies = get_conf('proxies')
# 网络的远程文件
if preference == 'Github':
logger.info('正在从github下载资源 ...')
logging.info('正在从github下载资源 ...')
if not txt.endswith('.md'):
# Make a request to the GitHub API to retrieve the repository information
url = txt.replace("https://github.com/", "https://api.github.com/repos/") + '/readme'
@@ -165,6 +159,7 @@ def Markdown英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
"函数插件功能?",
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
disable_auto_promotion(chatbot)
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
@@ -204,6 +199,7 @@ def Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_p
"函数插件功能?",
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
disable_auto_promotion(chatbot)
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
@@ -236,6 +232,7 @@ def Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history,
"函数插件功能?",
"对整个Markdown项目进行翻译。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
disable_auto_promotion(chatbot)
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:

View File

@@ -1,18 +1,16 @@
from loguru import logger
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import read_and_clean_pdf_text
from crazy_functions.crazy_utils import input_clipping
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import input_clipping
def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
file_write_buffer = []
for file_name in file_manifest:
logger.info('begin analysis on:', file_name)
print('begin analysis on:', file_name)
############################## <第 0 步切割PDF> ##################################
# 递归地切割PDF文件每一块尽量是完整的一个section比如introductionexperiment等必要时再进行切割
# 的长度必须小于 2500 个 Token
@@ -40,7 +38,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
last_iteration_result = paper_meta # 初始值是摘要
MAX_WORD_TOTAL = 4096 * 0.7
n_fragment = len(paper_fragments)
if n_fragment >= 20: logger.warning('文章极长,不能达到预期效果')
if n_fragment >= 20: print('文章极长,不能达到预期效果')
for i in range(n_fragment):
NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"

View File

@@ -1,7 +1,6 @@
from loguru import logger
from toolbox import update_ui
from toolbox import CatchException, report_exception
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import write_history_to_file, promote_file_to_downloadzone
fast_debug = False
@@ -58,6 +57,7 @@ def readPdf(pdfPath):
layout = device.get_result()
for obj in layout._objs:
if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
# print(obj.get_text())
outTextList.append(obj.get_text())
return outTextList
@@ -66,7 +66,7 @@ def readPdf(pdfPath):
def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import time, glob, os
from bs4 import BeautifulSoup
logger.info('begin analysis on:', file_manifest)
print('begin analysis on:', file_manifest)
for index, fp in enumerate(file_manifest):
if ".tex" in fp:
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
@@ -77,7 +77,7 @@ def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbo
prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
i_say_show_user = prefix + f'[{index+1}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
i_say_show_user = prefix + f'[{index}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,496 +0,0 @@
import os
import threading
import time
from dataclasses import dataclass
from typing import List, Tuple, Dict, Generator
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from crazy_functions.rag_fns.rag_file_support import extract_text
from request_llms.bridge_all import model_info
from toolbox import update_ui, CatchException, report_exception
@dataclass
class FileFragment:
"""文件片段数据类,用于组织处理单元"""
file_path: str
content: str
rel_path: str
fragment_index: int
total_fragments: int
class BatchDocumentSummarizer:
"""优化的文档总结器 - 批处理版本"""
def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str):
"""初始化总结器"""
self.llm_kwargs = llm_kwargs
self.plugin_kwargs = plugin_kwargs
self.chatbot = chatbot
self.history = history
self.system_prompt = system_prompt
self.failed_files = []
self.file_summaries_map = {}
def _get_token_limit(self) -> int:
"""获取模型token限制"""
max_token = model_info[self.llm_kwargs['llm_model']]['max_token']
return max_token * 3 // 4
def _create_batch_inputs(self, fragments: List[FileFragment]) -> Tuple[List, List, List]:
"""创建批处理输入"""
inputs_array = []
inputs_show_user_array = []
history_array = []
for frag in fragments:
if self.plugin_kwargs.get("advanced_arg"):
i_say = (f'请按照用户要求对文件内容进行处理,文件名为{os.path.basename(frag.file_path)}'
f'用户要求为:{self.plugin_kwargs["advanced_arg"]}'
f'文件内容是 ```{frag.content}```')
i_say_show_user = (f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})')
else:
i_say = (f'请对下面的内容用中文做总结不超过500字文件名是{os.path.basename(frag.file_path)}'
f'内容是 ```{frag.content}```')
i_say_show_user = f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})'
inputs_array.append(i_say)
inputs_show_user_array.append(i_say_show_user)
history_array.append([])
return inputs_array, inputs_show_user_array, history_array
def _process_single_file_with_timeout(self, file_info: Tuple[str, str], mutable_status: List) -> List[FileFragment]:
"""包装了超时控制的文件处理函数"""
def timeout_handler():
thread = threading.current_thread()
if hasattr(thread, '_timeout_occurred'):
thread._timeout_occurred = True
# 设置超时标记
thread = threading.current_thread()
thread._timeout_occurred = False
# 设置超时定时器
timer = threading.Timer(self.watch_dog_patience, timeout_handler)
timer.start()
try:
fp, project_folder = file_info
fragments = []
# 定期检查是否超时
def check_timeout():
if hasattr(thread, '_timeout_occurred') and thread._timeout_occurred:
raise TimeoutError("处理超时")
# 更新状态
mutable_status[0] = "检查文件大小"
mutable_status[1] = time.time()
check_timeout()
# 文件大小检查
if os.path.getsize(fp) > self.max_file_size:
self.failed_files.append((fp, f"文件过大:超过{self.max_file_size / 1024 / 1024}MB"))
mutable_status[2] = "文件过大"
return fragments
check_timeout()
# 更新状态
mutable_status[0] = "提取文件内容"
mutable_status[1] = time.time()
# 提取内容
content = extract_text(fp)
if content is None:
self.failed_files.append((fp, "文件解析失败:不支持的格式或文件损坏"))
mutable_status[2] = "格式不支持"
return fragments
elif not content.strip():
self.failed_files.append((fp, "文件内容为空"))
mutable_status[2] = "内容为空"
return fragments
check_timeout()
# 更新状态
mutable_status[0] = "分割文本"
mutable_status[1] = time.time()
# 分割文本
try:
paper_fragments = breakdown_text_to_satisfy_token_limit(
txt=content,
limit=self._get_token_limit(),
llm_model=self.llm_kwargs['llm_model']
)
except Exception as e:
self.failed_files.append((fp, f"文本分割失败:{str(e)}"))
mutable_status[2] = "分割失败"
return fragments
check_timeout()
# 处理片段
rel_path = os.path.relpath(fp, project_folder)
for i, frag in enumerate(paper_fragments):
if frag.strip():
fragments.append(FileFragment(
file_path=fp,
content=frag,
rel_path=rel_path,
fragment_index=i,
total_fragments=len(paper_fragments)
))
mutable_status[2] = "处理完成"
return fragments
except TimeoutError as e:
self.failed_files.append((fp, "处理超时"))
mutable_status[2] = "处理超时"
return []
except Exception as e:
self.failed_files.append((fp, f"处理失败:{str(e)}"))
mutable_status[2] = "处理异常"
return []
finally:
timer.cancel()
def prepare_fragments(self, project_folder: str, file_paths: List[str]) -> Generator:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List
"""并行准备所有文件的处理片段"""
all_fragments = []
total_files = len(file_paths)
# 配置参数
self.refresh_interval = 0.2 # UI刷新间隔
self.watch_dog_patience = 5 # 看门狗超时时间
self.max_file_size = 10 * 1024 * 1024 # 10MB限制
self.max_workers = min(32, len(file_paths)) # 最多32个线程
# 创建有超时控制的线程池
executor = ThreadPoolExecutor(max_workers=self.max_workers)
# 用于跨线程状态传递的可变列表 - 增加文件名信息
mutable_status_array = [["等待中", time.time(), "pending", file_path] for file_path in file_paths]
# 创建文件处理任务
file_infos = [(fp, project_folder) for fp in file_paths]
# 提交所有任务,使用带超时控制的处理函数
futures = [
executor.submit(
self._process_single_file_with_timeout,
file_info,
mutable_status_array[i]
) for i, file_info in enumerate(file_infos)
]
# 更新UI的计数器
cnt = 0
try:
# 监控任务执行
while True:
time.sleep(self.refresh_interval)
cnt += 1
# 检查任务完成状态
worker_done = [f.done() for f in futures]
# 更新状态显示
status_str = ""
for i, (status, timestamp, desc, file_path) in enumerate(mutable_status_array):
# 获取文件名(去掉路径)
file_name = os.path.basename(file_path)
if worker_done[i]:
status_str += f"文件 {file_name}: {desc}\n"
else:
status_str += f"文件 {file_name}: {status} {desc}\n"
# 更新UI
self.chatbot[-1] = [
"处理进度",
f"正在处理文件...\n\n{status_str}" + "." * (cnt % 10 + 1)
]
yield from update_ui(chatbot=self.chatbot, history=self.history)
# 检查是否所有任务完成
if all(worker_done):
break
finally:
# 确保线程池正确关闭
executor.shutdown(wait=False)
# 收集结果
processed_files = 0
for future in futures:
try:
fragments = future.result(timeout=0.1) # 给予一个短暂的超时时间来获取结果
all_fragments.extend(fragments)
processed_files += 1
except concurrent.futures.TimeoutError:
# 处理获取结果超时
file_index = futures.index(future)
self.failed_files.append((file_paths[file_index], "结果获取超时"))
continue
except Exception as e:
# 处理其他异常
file_index = futures.index(future)
self.failed_files.append((file_paths[file_index], f"未知错误:{str(e)}"))
continue
# 最终进度更新
self.chatbot.append([
"文件处理完成",
f"成功处理 {len(all_fragments)} 个片段,失败 {len(self.failed_files)} 个文件"
])
yield from update_ui(chatbot=self.chatbot, history=self.history)
return all_fragments
def _process_fragments_batch(self, fragments: List[FileFragment]) -> Generator:
"""批量处理文件片段"""
from collections import defaultdict
batch_size = 64 # 每批处理的片段数
max_retries = 3 # 最大重试次数
retry_delay = 5 # 重试延迟(秒)
results = defaultdict(list)
# 按批次处理
for i in range(0, len(fragments), batch_size):
batch = fragments[i:i + batch_size]
inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(batch)
sys_prompt_array = ["请总结以下内容:"] * len(batch)
# 添加重试机制
for retry in range(max_retries):
try:
response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array=inputs_array,
inputs_show_user_array=inputs_show_user_array,
llm_kwargs=self.llm_kwargs,
chatbot=self.chatbot,
history_array=history_array,
sys_prompt_array=sys_prompt_array,
)
# 处理响应
for j, frag in enumerate(batch):
summary = response_collection[j * 2 + 1]
if summary and summary.strip():
results[frag.rel_path].append({
'index': frag.fragment_index,
'summary': summary,
'total': frag.total_fragments
})
break # 成功处理,跳出重试循环
except Exception as e:
if retry == max_retries - 1: # 最后一次重试失败
for frag in batch:
self.failed_files.append((frag.file_path, f"处理失败:{str(e)}"))
else:
yield from update_ui(self.chatbot.append([f"批次处理失败,{retry_delay}秒后重试...", str(e)]))
time.sleep(retry_delay)
return results
def _generate_final_summary_request(self) -> Tuple[List, List, List]:
"""准备最终总结请求"""
if not self.file_summaries_map:
return (["无可用的文件总结"], ["生成最终总结"], [[]])
summaries = list(self.file_summaries_map.values())
if all(not summary for summary in summaries):
return (["所有文件处理均失败"], ["生成最终总结"], [[]])
if self.plugin_kwargs.get("advanced_arg"):
i_say = "根据以上所有文件的处理结果,按要求进行综合处理:" + self.plugin_kwargs['advanced_arg']
else:
i_say = "请根据以上所有文件的处理结果生成最终的总结不超过1000字。"
return ([i_say], [i_say], [summaries])
def process_files(self, project_folder: str, file_paths: List[str]) -> Generator:
"""处理所有文件"""
total_files = len(file_paths)
self.chatbot.append([f"开始处理", f"总计 {total_files} 个文件"])
yield from update_ui(chatbot=self.chatbot, history=self.history)
# 1. 准备所有文件片段
# 在 process_files 函数中:
fragments = yield from self.prepare_fragments(project_folder, file_paths)
if not fragments:
self.chatbot.append(["处理失败", "没有可处理的文件内容"])
return "没有可处理的文件内容"
# 2. 批量处理所有文件片段
self.chatbot.append([f"文件分析", f"共计 {len(fragments)} 个处理单元"])
yield from update_ui(chatbot=self.chatbot, history=self.history)
try:
file_summaries = yield from self._process_fragments_batch(fragments)
except Exception as e:
self.chatbot.append(["处理错误", f"批处理过程失败:{str(e)}"])
return "处理过程发生错误"
# 3. 为每个文件生成整体总结
self.chatbot.append(["生成总结", "正在汇总文件内容..."])
yield from update_ui(chatbot=self.chatbot, history=self.history)
# 处理每个文件的总结
for rel_path, summaries in file_summaries.items():
if len(summaries) > 1: # 多片段文件需要生成整体总结
sorted_summaries = sorted(summaries, key=lambda x: x['index'])
if self.plugin_kwargs.get("advanced_arg"):
i_say = f'请按照用户要求对文件内容进行处理,用户要求为:{self.plugin_kwargs["advanced_arg"]}'
else:
i_say = f"请总结文件 {os.path.basename(rel_path)} 的主要内容不超过500字。"
try:
summary_texts = [s['summary'] for s in sorted_summaries]
response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array=[i_say],
inputs_show_user_array=[f"生成 {rel_path} 的处理结果"],
llm_kwargs=self.llm_kwargs,
chatbot=self.chatbot,
history_array=[summary_texts],
sys_prompt_array=["你是一个优秀的助手,"],
)
self.file_summaries_map[rel_path] = response_collection[1]
except Exception as e:
self.chatbot.append(["警告", f"文件 {rel_path} 总结生成失败:{str(e)}"])
self.file_summaries_map[rel_path] = "总结生成失败"
else: # 单片段文件直接使用其唯一的总结
self.file_summaries_map[rel_path] = summaries[0]['summary']
# 4. 生成最终总结
if total_files ==1:
return "文件数为1此时不调用总结模块"
else:
try:
# 收集所有文件的总结用于生成最终总结
file_summaries_for_final = []
for rel_path, summary in self.file_summaries_map.items():
file_summaries_for_final.append(f"文件 {rel_path} 的总结:\n{summary}")
if self.plugin_kwargs.get("advanced_arg"):
final_summary_prompt = ("根据以下所有文件的总结内容,按要求进行综合处理:" +
self.plugin_kwargs['advanced_arg'])
else:
final_summary_prompt = "请根据以下所有文件的总结内容,生成最终的总结报告。"
response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
inputs_array=[final_summary_prompt],
inputs_show_user_array=["生成最终总结报告"],
llm_kwargs=self.llm_kwargs,
chatbot=self.chatbot,
history_array=[file_summaries_for_final],
sys_prompt_array=["总结所有文件内容。"],
max_workers=1
)
return response_collection[1] if len(response_collection) > 1 else "生成总结失败"
except Exception as e:
self.chatbot.append(["错误", f"最终总结生成失败:{str(e)}"])
return "生成总结失败"
def save_results(self, final_summary: str):
"""保存结果到文件"""
from toolbox import promote_file_to_downloadzone, write_history_to_file
from crazy_functions.doc_fns.batch_file_query_doc import MarkdownFormatter, HtmlFormatter, WordFormatter
import os
timestamp = time.strftime("%Y%m%d_%H%M%S")
# 创建各种格式化器
md_formatter = MarkdownFormatter(final_summary, self.file_summaries_map, self.failed_files)
html_formatter = HtmlFormatter(final_summary, self.file_summaries_map, self.failed_files)
word_formatter = WordFormatter(final_summary, self.file_summaries_map, self.failed_files)
result_files = []
# 保存 Markdown
md_content = md_formatter.create_document()
result_file_md = write_history_to_file(
history=[md_content], # 直接传入内容列表
file_basename=f"文档总结_{timestamp}.md"
)
result_files.append(result_file_md)
# 保存 HTML
html_content = html_formatter.create_document()
result_file_html = write_history_to_file(
history=[html_content],
file_basename=f"文档总结_{timestamp}.html"
)
result_files.append(result_file_html)
# 保存 Word
doc = word_formatter.create_document()
# 由于 Word 文档需要用 doc.save(),我们使用与 md 文件相同的目录
result_file_docx = os.path.join(
os.path.dirname(result_file_md),
f"文档总结_{timestamp}.docx"
)
doc.save(result_file_docx)
result_files.append(result_file_docx)
# 添加到下载区
for file in result_files:
promote_file_to_downloadzone(file, chatbot=self.chatbot)
self.chatbot.append(["处理完成", f"结果已保存至: {', '.join(result_files)}"])
@CatchException
def 批量文件询问(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
history: List, system_prompt: str, user_request: str):
"""主函数 - 优化版本"""
# 初始化
import glob
import re
from crazy_functions.rag_fns.rag_file_support import supports_format
from toolbox import report_exception
summarizer = BatchDocumentSummarizer(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
chatbot.append(["函数插件功能", f"作者lbykkkk批量总结文件。支持格式: {', '.join(supports_format)}等其他文本格式文件如果长时间卡在文件处理过程请查看处理进度然后删除所有处于“pending”状态的文件然后重新上传处理。"])
yield from update_ui(chatbot=chatbot, history=history)
# 验证输入路径
if not os.path.exists(txt):
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到项目或无权访问: {txt}")
yield from update_ui(chatbot=chatbot, history=history)
return
# 获取文件列表
project_folder = txt
extract_folder = next((d for d in glob.glob(f'{project_folder}/*')
if os.path.isdir(d) and d.endswith('.extract')), project_folder)
exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$'
file_manifest = [f for f in glob.glob(f'{extract_folder}/**', recursive=True)
if os.path.isfile(f) and not re.search(exclude_patterns, f)]
if not file_manifest:
report_exception(chatbot, history, a=f"解析项目: {txt}", b="未找到支持的文件类型")
yield from update_ui(chatbot=chatbot, history=history)
return
# 处理所有文件并生成总结
final_summary = yield from summarizer.process_files(project_folder, file_manifest)
yield from update_ui(chatbot=chatbot, history=history)
# 保存结果
summarizer.save_results(final_summary)
yield from update_ui(chatbot=chatbot, history=history)

View File

@@ -1,11 +1,11 @@
from toolbox import CatchException, report_exception, get_log_folder, gen_time_str
from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
from toolbox import write_history_to_file, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.crazy_utils import read_and_clean_pdf_text
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import read_and_clean_pdf_text
from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
from shared_utils.colorful import *
from colorful import *
import copy
import os
import math
@@ -60,7 +60,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
# 清空历史,以免输入溢出
history = []
from crazy_functions.crazy_utils import get_files_from_everything
from .crazy_utils import get_files_from_everything
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
if len(file_manifest) > 0:
# 尝试导入依赖,如果缺少依赖,则给出安装建议

View File

@@ -1,16 +1,83 @@
from toolbox import get_log_folder
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages
from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
from toolbox import write_history_to_file, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.crazy_utils import read_and_clean_pdf_text
from shared_utils.colorful import *
from loguru import logger
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import read_and_clean_pdf_text
from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf
from colorful import *
import os
def 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
@CatchException
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
disable_auto_promotion(chatbot)
# 基本信息:功能、贡献者
chatbot.append([
"函数插件功能?",
"批量翻译PDF文档。函数插件贡献者: Binary-Husky"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# 尝试导入依赖,如果缺少依赖,则给出安装建议
try:
check_packages(["fitz", "tiktoken", "scipdf"])
except:
report_exception(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf tiktoken scipdf_parser```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 清空历史,以免输入溢出
history = []
from .crazy_utils import get_files_from_everything
success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf')
# 检测输入参数,如没有给定输入参数,直接退出
if not success:
if txt == "": txt = '空空如也的输入栏'
# 如果没找到任何文件
if len(file_manifest) == 0:
report_exception(chatbot, history,
a=f"解析项目: {txt}", b=f"找不到任何.pdf拓展名的文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
# 开始正式执行任务
grobid_url = get_avail_grobid_url()
if grobid_url is not None:
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
else:
yield from update_ui_lastest_msg("GROBID服务不可用请检查config中的GROBID_URL。作为替代现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url):
import copy, json
TOKEN_LIMIT_PER_FRAGMENT = 1024
generated_conclusion_files = []
generated_html_files = []
DST_LANG = "中文"
from crazy_functions.pdf_fns.report_gen_html import construct_html
for index, fp in enumerate(file_manifest):
chatbot.append(["当前进度:", f"正在连接GROBID服务请稍候: {grobid_url}\n如果等待时间过长请修改config中的GROBID_URL可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
article_dict = parse_pdf(fp, grobid_url)
grobid_json_res = os.path.join(get_log_folder(), gen_time_str() + "grobid.json")
with open(grobid_json_res, 'w+', encoding='utf8') as f:
f.write(json.dumps(article_dict, indent=4, ensure_ascii=False))
promote_file_to_downloadzone(grobid_json_res, chatbot=chatbot)
if article_dict is None: raise RuntimeError("解析PDF失败请检查PDF是否损坏。")
yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG)
chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files)))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
"""
注意此函数已经弃用新函数位于crazy_functions/pdf_fns/parse_pdf.py
此函数已经弃用
"""
import copy
TOKEN_LIMIT_PER_FRAGMENT = 1024
@@ -49,8 +116,7 @@ def 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwa
chatbot=chatbot,
history_array=[[paper_meta] for _ in paper_fragments],
sys_prompt_array=[
"请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" + plugin_kwargs.get("additional_prompt", "")
for _ in paper_fragments],
"请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in paper_fragments],
# max_workers=5 # OpenAI所允许的最大并行过载
)
gpt_response_collection_md = copy.deepcopy(gpt_response_collection)
@@ -94,7 +160,7 @@ def 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwa
generated_html_files.append(ch.save_file(create_report_file_name))
except:
from toolbox import trimmed_format_exc
logger.error('writing html result failed:', trimmed_format_exc())
print('writing html result failed:', trimmed_format_exc())
# 准备文件的下载
for pdf_path in generated_conclusion_files:

View File

@@ -1,5 +1,4 @@
import os
from loguru import logger
from toolbox import CatchException, update_ui, gen_time_str, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.crazy_utils import input_clipping
@@ -35,10 +34,10 @@ def eval_manim(code):
return f'gpt_log/{time_str}.mp4'
except subprocess.CalledProcessError as e:
output = e.output.decode()
logger.error(f"Command returned non-zero exit status {e.returncode}: {output}.")
print(f"Command returned non-zero exit status {e.returncode}: {output}.")
return f"Evaluating python script failed: {e.output}."
except:
logger.error('generating mp4 failed')
print('generating mp4 failed')
return "Generating mp4 failed."

View File

@@ -1,12 +1,13 @@
from loguru import logger
from toolbox import update_ui
from toolbox import CatchException, report_exception
from crazy_functions.crazy_utils import read_and_clean_pdf_text
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
fast_debug = False
def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
logger.info('begin analysis on:', file_name)
import tiktoken
print('begin analysis on:', file_name)
############################## <第 0 步切割PDF> ##################################
# 递归地切割PDF文件每一块尽量是完整的一个section比如introductionexperiment等必要时再进行切割
@@ -35,7 +36,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
last_iteration_result = paper_meta # 初始值是摘要
MAX_WORD_TOTAL = 4096
n_fragment = len(paper_fragments)
if n_fragment >= 20: logger.warning('文章极长,不能达到预期效果')
if n_fragment >= 20: print('文章极长,不能达到预期效果')
for i in range(n_fragment):
NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
@@ -56,7 +57,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
chatbot.append([i_say_show_user, gpt_say])
############################## <第 4 步设置一个token上限防止回答时Token溢出> ##################################
from crazy_functions.crazy_utils import input_clipping
from .crazy_utils import input_clipping
_, final_results = input_clipping("", final_results, max_token_limit=3200)
yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了

View File

@@ -1,21 +1,22 @@
from loguru import logger
from toolbox import update_ui
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
fast_debug = False
def 生成函数注释(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import time, os
logger.info('begin analysis on:', file_manifest)
print('begin analysis on:', file_manifest)
for index, fp in enumerate(file_manifest):
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
file_content = f.read()
i_say = f'请对下面的程序文件做一个概述并对文件中的所有函数生成注释使用markdown表格输出结果文件名是{os.path.relpath(fp, project_folder)},文件内容是 ```{file_content}```'
i_say_show_user = f'[{index+1}/{len(file_manifest)}] 请对下面的程序文件做一个概述,并对文件中的所有函数生成注释: {os.path.abspath(fp)}'
i_say_show_user = f'[{index}/{len(file_manifest)}] 请对下面的程序文件做一个概述,并对文件中的所有函数生成注释: {os.path.abspath(fp)}'
chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
if not fast_debug:
msg = '正常'
# ** gpt request **
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
@@ -24,8 +25,9 @@ def 生成函数注释(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
chatbot[-1] = (i_say_show_user, gpt_say)
history.append(i_say_show_user); history.append(gpt_say)
yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
time.sleep(2)
if not fast_debug: time.sleep(2)
if not fast_debug:
res = write_history_to_file(history)
promote_file_to_downloadzone(res, chatbot=chatbot)
chatbot.append(("完成了吗?", res))

View File

@@ -1,9 +1,6 @@
from toolbox import CatchException, update_ui, report_exception
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.plugin_template.plugin_class_template import (
GptAcademicPluginTemplate,
)
from crazy_functions.plugin_template.plugin_class_template import ArgProperty
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
import datetime
#以下是每类图表的PROMPT
SELECT_PROMPT = """
@@ -23,21 +20,19 @@ SELECT_PROMPT = """
#没有思维导图!!!测试发现模型始终会优先选择思维导图
#流程图
PROMPT_1 = """
请你给出围绕“{subject}”的逻辑关系图使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请你给出围绕“{subject}”的逻辑关系图使用mermaid语法mermaid语法举例
```mermaid
graph TD
P("编程") --> L1("Python")
P("编程") --> L2("C")
P("编程") --> L3("C++")
P("编程") --> L4("Javascipt")
P("编程") --> L5("PHP")
P(编程) --> L1(Python)
P(编程) --> L2(C)
P(编程) --> L3(C++)
P(编程) --> L4(Javascipt)
P(编程) --> L5(PHP)
```
"""
#序列图
PROMPT_2 = """
请你给出围绕“{subject}”的序列图使用mermaid语法
mermaid语法举例
请你给出围绕“{subject}”的序列图使用mermaid语法mermaid语法举例
```mermaid
sequenceDiagram
participant A as 用户
@@ -50,8 +45,7 @@ sequenceDiagram
"""
#类图
PROMPT_3 = """
请你给出围绕“{subject}”的类图使用mermaid语法
mermaid语法举例
请你给出围绕“{subject}”的类图使用mermaid语法mermaid语法举例
```mermaid
classDiagram
Class01 <|-- AveryLongClass : Cool
@@ -71,8 +65,7 @@ classDiagram
"""
#饼图
PROMPT_4 = """
请你给出围绕“{subject}”的饼图使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请你给出围绕“{subject}”的饼图使用mermaid语法mermaid语法举例
```mermaid
pie title Pets adopted by volunteers
"" : 386
@@ -82,39 +75,36 @@ pie title Pets adopted by volunteers
"""
#甘特图
PROMPT_5 = """
请你给出围绕“{subject}”的甘特图使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请你给出围绕“{subject}”的甘特图使用mermaid语法mermaid语法举例
```mermaid
gantt
title "项目开发流程"
title 项目开发流程
dateFormat YYYY-MM-DD
section "设计"
"需求分析" :done, des1, 2024-01-06,2024-01-08
"原型设计" :active, des2, 2024-01-09, 3d
"UI设计" : des3, after des2, 5d
section "开发"
"前端开发" :2024-01-20, 10d
"后端开发" :2024-01-20, 10d
section 设计
需求分析 :done, des1, 2024-01-06,2024-01-08
原型设计 :active, des2, 2024-01-09, 3d
UI设计 : des3, after des2, 5d
section 开发
前端开发 :2024-01-20, 10d
后端开发 :2024-01-20, 10d
```
"""
#状态图
PROMPT_6 = """
请你给出围绕“{subject}”的状态图使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请你给出围绕“{subject}”的状态图使用mermaid语法mermaid语法举例
```mermaid
stateDiagram-v2
[*] --> "Still"
"Still" --> [*]
"Still" --> "Moving"
"Moving" --> "Still"
"Moving" --> "Crash"
"Crash" --> [*]
[*] --> Still
Still --> [*]
Still --> Moving
Moving --> Still
Moving --> Crash
Crash --> [*]
```
"""
#实体关系图
PROMPT_7 = """
请你给出围绕“{subject}”的实体关系图使用mermaid语法
mermaid语法举例
请你给出围绕“{subject}”的实体关系图使用mermaid语法mermaid语法举例
```mermaid
erDiagram
CUSTOMER ||--o{ ORDER : places
@@ -136,170 +126,116 @@ erDiagram
"""
#象限提示图
PROMPT_8 = """
请你给出围绕“{subject}”的象限图使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请你给出围绕“{subject}”的象限图使用mermaid语法mermaid语法举例
```mermaid
graph LR
A["Hard skill"] --> B("Programming")
A["Hard skill"] --> C("Design")
D["Soft skill"] --> E("Coordination")
D["Soft skill"] --> F("Communication")
A[Hard skill] --> B(Programming)
A[Hard skill] --> C(Design)
D[Soft skill] --> E(Coordination)
D[Soft skill] --> F(Communication)
```
"""
#思维导图
PROMPT_9 = """
{subject}
==========
请给出上方内容的思维导图充分考虑其之间的逻辑使用mermaid语法注意需要使用双引号将内容括起来。
mermaid语法举例
请给出上方内容的思维导图充分考虑其之间的逻辑使用mermaid语法mermaid语法举例
```mermaid
mindmap
root((mindmap))
("Origins")
("Long history")
Origins
Long history
::icon(fa fa-book)
("Popularisation")
("British popular psychology author Tony Buzan")
::icon(fa fa-user)
("Research")
("On effectiveness<br/>and features")
::icon(fa fa-search)
("On Automatic creation")
::icon(fa fa-robot)
("Uses")
("Creative techniques")
::icon(fa fa-lightbulb-o)
("Strategic planning")
::icon(fa fa-flag)
("Argument mapping")
::icon(fa fa-comments)
("Tools")
("Pen and paper")
::icon(fa fa-pencil)
("Mermaid")
::icon(fa fa-code)
Popularisation
British popular psychology author Tony Buzan
Research
On effectiveness<br/>and features
On Automatic creation
Uses
Creative techniques
Strategic planning
Argument mapping
Tools
Pen and paper
Mermaid
```
"""
def 解析历史输入(history,llm_kwargs,file_manifest,chatbot,plugin_kwargs):
############################## <第 0 步,切割输入> ##################################
# 借用PDF切割中的函数对文本进行切割
TOKEN_LIMIT_PER_FRAGMENT = 2500
txt = (
str(history).encode("utf-8", "ignore").decode()
) # avoid reading non-utf8 chars
from crazy_functions.pdf_fns.breakdown_txt import (
breakdown_text_to_satisfy_token_limit,
)
txt = breakdown_text_to_satisfy_token_limit(
txt=txt, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs["llm_model"]
)
txt = str(history).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
txt = breakdown_text_to_satisfy_token_limit(txt=txt, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
############################## <第 1 步,迭代地历遍整个文章,提取精炼信息> ##################################
results = []
MAX_WORD_TOTAL = 4096
n_txt = len(txt)
last_iteration_result = "从以下文本中提取摘要。"
if n_txt >= 20: print('文章极长,不能达到预期效果')
for i in range(n_txt):
NUM_OF_WORD = MAX_WORD_TOTAL // n_txt
i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words in Chinese: {txt[i]}"
i_say_show_user = f"[{i+1}/{n_txt}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {txt[i][:200]} ...."
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
i_say,
i_say_show_user, # i_say=真正给chatgpt的提问 i_say_show_user=给用户看的提问
llm_kwargs,
chatbot,
history=[
"The main content of the previous section is?",
last_iteration_result,
], # 迭代上一次的结果
sys_prompt="Extracts the main content from the text section where it is located for graphing purposes, answer me with Chinese.", # 提示
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, # i_say=真正给chatgpt的提问 i_say_show_user=给用户看的提问
llm_kwargs, chatbot,
history=["The main content of the previous section is?", last_iteration_result], # 迭代上一次的结果
sys_prompt="Extracts the main content from the text section where it is located for graphing purposes, answer me with Chinese." # 提示
)
results.append(gpt_say)
last_iteration_result = gpt_say
############################## <第 2 步,根据整理的摘要选择图表类型> ##################################
gpt_say = str(plugin_kwargs) # 将图表类型参数赋值为插件参数
results_txt = "\n".join(results) # 合并摘要
if gpt_say not in [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
]: # 如插件参数不正确则使用对话模型判断
i_say_show_user = (
f"接下来将判断适合的图表类型,如连续3次判断失败将会使用流程图进行绘制"
)
gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say])
yield from update_ui(chatbot=chatbot, history=[]) # 更新UI
if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
gpt_say = plugin_kwargs.get("advanced_arg", "") #将图表类型参数赋值为插件参数
results_txt = '\n'.join(results) #合并摘要
if gpt_say not in ['1','2','3','4','5','6','7','8','9']: #如插件参数不正确则使用对话模型判断
i_say_show_user = f'接下来将判断适合的图表类型,如连续3次判断失败将会使用流程图进行绘制'; gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[]) # 更新UI
i_say = SELECT_PROMPT.format(subject=results_txt)
i_say_show_user = f'请判断适合使用的流程图类型,其中数字对应关系为:1-流程图,2-序列图,3-类图,4-饼图,5-甘特图,6-状态图,7-实体关系图,8-象限提示图。由于不管提供文本是什么,模型大概率认为"思维导图"最合适,因此思维导图仅能通过参数调用。'
for i in range(3):
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say,
inputs_show_user=i_say_show_user,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=[],
sys_prompt="",
llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
sys_prompt=""
)
if gpt_say in [
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
]: # 判断返回是否正确
if gpt_say in ['1','2','3','4','5','6','7','8','9']: #判断返回是否正确
break
if gpt_say not in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]:
gpt_say = "1"
if gpt_say not in ['1','2','3','4','5','6','7','8','9']:
gpt_say = '1'
############################## <第 3 步,根据选择的图表类型绘制图表> ##################################
if gpt_say == "1":
if gpt_say == '1':
i_say = PROMPT_1.format(subject=results_txt)
elif gpt_say == "2":
elif gpt_say == '2':
i_say = PROMPT_2.format(subject=results_txt)
elif gpt_say == "3":
elif gpt_say == '3':
i_say = PROMPT_3.format(subject=results_txt)
elif gpt_say == "4":
elif gpt_say == '4':
i_say = PROMPT_4.format(subject=results_txt)
elif gpt_say == "5":
elif gpt_say == '5':
i_say = PROMPT_5.format(subject=results_txt)
elif gpt_say == "6":
elif gpt_say == '6':
i_say = PROMPT_6.format(subject=results_txt)
elif gpt_say == "7":
elif gpt_say == '7':
i_say = PROMPT_7.replace("{subject}", results_txt) #由于实体关系图用到了{}符号
elif gpt_say == "8":
elif gpt_say == '8':
i_say = PROMPT_8.format(subject=results_txt)
elif gpt_say == "9":
elif gpt_say == '9':
i_say = PROMPT_9.format(subject=results_txt)
i_say_show_user = f"请根据判断结果绘制相应的图表。如需绘制思维导图请使用参数调用,同时过大的图表可能需要复制到在线编辑器中进行渲染。"
i_say_show_user = f'请根据判断结果绘制相应的图表。如需绘制思维导图请使用参数调用,同时过大的图表可能需要复制到在线编辑器中进行渲染。'
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say,
inputs_show_user=i_say_show_user,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=[],
sys_prompt="",
llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
sys_prompt=""
)
history.append(gpt_say)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
@CatchException
def 生成多种Mermaid图表(
txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port
):
def 生成多种Mermaid图表(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
"""
txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
llm_kwargs gpt模型参数如温度和top_p等一般原样传递下去就行
@@ -312,21 +248,15 @@ def 生成多种Mermaid图表(
import os
# 基本信息:功能、贡献者
chatbot.append(
[
chatbot.append([
"函数插件功能?",
"根据当前聊天历史或指定的路径文件(文件内容优先)绘制多种mermaid图表将会由对话模型首先判断适合的图表类型随后绘制图表。\
\n您也可以使用插件参数指定绘制的图表类型,函数插件贡献者: Menghuan1918",
]
)
\n您也可以使用插件参数指定绘制的图表类型,函数插件贡献者: Menghuan1918"])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
if os.path.exists(txt): #如输入区无内容则直接解析历史记录
from crazy_functions.pdf_fns.parse_word import extract_text_from_files
file_exist, final_result, page_one, file_manifest, excption = (
extract_text_from_files(txt, chatbot, history)
)
file_exist, final_result, page_one, file_manifest, excption = extract_text_from_files(txt, chatbot, history)
else:
file_exist = False
excption = ""
@@ -334,104 +264,33 @@ def 生成多种Mermaid图表(
if excption != "":
if excption == "word":
report_exception(
chatbot,
history,
report_exception(chatbot, history,
a = f"解析项目: {txt}",
b=f"找到了.doc文件但是该文件格式不被支持请先转化为.docx格式。",
)
b = f"找到了.doc文件但是该文件格式不被支持请先转化为.docx格式。")
elif excption == "pdf":
report_exception(
chatbot,
history,
report_exception(chatbot, history,
a = f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。",
)
b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
elif excption == "word_pip":
report_exception(
chatbot,
history,
report_exception(chatbot, history,
a=f"解析项目: {txt}",
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade python-docx pywin32```。",
)
b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade python-docx pywin32```。")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
else:
if not file_exist:
history.append(txt) #如输入区不是文件则将输入区内容加入历史记录
i_say_show_user = f"首先你从历史记录中提取摘要。"
gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say])
yield from update_ui(chatbot=chatbot, history=history) # 更新UI
yield from 解析历史输入(
history, llm_kwargs, file_manifest, chatbot, plugin_kwargs
)
i_say_show_user = f'首先你从历史记录中提取摘要。'; gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=history) # 更新UI
yield from 解析历史输入(history,llm_kwargs,file_manifest,chatbot,plugin_kwargs)
else:
file_num = len(file_manifest)
for i in range(file_num): #依次处理文件
i_say_show_user = f"[{i+1}/{file_num}]处理文件{file_manifest[i]}"
gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say])
yield from update_ui(chatbot=chatbot, history=history) # 更新UI
i_say_show_user = f"[{i+1}/{file_num}]处理文件{file_manifest[i]}"; gpt_say = "[Local Message] 收到。" # 用户提示
chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=history) # 更新UI
history = [] #如输入区内容为文件则清空历史记录
history.append(final_result[i])
yield from 解析历史输入(
history, llm_kwargs, file_manifest, chatbot, plugin_kwargs
)
class Mermaid_Gen(GptAcademicPluginTemplate):
def __init__(self):
pass
def define_arg_selection_menu(self):
gui_definition = {
"Type_of_Mermaid": ArgProperty(
title="绘制的Mermaid图表类型",
options=[
"由LLM决定",
"流程图",
"序列图",
"类图",
"饼图",
"甘特图",
"状态图",
"实体关系图",
"象限提示图",
"思维导图",
],
default_value="由LLM决定",
description="选择'由LLM决定'时将由对话模型判断适合的图表类型(不包括思维导图),选择其他类型时将直接绘制指定的图表类型。",
type="dropdown",
).model_dump_json(),
}
return gui_definition
def execute(
txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request
):
options = [
"由LLM决定",
"流程图",
"序列图",
"类图",
"饼图",
"甘特图",
"状态图",
"实体关系图",
"象限提示图",
"思维导图",
]
plugin_kwargs = options.index(plugin_kwargs['Type_of_Mermaid'])
yield from 生成多种Mermaid图表(
txt,
llm_kwargs,
plugin_kwargs,
chatbot,
history,
system_prompt,
user_request,
)
yield from 解析历史输入(history,llm_kwargs,file_manifest,chatbot,plugin_kwargs)

View File

@@ -1,6 +1,6 @@
from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg, get_log_folder, get_user
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything
from loguru import logger
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything
install_msg ="""
1. python -m pip install torch --index-url https://download.pytorch.org/whl/cpu
@@ -40,7 +40,7 @@ def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
except Exception as e:
chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# from crazy_functions.crazy_utils import try_install_deps
# from .crazy_utils import try_install_deps
# try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
# yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
return
@@ -60,7 +60,7 @@ def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
# < -------------------预热文本向量化模组--------------- >
chatbot.append(['<br/>'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
logger.info('Checking Text2vec ...')
print('Checking Text2vec ...')
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
@@ -68,7 +68,7 @@ def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
# < -------------------构建知识库--------------- >
chatbot.append(['<br/>'.join(file_manifest), "正在构建知识库..."])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
logger.info('Establishing knowledge archive ...')
print('Establishing knowledge archive ...')
with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
kai = knowledge_archive_interface()
vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
@@ -93,7 +93,7 @@ def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
except Exception as e:
chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
# from crazy_functions.crazy_utils import try_install_deps
# from .crazy_utils import try_install_deps
# try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
# yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
return

View File

@@ -1,5 +1,5 @@
from toolbox import CatchException, update_ui
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
import requests
from bs4 import BeautifulSoup
from request_llms.bridge_all import model_info
@@ -23,8 +23,8 @@ def google(query, proxies):
item = {'title': title, 'link': link}
results.append(item)
# for r in results:
# print(r['link'])
for r in results:
print(r['link'])
return results
def scrape_text(url, proxies) -> str:

View File

@@ -1,5 +1,5 @@
from toolbox import CatchException, update_ui
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
import requests
from bs4 import BeautifulSoup
from request_llms.bridge_all import model_info
@@ -22,8 +22,8 @@ def bing_search(query, proxies=None):
item = {'title': title, 'link': link}
results.append(item)
# for r in results:
# print(r['link'])
for r in results:
print(r['link'])
return results

View File

@@ -64,7 +64,7 @@ def parseNotebook(filename, enable_markdown=1):
def ipynb解释(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
enable_markdown = plugin_kwargs.get("advanced_arg", "1")

View File

@@ -1,12 +1,12 @@
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import update_ui, promote_file_to_downloadzone, disable_auto_promotion
from toolbox import CatchException, report_exception, write_history_to_file
from shared_utils.fastapi_server import validate_path_safety
from crazy_functions.crazy_utils import input_clipping
from .crazy_utils import input_clipping
def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import os, copy
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
disable_auto_promotion(chatbot=chatbot)
summary_batch_isolation = True
inputs_array = []
@@ -23,7 +23,7 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
file_content = f.read()
prefix = "接下来请你逐文件分析下面的工程" if index==0 else ""
i_say = prefix + f'请对下面的程序文件做一个概述文件名是{os.path.relpath(fp, project_folder)},文件代码是 ```{file_content}```'
i_say_show_user = prefix + f'[{index+1}/{len(file_manifest)}] 请对下面的程序文件做一个概述: {fp}'
i_say_show_user = prefix + f'[{index}/{len(file_manifest)}] 请对下面的程序文件做一个概述: {fp}'
# 装载请求内容
inputs_array.append(i_say)
inputs_show_user_array.append(i_say_show_user)
@@ -128,7 +128,6 @@ def 解析一个Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -147,7 +146,6 @@ def 解析一个Matlab项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析Matlab项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -166,7 +164,6 @@ def 解析一个C项目的头文件(txt, llm_kwargs, plugin_kwargs, chatbot, his
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -187,7 +184,6 @@ def 解析一个C项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -210,7 +206,6 @@ def 解析一个Java项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
@@ -233,7 +228,6 @@ def 解析一个前端项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
@@ -263,7 +257,6 @@ def 解析一个Golang项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
@@ -285,7 +278,6 @@ def 解析一个Rust项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
@@ -306,7 +298,6 @@ def 解析一个Lua项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -329,7 +320,6 @@ def 解析一个CSharp项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
import glob, os
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
@@ -355,19 +345,15 @@ def 解析任意code项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys
pattern_except_suffix = [_.lstrip(" ^*.,").rstrip(" ,") for _ in txt_pattern.split(" ") if _ != "" and _.strip().startswith("^*.")]
pattern_except_suffix += ['zip', 'rar', '7z', 'tar', 'gz'] # 避免解析压缩文件
# 将要忽略匹配的文件名(例如: ^README.md)
pattern_except_name = [_.lstrip(" ^*,").rstrip(" ,").replace(".", r"\.") # 移除左边通配符,移除右侧逗号,转义点号
for _ in txt_pattern.split(" ") # 以空格分割
if (_ != "" and _.strip().startswith("^") and not _.strip().startswith("^*.")) # ^开始,但不是^*.开始
]
pattern_except_name = [_.lstrip(" ^*,").rstrip(" ,").replace(".", "\.") for _ in txt_pattern.split(" ") if _ != "" and _.strip().startswith("^") and not _.strip().startswith("^*.")]
# 生成正则表达式
pattern_except = r'/[^/]+\.(' + "|".join(pattern_except_suffix) + ')$'
pattern_except = '/[^/]+\.(' + "|".join(pattern_except_suffix) + ')$'
pattern_except += '|/(' + "|".join(pattern_except_name) + ')$' if pattern_except_name != [] else ''
history.clear()
import glob, os, re
if os.path.exists(txt):
project_folder = txt
validate_path_safety(project_folder, chatbot.get_user())
else:
if txt == "": txt = '空空如也的输入栏'
report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")

View File

@@ -1,5 +1,5 @@
from toolbox import CatchException, update_ui, get_conf
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
import datetime
@CatchException
def 同时问询(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):

View File

@@ -1,13 +1,11 @@
from toolbox import update_ui
from toolbox import CatchException, get_conf, markdown_convertion
from request_llms.bridge_all import predict_no_ui_long_connection
from crazy_functions.crazy_utils import input_clipping
from crazy_functions.agent_fns.watchdog import WatchDog
from crazy_functions.live_audio.aliyunASR import AliyunASR
from loguru import logger
from request_llms.bridge_all import predict_no_ui_long_connection
import threading, time
import numpy as np
from .live_audio.aliyunASR import AliyunASR
import json
import re
@@ -44,9 +42,9 @@ class AsyncGptTask():
gpt_say_partial = predict_no_ui_long_connection(inputs=i_say, llm_kwargs=llm_kwargs, history=history, sys_prompt=sys_prompt,
observe_window=observe_window[index], console_slience=True)
except ConnectionAbortedError as token_exceed_err:
logger.error('至少一个线程任务Token溢出而失败', e)
print('至少一个线程任务Token溢出而失败', e)
except Exception as e:
logger.error('至少一个线程任务意外失败', e)
print('至少一个线程任务意外失败', e)
def add_async_gpt_task(self, i_say, chatbot_index, llm_kwargs, history, system_prompt):
self.observe_future.append([""])

View File

@@ -1,18 +1,19 @@
from toolbox import update_ui
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file, promote_file_to_downloadzone
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
def 解析Paper(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
import time, glob, os
print('begin analysis on:', file_manifest)
for index, fp in enumerate(file_manifest):
with open(fp, 'r', encoding='utf-8', errors='replace') as f:
file_content = f.read()
prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
i_say_show_user = prefix + f'[{index+1}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
i_say_show_user = prefix + f'[{index}/{len(file_manifest)}] 请对下面的文章片段做一个概述: {os.path.abspath(fp)}'
chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -1,4 +1,4 @@
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import CatchException, report_exception, promote_file_to_downloadzone
from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
import logging

View File

@@ -0,0 +1,28 @@
# encoding: utf-8
# @Time : 2023/4/19
# @Author : Spike
# @Descr :
from toolbox import update_ui
from toolbox import CatchException, report_execption, write_results_to_file
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
@CatchException
def 猜你想问(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
if txt:
show_say = txt
prompt = txt+'\n回答完问题后,再列出用户可能提出的三个问题。'
else:
prompt = history[-1]+"\n分析上述回答,再列出用户可能提出的三个问题。"
show_say = '分析上述回答,再列出用户可能提出的三个问题。'
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=prompt,
inputs_show_user=show_say,
llm_kwargs=llm_kwargs,
chatbot=chatbot,
history=history,
sys_prompt=system_prompt
)
chatbot[-1] = (show_say, gpt_say)
history.extend([show_say, gpt_say])
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

View File

@@ -2,10 +2,6 @@ from toolbox import CatchException, update_ui
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
import datetime
####################################################################################################################
# Demo 1: 一个非常简单的插件 #########################################################################################
####################################################################################################################
高阶功能模板函数示意图 = f"""
```mermaid
flowchart TD
@@ -30,7 +26,7 @@ flowchart TD
"""
@CatchException
def 高阶功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request, num_day=5):
def 高阶功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
# 高阶功能模板函数示意图https://mermaid.live/edit#pako:eNptk1tvEkEYhv8KmattQpvlvOyFCcdeeaVXuoYssBwie8gyhCIlqVoLhrbbtAWNUpEGUkyMEDW2Fmn_DDOL_8LZHdOwxrnamX3f7_3mmZk6yKhZCfAgV1KrmYKoQ9fDuKC4yChX0nld1Aou1JzjznQ5fWmejh8LYHW6vG2a47YAnlCLNSIRolnenKBXI_zRIBrcuqRT890u7jZx7zMDt-AaMbnW1--5olGiz2sQjwfoQxsZL0hxplSSU0-rop4vrzmKR6O2JxYjHmwcL2Y_HDatVMkXlf86YzHbGY9bO5j8XE7O8Nsbc3iNB3ukL2SMcH-XIQBgWoVOZzxuOxOJOyc63EPGV6ZQLENVrznViYStTiaJ2vw2M2d9bByRnOXkgCnXylCSU5quyto_IcmkbdvctELmJ-j1ASW3uB3g5xOmKqVTmqr_Na3AtuS_dtBFm8H90XJyHkDDT7S9xXWb4HGmRChx64AOL5HRpUm411rM5uh4H78Z4V7fCZzytjZz2seto9XaNPFue07clLaVZF8UNLygJ-VES8lah_n-O-5Ozc7-77NzJ0-K0yr0ZYrmHdqAk50t2RbA4qq9uNohBASw7YpSgaRkLWCCAtxAlnRZLGbJba9bPwUAC5IsCYAnn1kpJ1ZKUACC0iBSsQLVBzUlA3ioVyQ3qGhZEUrxokiehAz4nFgqk1VNVABfB1uAD_g2_AGPl-W8nMcbCvsDblADfNCz4feyobDPy3rYEMtxwYYbPFNVUoHdCPmDHBv2cP4AMfrCbiBli-Q-3afv0X6WdsIjW2-10fgDy1SAig
@@ -47,7 +43,7 @@ def 高阶功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
"您正在调用插件:历史上的今天",
"[Local Message] 请注意,您正在调用一个[函数插件]的模板该函数面向希望实现更多有趣功能的开发者它可以作为创建新功能函数的模板该函数只有20多行代码。此外我们也提供可同步处理大量文件的多线程Demo供您参考。您若希望分享新的功能模组请不吝PR" + 高阶功能模板函数示意图))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间我们先及时地做一次界面更新
for i in range(int(num_day)):
for i in range(5):
currentMonth = (datetime.date.today() + datetime.timedelta(days=i)).month
currentDay = (datetime.date.today() + datetime.timedelta(days=i)).day
i_say = f'历史中哪些事件发生在{currentMonth}{currentDay}列举两条并发送相关图片。发送图片时请使用Markdown将Unsplash API中的PUT_YOUR_QUERY_HERE替换成描述该事件的一个最重要的单词。'
@@ -63,56 +59,6 @@ def 高阶功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, s
####################################################################################################################
# Demo 2: 一个带二级菜单的插件 #######################################################################################
####################################################################################################################
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
class Demo_Wrap(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
"""
gui_definition = {
"num_day":
ArgProperty(title="日期选择", options=["仅今天", "未来3天", "未来5天"], default_value="未来3天", description="", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
num_day = plugin_kwargs["num_day"]
if num_day == "仅今天": num_day = 1
if num_day == "未来3天": num_day = 3
if num_day == "未来5天": num_day = 5
yield from 高阶功能模板函数(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request, num_day=num_day)
####################################################################################################################
# Demo 3: 绘制脑图的Demo ############################################################################################
####################################################################################################################
PROMPT = """
请你给出围绕“{subject}”的逻辑关系图使用mermaid语法mermaid语法举例
```mermaid

View File

@@ -4,9 +4,9 @@
# 1. 请在以下方案中选择任意一种,然后删除其他的方案
# 2. 修改你选择的方案中的environment环境变量详情请见github wiki或者config.py
# 3. 选择一种暴露服务端口的方法,并对相应的配置做出修改:
# 方法1: 适用于Linux很方便可惜windows不支持与宿主的网络融合为一体,这个是默认配置
# 方法1: 适用于Linux很方便可惜windows不支持与宿主的网络融合为一体,这个是默认配置
# network_mode: "host"
# 方法2: 适用于所有系统包括Windows和MacOS端口映射把容器的端口映射到宿主的端口注意您需要先删除network_mode: "host",再追加以下内容)
# 方法2: 适用于所有系统包括Windows和MacOS端口映射把容器的端口映射到宿主的端口注意您需要先删除network_mode: "host",再追加以下内容)
# ports:
# - "12345:12345" # 注意12345必须与WEB_PORT环境变量相互对应
# 4. 最后`docker-compose up`运行
@@ -25,7 +25,7 @@
## ===================================================
## ===================================================
## 方案零 部署项目的全部能力这个是包含cuda和latex的大型镜像。如果您网速慢、硬盘小或没有显卡则不推荐使用这个
## 方案零 部署项目的全部能力这个是包含cuda和latex的大型镜像。如果您网速慢、硬盘小或没有显卡则不推荐使用这个
## ===================================================
version: '3'
services:
@@ -63,10 +63,10 @@ services:
# count: 1
# capabilities: [gpu]
# WEB_PORT暴露方法1: 适用于Linux与宿主的网络融合
# WEB_PORT暴露方法1: 适用于Linux与宿主的网络融合
network_mode: "host"
# WEB_PORT暴露方法2: 适用于所有系统端口映射
# WEB_PORT暴露方法2: 适用于所有系统端口映射
# ports:
# - "12345:12345" # 12345必须与WEB_PORT相互对应
@@ -75,8 +75,10 @@ services:
bash -c "python3 -u main.py"
## ===================================================
## 方案一 如果不需要运行本地模型(仅 chatgpt, azure, 星火, 千帆, claude 等在线大模型服务)
## 方案一 如果不需要运行本地模型(仅 chatgpt, azure, 星火, 千帆, claude 等在线大模型服务)
## ===================================================
version: '3'
services:
@@ -95,16 +97,16 @@ services:
# DEFAULT_WORKER_NUM: ' 10 '
# AUTHENTICATION: ' [("username", "passwd"), ("username2", "passwd2")] '
# 「WEB_PORT暴露方法1: 适用于Linux」与宿主的网络融合
# 与宿主的网络融合
network_mode: "host"
# 启动命令
# 不使用代理网络拉取最新代码
command: >
bash -c "python3 -u main.py"
### ===================================================
### 方案二 如果需要运行ChatGLM + Qwen + MOSS等本地模型
### 方案二 如果需要运行ChatGLM + Qwen + MOSS等本地模型
### ===================================================
version: '3'
services:
@@ -128,10 +130,8 @@ services:
devices:
- /dev/nvidia0:/dev/nvidia0
# 「WEB_PORT暴露方法1: 适用于Linux」与宿主的网络融合
# 与宿主的网络融合
network_mode: "host"
# 启动命令
command: >
bash -c "python3 -u main.py"
@@ -139,9 +139,8 @@ services:
# command: >
# bash -c "pip install -r request_llms/requirements_qwen.txt && python3 -u main.py"
### ===================================================
### 方案三 如果需要运行ChatGPT + LLAMA + 盘古 + RWKV本地模型
### 方案三 如果需要运行ChatGPT + LLAMA + 盘古 + RWKV本地模型
### ===================================================
version: '3'
services:
@@ -165,22 +164,21 @@ services:
devices:
- /dev/nvidia0:/dev/nvidia0
# 「WEB_PORT暴露方法1: 适用于Linux」与宿主的网络融合
# 与宿主的网络融合
network_mode: "host"
# 启动命令
# 不使用代理网络拉取最新代码
command: >
python3 -u main.py
## ===================================================
## 方案四 ChatGPT + Latex
## 方案四 ChatGPT + Latex
## ===================================================
version: '3'
services:
gpt_academic_with_latex:
image: ghcr.io/binary-husky/gpt_academic_with_latex:master # (Auto Built by Dockerfile: docs/GithubAction+NoLocal+Latex)
# 对于ARM64设备请将以上镜像名称替换为 ghcr.io/binary-husky/gpt_academic_with_latex_arm:master
environment:
# 请查阅 `config.py` 以查看所有的配置信息
API_KEY: ' sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx '
@@ -192,16 +190,16 @@ services:
DEFAULT_WORKER_NUM: ' 10 '
WEB_PORT: ' 12303 '
# 「WEB_PORT暴露方法1: 适用于Linux」与宿主的网络融合
# 与宿主的网络融合
network_mode: "host"
# 启动命令
# 不使用代理网络拉取最新代码
command: >
bash -c "python3 -u main.py"
## ===================================================
## 方案五 ChatGPT + 语音助手 (请先阅读 docs/use_audio.md
## 方案五 ChatGPT + 语音助手 (请先阅读 docs/use_audio.md
## ===================================================
version: '3'
services:
@@ -225,9 +223,9 @@ services:
# (无需填写) ALIYUN_ACCESSKEY: ' LTAI5q6BrFUzoRXVGUWnekh1 '
# (无需填写) ALIYUN_SECRET: ' eHmI20AVWIaQZ0CiTD2bGQVsaP9i68 '
# 「WEB_PORT暴露方法1: 适用于Linux」与宿主的网络融合
# 与宿主的网络融合
network_mode: "host"
# 启动命令
# 不使用代理网络拉取最新代码
command: >
bash -c "python3 -u main.py"

View File

@@ -0,0 +1 @@
# 此Dockerfile不再维护请前往docs/GithubAction+JittorLLMs

View File

@@ -3,9 +3,6 @@
# 从NVIDIA源从而支持显卡检查宿主的nvidia-smi中的cuda版本必须>=11.3
FROM fuqingxu/11.3.1-runtime-ubuntu20.04-with-texlive:latest
# edge-tts需要的依赖某些pip包所需的依赖
RUN apt update && apt install ffmpeg build-essential -y
# use python3 as the system default python
WORKDIR /gpt
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8

Some files were not shown because too many files have changed in this diff Show More