From 0a88d18c7a6b05b5f2fa50f336f947881c26730b Mon Sep 17 00:00:00 2001 From: binary-husky Date: Tue, 21 May 2024 08:51:29 +0000 Subject: [PATCH] secondary menu for pdf trans --- check_proxy.py | 4 +- crazy_functional.py | 8 +- ...量Markdown翻译.py => Markdown_Translate.py} | 6 +- crazy_functions/PDF_Translate.py | 362 +++--------------- crazy_functions/PDF_Translate_Wrap.py | 17 +- crazy_functions/crazy_utils.py | 2 +- crazy_functions/pdf_fns/parse_pdf.py | 6 +- crazy_functions/pdf_fns/parse_pdf_grobid.py | 26 ++ crazy_functions/pdf_fns/parse_pdf_legacy.py | 110 ++++++ .../pdf_fns/parse_pdf_via_doc2x.py | 160 ++++++++ .../plugin_template/plugin_class_template.py | 6 +- crazy_functions/批量翻译PDF文档_NOUGAT.py | 2 +- docs/self_analysis.md | 10 +- docs/translate_english.json | 2 +- docs/translate_japanese.json | 2 +- docs/translate_std.json | 2 +- docs/translate_traditionalchinese.json | 2 +- colorful.py => shared_utils/colorful.py | 0 shared_utils/config_loader.py | 2 +- shared_utils/connect_void_terminal.py | 4 +- shared_utils/cookie_manager.py | 34 +- tests/test_plugins.py | 4 +- themes/common.js | 55 ++- themes/gui_advanced_plugin_class.py | 7 +- 24 files changed, 461 insertions(+), 372 deletions(-) rename crazy_functions/{批量Markdown翻译.py => Markdown_Translate.py} (98%) create mode 100644 crazy_functions/pdf_fns/parse_pdf_grobid.py create mode 100644 crazy_functions/pdf_fns/parse_pdf_legacy.py create mode 100644 crazy_functions/pdf_fns/parse_pdf_via_doc2x.py rename colorful.py => shared_utils/colorful.py (100%) diff --git a/check_proxy.py b/check_proxy.py index 99592f7e..3a42a739 100644 --- a/check_proxy.py +++ b/check_proxy.py @@ -71,7 +71,7 @@ def patch_and_restart(path): import sys import time import glob - from colorful import print亮黄, print亮绿, print亮红 + from shared_utils.colorful import print亮黄, print亮绿, print亮红 # if not using config_private, move origin config.py as config_private.py if not os.path.exists('config_private.py'): print亮黄('由于您没有设置config_private.py私密配置,现将您的现有配置移动至config_private.py以防止配置丢失,', @@ -124,7 +124,7 @@ def auto_update(raise_error=False): current_version = f.read() current_version = json.loads(current_version)['version'] if (remote_version - current_version) >= 0.01-1e-5: - from colorful import print亮黄 + from shared_utils.colorful import print亮黄 print亮黄(f'\n新版本可用。新版本:{remote_version},当前版本:{current_version}。{new_feature}') print('(1)Github更新地址:\nhttps://github.com/binary-husky/chatgpt_academic\n') user_instruction = input('(2)是否一键更新代码(Y+回车=确认,输入其他/无输入+回车=不更新)?') diff --git a/crazy_functional.py b/crazy_functional.py index 8f57bdd0..3c65d7e4 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -25,14 +25,14 @@ def get_crazy_functions(): from crazy_functions.对话历史存档 import 载入对话历史存档 from crazy_functions.对话历史存档 import 删除所有本地对话历史记录 from crazy_functions.辅助功能 import 清除缓存 - from crazy_functions.批量Markdown翻译 import Markdown英译中 + from crazy_functions.Markdown_Translate import Markdown英译中 from crazy_functions.批量总结PDF文档 import 批量总结PDF文档 from crazy_functions.PDF_Translate import 批量翻译PDF文档 from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入 from crazy_functions.Latex全文润色 import Latex中文润色 from crazy_functions.Latex全文润色 import Latex英文纠错 - from crazy_functions.批量Markdown翻译 import Markdown中译英 + from crazy_functions.Markdown_Translate import Markdown中译英 from crazy_functions.虚空终端 import 虚空终端 from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表 @@ -209,7 +209,7 @@ def get_crazy_functions(): "精准翻译PDF论文": { "Group": "学术", "Color": "stop", - "AsButton": False, + "AsButton": True, "Info": "精准翻译PDF论文为中文 | 输入参数为路径", "Function": None, "Class": PDF_Tran, # 新一代插件都会写成 class @@ -461,7 +461,7 @@ def get_crazy_functions(): print("Load function plugin failed") try: - from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言 + from crazy_functions.Markdown_Translate import Markdown翻译指定语言 function_plugins.update( { diff --git a/crazy_functions/批量Markdown翻译.py b/crazy_functions/Markdown_Translate.py similarity index 98% rename from crazy_functions/批量Markdown翻译.py rename to crazy_functions/Markdown_Translate.py index 61dd4b56..59f26fee 100644 --- a/crazy_functions/批量Markdown翻译.py +++ b/crazy_functions/Markdown_Translate.py @@ -72,17 +72,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] + sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)] elif language == 'zh->en': inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] + sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)] else: inputs_array = [f"This is a Markdown file, translate it into {language}, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" + f"\n\n{frag}" for frag in pfg.sp_file_contents] inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] + sys_prompt_array = ["You are a professional academic paper translator." + plugin_kwargs.get("additional_prompt", "") for _ in range(n_split)] gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( inputs_array=inputs_array, diff --git a/crazy_functions/PDF_Translate.py b/crazy_functions/PDF_Translate.py index 88fb3f28..0f93c821 100644 --- a/crazy_functions/PDF_Translate.py +++ b/crazy_functions/PDF_Translate.py @@ -1,15 +1,12 @@ -from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages -from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion -from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive -from toolbox import generate_file_link, zip_folder, trimmed_format_exc, trimmed_format_exc_markdown -from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency -from .crazy_utils import read_and_clean_pdf_text -from .crazy_utils import get_files_from_everything -from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf -from colorful import * -import os - +from toolbox import CatchException, check_packages, get_conf +from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion +from toolbox import trimmed_format_exc_markdown +from crazy_functions.crazy_utils import get_files_from_everything +from crazy_functions.pdf_fns.parse_pdf import get_avail_grobid_url +from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_基于DOC2X +from crazy_functions.pdf_fns.parse_pdf_legacy import 解析PDF_简单拆解 +from crazy_functions.pdf_fns.parse_pdf_grobid import 解析PDF_基于GROBID +from shared_utils.colorful import * @CatchException def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): @@ -23,319 +20,64 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst try: check_packages(["fitz", "tiktoken", "scipdf"]) except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf tiktoken scipdf_parser```。") + chatbot.append([None, f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf tiktoken scipdf_parser```。"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return # 清空历史,以免输入溢出 history = [] - success, file_manifest, project_folder = get_files_from_everything(txt, type='.pdf') + # 检测输入参数,如没有给定输入参数,直接退出 - if not success: - if txt == "": txt = '空空如也的输入栏' + if (not success) and txt == "": txt = '空空如也的输入栏。提示:请先上传文件(把PDF文件拖入对话)。' # 如果没找到任何文件 if len(file_manifest) == 0: - report_exception(chatbot, history, - a=f"解析项目: {txt}", b=f"找不到任何.pdf拓展名的文件: {txt}") + chatbot.append([None, f"找不到任何.pdf拓展名的文件: {txt}"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return # 开始正式执行任务 - DOC2X_API_KEY = get_conf("DOC2X_API_KEY") - # ------- 第一种方法,效果最好,但是需要DOC2X服务 ------- - if len(DOC2X_API_KEY) != 0: - try: - yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) - return - except: - chatbot.append([None, f"DOC2X服务不可用,现在将执行效果稍差的旧版代码。{trimmed_format_exc_markdown()}"]) - yield from update_ui(chatbot=chatbot, history=history) + method = plugin_kwargs.get("pdf_parse_method", None) + if method == "DOC2X": + # ------- 第一种方法,效果最好,但是需要DOC2X服务 ------- + DOC2X_API_KEY = get_conf("DOC2X_API_KEY") + if len(DOC2X_API_KEY) != 0: + try: + yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) + return + except: + chatbot.append([None, f"DOC2X服务不可用,现在将执行效果稍差的旧版代码。{trimmed_format_exc_markdown()}"]) + yield from update_ui(chatbot=chatbot, history=history) - # ------- 第二种方法,效果次优 ------- - grobid_url = get_avail_grobid_url() - if grobid_url is not None: - yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url) + if method == "GROBID": + # ------- 第二种方法,效果次优 ------- + grobid_url = get_avail_grobid_url() + if grobid_url is not None: + yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url) + return + + if method == "ClASSIC": + # ------- 第三种方法,早期代码,效果不理想 ------- + yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3) + yield from 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) return - # ------- 第三种方法,早期代码,效果不理想 ------- - yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3) - yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) - return - - - -def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): - - def refresh_key(doc2x_api_key): - import requests, json - url = "https://api.doc2x.noedgeai.com/api/token/refresh" - res = requests.post( - url, - headers={"Authorization": "Bearer " + doc2x_api_key} - ) - res_json = [] - if res.status_code == 200: - decoded = res.content.decode("utf-8") - res_json = json.loads(decoded) - doc2x_api_key = res_json['data']['token'] - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - return doc2x_api_key - - def pdf2markdown(filepath): - import requests, json, os - markdown_dir = get_log_folder(plugin_name="pdf_ocr") - doc2x_api_key = DOC2X_API_KEY - if doc2x_api_key.startswith('sk-'): - url = "https://api.doc2x.noedgeai.com/api/v1/pdf" - else: - doc2x_api_key = refresh_key(doc2x_api_key) - url = "https://api.doc2x.noedgeai.com/api/platform/pdf" - - chatbot.append((None, "加载PDF文件,发送至DOC2X解析...")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - res = requests.post( - url, - files={"file": open(filepath, "rb")}, - data={"ocr": "1"}, - headers={"Authorization": "Bearer " + doc2x_api_key} - ) - res_json = [] - if res.status_code == 200: - decoded = res.content.decode("utf-8") - for z_decoded in decoded.split('\n'): - if len(z_decoded) == 0: continue - assert z_decoded.startswith("data: ") - z_decoded = z_decoded[len("data: "):] - decoded_json = json.loads(z_decoded) - res_json.append(decoded_json) - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - uuid = res_json[0]['uuid'] - to = "md" # latex, md, docx - url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to - - chatbot.append((None, f"读取解析: {url} ...")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) - md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip') - if res.status_code == 200: - with open(md_zip_path, "wb") as f: f.write(res.content) - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) - chatbot.append((None, f"完成解析 {md_zip_path} ...")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return md_zip_path - - def deliver_to_markdown_plugin(md_zip_path, user_request): - from crazy_functions.批量Markdown翻译 import Markdown英译中 - import shutil, re - - time_tag = gen_time_str() - target_path_base = get_log_folder(chatbot.get_user()) - file_origin_name = os.path.basename(md_zip_path) - this_file_path = os.path.join(target_path_base, file_origin_name) - os.makedirs(target_path_base, exist_ok=True) - shutil.copyfile(md_zip_path, this_file_path) - ex_folder = this_file_path + ".extract" - extract_archive( - file_path=this_file_path, dest_dir=ex_folder - ) - - # edit markdown files - success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md') - for generated_fp in file_manifest: - # 修正一些公式问题 - with open(generated_fp, 'r', encoding='utf8') as f: - content = f.read() - # 将公式中的\[ \]替换成$$ - content = content.replace(r'\[', r'$$').replace(r'\]', r'$$') - # 将公式中的\( \)替换成$ - content = content.replace(r'\(', r'$').replace(r'\)', r'$') - content = content.replace('```markdown', '\n').replace('```', '\n') - with open(generated_fp, 'w', encoding='utf8') as f: - f.write(content) - promote_file_to_downloadzone(generated_fp, chatbot=chatbot) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 生成在线预览html - file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' - preview_fp = os.path.join(ex_folder, file_name) - from shared_utils.advanced_markdown_format import markdown_convertion_for_file - with open(generated_fp, "r", encoding="utf-8") as f: - md = f.read() - # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 - md = re.sub(r'^', r'😃
', md, flags=re.MULTILINE) - html = markdown_convertion_for_file(md) - with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) - chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"]) - promote_file_to_downloadzone(preview_fp, chatbot=chatbot) - - - - chatbot.append((None, f"调用Markdown插件 {ex_folder} ...")) - plugin_kwargs['markdown_expected_output_dir'] = ex_folder - - translated_f_name = 'translated_markdown.md' - generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) - if os.path.exists(generated_fp): - # 修正一些公式问题 - with open(generated_fp, 'r', encoding='utf8') as f: content = f.read() - content = content.replace('```markdown', '\n').replace('```', '\n') - # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 - content = re.sub(r'^
', r'😃
', content, flags=re.MULTILINE) - with open(generated_fp, 'w', encoding='utf8') as f: f.write(content) - # 生成在线预览html - file_name = '在线预览翻译' + gen_time_str() + '.html' - preview_fp = os.path.join(ex_folder, file_name) - from shared_utils.advanced_markdown_format import markdown_convertion_for_file - with open(generated_fp, "r", encoding="utf-8") as f: - md = f.read() - html = markdown_convertion_for_file(md) - with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) - promote_file_to_downloadzone(preview_fp, chatbot=chatbot) - # 生成包含图片的压缩包 - dest_folder = get_log_folder(chatbot.get_user()) - zip_name = '翻译后的带图文档.zip' - zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name) - zip_fp = os.path.join(dest_folder, zip_name) - promote_file_to_downloadzone(zip_fp, chatbot=chatbot) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - md_zip_path = yield from pdf2markdown(fp) - yield from deliver_to_markdown_plugin(md_zip_path, user_request) - -def 解析PDF_DOC2X(file_manifest, *args): - for index, fp in enumerate(file_manifest): - yield from 解析PDF_DOC2X_单文件(fp, *args) - return - -def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url): - import copy, json - TOKEN_LIMIT_PER_FRAGMENT = 1024 - generated_conclusion_files = [] - generated_html_files = [] - DST_LANG = "中文" - from crazy_functions.pdf_fns.report_gen_html import construct_html - for index, fp in enumerate(file_manifest): - chatbot.append(["当前进度:", f"正在连接GROBID服务,请稍候: {grobid_url}\n如果等待时间过长,请修改config中的GROBID_URL,可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - article_dict = parse_pdf(fp, grobid_url) - grobid_json_res = os.path.join(get_log_folder(), gen_time_str() + "grobid.json") - with open(grobid_json_res, 'w+', encoding='utf8') as f: - f.write(json.dumps(article_dict, indent=4, ensure_ascii=False)) - promote_file_to_downloadzone(grobid_json_res, chatbot=chatbot) - - if article_dict is None: raise RuntimeError("解析PDF失败,请检查PDF是否损坏。") - yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG) - chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files))) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - -def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): - """ - 此函数已经弃用 - """ - import copy - TOKEN_LIMIT_PER_FRAGMENT = 1024 - generated_conclusion_files = [] - generated_html_files = [] - from crazy_functions.pdf_fns.report_gen_html import construct_html - for index, fp in enumerate(file_manifest): - # 读取PDF文件 - file_content, page_one = read_and_clean_pdf_text(fp) - file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars - page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars - - # 递归地切割PDF文件 - from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit - paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) - page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model']) - - # 为了更好的效果,我们剥离Introduction之后的部分(如果有) - paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] - - # 单线,获取文章meta信息 - paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=f"以下是一篇学术论文的基础信息,请从中提取出“标题”、“收录会议或期刊”、“作者”、“摘要”、“编号”、“作者邮箱”这六个部分。请用markdown格式输出,最后用中文翻译摘要部分。请提取:{paper_meta}", - inputs_show_user=f"请从{fp}中提取出“标题”、“收录会议或期刊”等基本信息。", - llm_kwargs=llm_kwargs, - chatbot=chatbot, history=[], - sys_prompt="Your job is to collect information from materials。", - ) - - # 多线,翻译 - gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=[ - f"你需要翻译以下内容:\n{frag}" for frag in paper_fragments], - inputs_show_user_array=[f"\n---\n 原文: \n\n {frag.replace('#', '')} \n---\n 翻译:\n " for frag in paper_fragments], - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=[[paper_meta] for _ in paper_fragments], - sys_prompt_array=[ - "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in paper_fragments], - # max_workers=5 # OpenAI所允许的最大并行过载 - ) - gpt_response_collection_md = copy.deepcopy(gpt_response_collection) - # 整理报告的格式 - for i,k in enumerate(gpt_response_collection_md): - if i%2==0: - gpt_response_collection_md[i] = f"\n\n---\n\n ## 原文[{i//2}/{len(gpt_response_collection_md)//2}]: \n\n {paper_fragments[i//2].replace('#', '')} \n\n---\n\n ## 翻译[{i//2}/{len(gpt_response_collection_md)//2}]:\n " - else: - gpt_response_collection_md[i] = gpt_response_collection_md[i] - final = ["一、论文概况\n\n---\n\n", paper_meta_info.replace('# ', '### ') + '\n\n---\n\n', "二、论文翻译", ""] - final.extend(gpt_response_collection_md) - create_report_file_name = f"{os.path.basename(fp)}.trans.md" - res = write_history_to_file(final, create_report_file_name) - promote_file_to_downloadzone(res, chatbot=chatbot) - - # 更新UI - generated_conclusion_files.append(f'{get_log_folder()}/{create_report_file_name}') - chatbot.append((f"{fp}完成了吗?", res)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # write html - try: - ch = construct_html() - orig = "" - trans = "" - gpt_response_collection_html = copy.deepcopy(gpt_response_collection) - for i,k in enumerate(gpt_response_collection_html): - if i%2==0: - gpt_response_collection_html[i] = paper_fragments[i//2].replace('#', '') - else: - gpt_response_collection_html[i] = gpt_response_collection_html[i] - final = ["论文概况", paper_meta_info.replace('# ', '### '), "二、论文翻译", ""] - final.extend(gpt_response_collection_html) - for i, k in enumerate(final): - if i%2==0: - orig = k - if i%2==1: - trans = k - ch.add_row(a=orig, b=trans) - create_report_file_name = f"{os.path.basename(fp)}.trans.html" - generated_html_files.append(ch.save_file(create_report_file_name)) - except: - from toolbox import trimmed_format_exc - print('writing html result failed:', trimmed_format_exc()) - - # 准备文件的下载 - for pdf_path in generated_conclusion_files: - # 重命名文件 - rename_file = f'翻译-{os.path.basename(pdf_path)}' - promote_file_to_downloadzone(pdf_path, rename_file=rename_file, chatbot=chatbot) - for html_path in generated_html_files: - # 重命名文件 - rename_file = f'翻译-{os.path.basename(html_path)}' - promote_file_to_downloadzone(html_path, rename_file=rename_file, chatbot=chatbot) - chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files))) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - + if method is None: + # ------- 以上三种方法都试一遍 ------- + DOC2X_API_KEY = get_conf("DOC2X_API_KEY") + if len(DOC2X_API_KEY) != 0: + try: + yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request) + return + except: + chatbot.append([None, f"DOC2X服务不可用,正在尝试GROBID。{trimmed_format_exc_markdown()}"]) + yield from update_ui(chatbot=chatbot, history=history) + grobid_url = get_avail_grobid_url() + if grobid_url is not None: + yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url) + return + yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3) + yield from 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + return diff --git a/crazy_functions/PDF_Translate_Wrap.py b/crazy_functions/PDF_Translate_Wrap.py index c3984896..7eaad37f 100644 --- a/crazy_functions/PDF_Translate_Wrap.py +++ b/crazy_functions/PDF_Translate_Wrap.py @@ -1,5 +1,6 @@ -from .PDF_Translate import 批量翻译PDF文档 from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty +from .PDF_Translate import 批量翻译PDF文档 + class PDF_Tran(GptAcademicPluginTemplate): def __init__(self): @@ -8,14 +9,16 @@ class PDF_Tran(GptAcademicPluginTemplate): def define_arg_selection_menu(self): gui_definition = { "main_input": - ArgProperty(title="PDF文件路径", description="上传文件后,会自动生成路径", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 - "advanced_arg": - ArgProperty(title="高级参数输入区", description="无", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 - "additional_01": - ArgProperty(title="附属参数", description="无", default_value="没有附属参数", type="string").model_dump_json(), # 高级参数输入区,自动同步 + ArgProperty(title="PDF文件路径", description="请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 + "additional_prompt": + ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 + "pdf_parse_method": + ArgProperty(title="PDF解析方法", options=["DOC2X", "GROBID", "ClASSIC"], description="无", default_value="GROBID", type="dropdown").model_dump_json(), } return gui_definition def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): - print(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) + main_input = plugin_kwargs["main_input"] + additional_prompt = plugin_kwargs["additional_prompt"] + pdf_parse_method = plugin_kwargs["pdf_parse_method"] yield from 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) \ No newline at end of file diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 83d10862..43f1dc80 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -349,7 +349,7 @@ def read_and_clean_pdf_text(fp): import fitz, copy import re import numpy as np - from colorful import print亮黄, print亮绿 + from shared_utils.colorful import print亮黄, print亮绿 fc = 0 # Index 0 文本 fs = 1 # Index 1 字体 fb = 2 # Index 2 框框 diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py index a1b66d0d..f41b2f9b 100644 --- a/crazy_functions/pdf_fns/parse_pdf.py +++ b/crazy_functions/pdf_fns/parse_pdf.py @@ -4,7 +4,7 @@ from toolbox import promote_file_to_downloadzone from toolbox import write_history_to_file, promote_file_to_downloadzone from toolbox import get_conf from toolbox import ProxyNetworkActivate -from colorful import * +from shared_utils.colorful import * import requests import random import copy @@ -72,7 +72,7 @@ def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chat generated_conclusion_files.append(res_path) return res_path -def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG): +def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG, plugin_kwargs={}): from crazy_functions.pdf_fns.report_gen_html import construct_html from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive @@ -138,7 +138,7 @@ def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_fi chatbot=chatbot, history_array=[meta for _ in inputs_array], sys_prompt_array=[ - "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array], + "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" + plugin_kwargs.get("additional_prompt", "") for _ in inputs_array], ) # -=-=-=-=-=-=-=-= 写出Markdown文件 -=-=-=-=-=-=-=-= produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files) diff --git a/crazy_functions/pdf_fns/parse_pdf_grobid.py b/crazy_functions/pdf_fns/parse_pdf_grobid.py new file mode 100644 index 00000000..bb42ce6b --- /dev/null +++ b/crazy_functions/pdf_fns/parse_pdf_grobid.py @@ -0,0 +1,26 @@ +import os +from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages +from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion +from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive +from crazy_functions.pdf_fns.parse_pdf import parse_pdf, translate_pdf + +def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url): + import copy, json + TOKEN_LIMIT_PER_FRAGMENT = 1024 + generated_conclusion_files = [] + generated_html_files = [] + DST_LANG = "中文" + from crazy_functions.pdf_fns.report_gen_html import construct_html + for index, fp in enumerate(file_manifest): + chatbot.append(["当前进度:", f"正在连接GROBID服务,请稍候: {grobid_url}\n如果等待时间过长,请修改config中的GROBID_URL,可修改成本地GROBID服务。"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + article_dict = parse_pdf(fp, grobid_url) + grobid_json_res = os.path.join(get_log_folder(), gen_time_str() + "grobid.json") + with open(grobid_json_res, 'w+', encoding='utf8') as f: + f.write(json.dumps(article_dict, indent=4, ensure_ascii=False)) + promote_file_to_downloadzone(grobid_json_res, chatbot=chatbot) + if article_dict is None: raise RuntimeError("解析PDF失败,请检查PDF是否损坏。") + yield from translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG, plugin_kwargs=plugin_kwargs) + chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files))) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + diff --git a/crazy_functions/pdf_fns/parse_pdf_legacy.py b/crazy_functions/pdf_fns/parse_pdf_legacy.py new file mode 100644 index 00000000..482cd055 --- /dev/null +++ b/crazy_functions/pdf_fns/parse_pdf_legacy.py @@ -0,0 +1,110 @@ +from toolbox import get_log_folder +from toolbox import update_ui, promote_file_to_downloadzone +from toolbox import write_history_to_file, promote_file_to_downloadzone +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +from crazy_functions.crazy_utils import read_and_clean_pdf_text +from shared_utils.colorful import * +import os + +def 解析PDF_简单拆解(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): + """ + 注意:此函数已经弃用!!新函数位于:crazy_functions/pdf_fns/parse_pdf.py + """ + import copy + TOKEN_LIMIT_PER_FRAGMENT = 1024 + generated_conclusion_files = [] + generated_html_files = [] + from crazy_functions.pdf_fns.report_gen_html import construct_html + for index, fp in enumerate(file_manifest): + # 读取PDF文件 + file_content, page_one = read_and_clean_pdf_text(fp) + file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + + # 递归地切割PDF文件 + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) + page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=page_one, limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model']) + + # 为了更好的效果,我们剥离Introduction之后的部分(如果有) + paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0] + + # 单线,获取文章meta信息 + paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=f"以下是一篇学术论文的基础信息,请从中提取出“标题”、“收录会议或期刊”、“作者”、“摘要”、“编号”、“作者邮箱”这六个部分。请用markdown格式输出,最后用中文翻译摘要部分。请提取:{paper_meta}", + inputs_show_user=f"请从{fp}中提取出“标题”、“收录会议或期刊”等基本信息。", + llm_kwargs=llm_kwargs, + chatbot=chatbot, history=[], + sys_prompt="Your job is to collect information from materials。", + ) + + # 多线,翻译 + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=[ + f"你需要翻译以下内容:\n{frag}" for frag in paper_fragments], + inputs_show_user_array=[f"\n---\n 原文: \n\n {frag.replace('#', '')} \n---\n 翻译:\n " for frag in paper_fragments], + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[paper_meta] for _ in paper_fragments], + sys_prompt_array=[ + "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" + plugin_kwargs.get("additional_prompt", "") + for _ in paper_fragments], + # max_workers=5 # OpenAI所允许的最大并行过载 + ) + gpt_response_collection_md = copy.deepcopy(gpt_response_collection) + # 整理报告的格式 + for i,k in enumerate(gpt_response_collection_md): + if i%2==0: + gpt_response_collection_md[i] = f"\n\n---\n\n ## 原文[{i//2}/{len(gpt_response_collection_md)//2}]: \n\n {paper_fragments[i//2].replace('#', '')} \n\n---\n\n ## 翻译[{i//2}/{len(gpt_response_collection_md)//2}]:\n " + else: + gpt_response_collection_md[i] = gpt_response_collection_md[i] + final = ["一、论文概况\n\n---\n\n", paper_meta_info.replace('# ', '### ') + '\n\n---\n\n', "二、论文翻译", ""] + final.extend(gpt_response_collection_md) + create_report_file_name = f"{os.path.basename(fp)}.trans.md" + res = write_history_to_file(final, create_report_file_name) + promote_file_to_downloadzone(res, chatbot=chatbot) + + # 更新UI + generated_conclusion_files.append(f'{get_log_folder()}/{create_report_file_name}') + chatbot.append((f"{fp}完成了吗?", res)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # write html + try: + ch = construct_html() + orig = "" + trans = "" + gpt_response_collection_html = copy.deepcopy(gpt_response_collection) + for i,k in enumerate(gpt_response_collection_html): + if i%2==0: + gpt_response_collection_html[i] = paper_fragments[i//2].replace('#', '') + else: + gpt_response_collection_html[i] = gpt_response_collection_html[i] + final = ["论文概况", paper_meta_info.replace('# ', '### '), "二、论文翻译", ""] + final.extend(gpt_response_collection_html) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{os.path.basename(fp)}.trans.html" + generated_html_files.append(ch.save_file(create_report_file_name)) + except: + from toolbox import trimmed_format_exc + print('writing html result failed:', trimmed_format_exc()) + + # 准备文件的下载 + for pdf_path in generated_conclusion_files: + # 重命名文件 + rename_file = f'翻译-{os.path.basename(pdf_path)}' + promote_file_to_downloadzone(pdf_path, rename_file=rename_file, chatbot=chatbot) + for html_path in generated_html_files: + # 重命名文件 + rename_file = f'翻译-{os.path.basename(html_path)}' + promote_file_to_downloadzone(html_path, rename_file=rename_file, chatbot=chatbot) + chatbot.append(("给出输出文件清单", str(generated_conclusion_files + generated_html_files))) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py new file mode 100644 index 00000000..180f0c56 --- /dev/null +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -0,0 +1,160 @@ +from toolbox import get_log_folder, gen_time_str +from toolbox import update_ui, promote_file_to_downloadzone +from toolbox import promote_file_to_downloadzone, extract_archive +from toolbox import generate_file_link, zip_folder +from crazy_functions.crazy_utils import get_files_from_everything +from shared_utils.colorful import * +import os + + +def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): + + def refresh_key(doc2x_api_key): + import requests, json + url = "https://api.doc2x.noedgeai.com/api/token/refresh" + res = requests.post( + url, + headers={"Authorization": "Bearer " + doc2x_api_key} + ) + res_json = [] + if res.status_code == 200: + decoded = res.content.decode("utf-8") + res_json = json.loads(decoded) + doc2x_api_key = res_json['data']['token'] + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + return doc2x_api_key + + def pdf2markdown(filepath): + import requests, json, os + markdown_dir = get_log_folder(plugin_name="pdf_ocr") + doc2x_api_key = DOC2X_API_KEY + if doc2x_api_key.startswith('sk-'): + url = "https://api.doc2x.noedgeai.com/api/v1/pdf" + else: + doc2x_api_key = refresh_key(doc2x_api_key) + url = "https://api.doc2x.noedgeai.com/api/platform/pdf" + + chatbot.append((None, "加载PDF文件,发送至DOC2X解析...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + res = requests.post( + url, + files={"file": open(filepath, "rb")}, + data={"ocr": "1"}, + headers={"Authorization": "Bearer " + doc2x_api_key} + ) + res_json = [] + if res.status_code == 200: + decoded = res.content.decode("utf-8") + for z_decoded in decoded.split('\n'): + if len(z_decoded) == 0: continue + assert z_decoded.startswith("data: ") + z_decoded = z_decoded[len("data: "):] + decoded_json = json.loads(z_decoded) + res_json.append(decoded_json) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + uuid = res_json[0]['uuid'] + to = "md" # latex, md, docx + url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to + + chatbot.append((None, f"读取解析: {url} ...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) + md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip') + if res.status_code == 200: + with open(md_zip_path, "wb") as f: f.write(res.content) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) + chatbot.append((None, f"完成解析 {md_zip_path} ...")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return md_zip_path + + def deliver_to_markdown_plugin(md_zip_path, user_request): + from crazy_functions.Markdown_Translate import Markdown英译中 + import shutil, re + + time_tag = gen_time_str() + target_path_base = get_log_folder(chatbot.get_user()) + file_origin_name = os.path.basename(md_zip_path) + this_file_path = os.path.join(target_path_base, file_origin_name) + os.makedirs(target_path_base, exist_ok=True) + shutil.copyfile(md_zip_path, this_file_path) + ex_folder = this_file_path + ".extract" + extract_archive( + file_path=this_file_path, dest_dir=ex_folder + ) + + # edit markdown files + success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md') + for generated_fp in file_manifest: + # 修正一些公式问题 + with open(generated_fp, 'r', encoding='utf8') as f: + content = f.read() + # 将公式中的\[ \]替换成$$ + content = content.replace(r'\[', r'$$').replace(r'\]', r'$$') + # 将公式中的\( \)替换成$ + content = content.replace(r'\(', r'$').replace(r'\)', r'$') + content = content.replace('```markdown', '\n').replace('```', '\n') + with open(generated_fp, 'w', encoding='utf8') as f: + f.write(content) + promote_file_to_downloadzone(generated_fp, chatbot=chatbot) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 生成在线预览html + file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' + preview_fp = os.path.join(ex_folder, file_name) + from shared_utils.advanced_markdown_format import markdown_convertion_for_file + with open(generated_fp, "r", encoding="utf-8") as f: + md = f.read() + # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 + md = re.sub(r'^
', r'😃
', md, flags=re.MULTILINE) + html = markdown_convertion_for_file(md) + with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) + chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"]) + promote_file_to_downloadzone(preview_fp, chatbot=chatbot) + + + + chatbot.append((None, f"调用Markdown插件 {ex_folder} ...")) + plugin_kwargs['markdown_expected_output_dir'] = ex_folder + + translated_f_name = 'translated_markdown.md' + generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) + if os.path.exists(generated_fp): + # 修正一些公式问题 + with open(generated_fp, 'r', encoding='utf8') as f: content = f.read() + content = content.replace('```markdown', '\n').replace('```', '\n') + # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 + content = re.sub(r'^
', r'😃
', content, flags=re.MULTILINE) + with open(generated_fp, 'w', encoding='utf8') as f: f.write(content) + # 生成在线预览html + file_name = '在线预览翻译' + gen_time_str() + '.html' + preview_fp = os.path.join(ex_folder, file_name) + from shared_utils.advanced_markdown_format import markdown_convertion_for_file + with open(generated_fp, "r", encoding="utf-8") as f: + md = f.read() + html = markdown_convertion_for_file(md) + with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) + promote_file_to_downloadzone(preview_fp, chatbot=chatbot) + # 生成包含图片的压缩包 + dest_folder = get_log_folder(chatbot.get_user()) + zip_name = '翻译后的带图文档.zip' + zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name) + zip_fp = os.path.join(dest_folder, zip_name) + promote_file_to_downloadzone(zip_fp, chatbot=chatbot) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + md_zip_path = yield from pdf2markdown(fp) + yield from deliver_to_markdown_plugin(md_zip_path, user_request) + +def 解析PDF_基于DOC2X(file_manifest, *args): + for index, fp in enumerate(file_manifest): + yield from 解析PDF_DOC2X_单文件(fp, *args) + return + + diff --git a/crazy_functions/plugin_template/plugin_class_template.py b/crazy_functions/plugin_template/plugin_class_template.py index 7a15c210..870204c6 100644 --- a/crazy_functions/plugin_template/plugin_class_template.py +++ b/crazy_functions/plugin_template/plugin_class_template.py @@ -1,12 +1,14 @@ import os, json, base64 from pydantic import BaseModel, Field from textwrap import dedent +from typing import List -class ArgProperty(BaseModel): +class ArgProperty(BaseModel): # PLUGIN_ARG_MENU title: str = Field(description="The title", default="") description: str = Field(description="The description", default="") default_value: str|float = Field(description="The default value", default="") type: str = Field(description="The type", default="") + options: List[str] = Field(default=[], description="List of options available for the argument") class GptAcademicPluginTemplate(): def __init__(self): @@ -41,8 +43,6 @@ class GptAcademicPluginTemplate(): raise ValueError("You can only have up to 8 arguments in the define_arg_selection") if "main_input" not in define_arg_selection: raise ValueError("You must have a 'main_input' in the define_arg_selection") - if "advanced_arg" not in define_arg_selection: - raise ValueError("You must have a 'main_input' in the define_arg_selection") DEFINE_ARG_INPUT_INTERFACE = json.dumps(define_arg_selection) # return dedent(""" diff --git a/crazy_functions/批量翻译PDF文档_NOUGAT.py b/crazy_functions/批量翻译PDF文档_NOUGAT.py index d5e33c2e..a124150f 100644 --- a/crazy_functions/批量翻译PDF文档_NOUGAT.py +++ b/crazy_functions/批量翻译PDF文档_NOUGAT.py @@ -5,7 +5,7 @@ from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency from .crazy_utils import read_and_clean_pdf_text from .pdf_fns.parse_pdf import parse_pdf, get_avail_grobid_url, translate_pdf -from colorful import * +from shared_utils.colorful import * import copy import os import math diff --git a/docs/self_analysis.md b/docs/self_analysis.md index d63179c4..b41222b6 100644 --- a/docs/self_analysis.md +++ b/docs/self_analysis.md @@ -25,7 +25,7 @@ | crazy_functions\对话历史存档.py | 将每次对话记录写入Markdown格式的文件中 | | crazy_functions\总结word文档.py | 对输入的word文档进行摘要生成 | | crazy_functions\总结音视频.py | 对输入的音视频文件进行摘要生成 | -| crazy_functions\批量Markdown翻译.py | 将指定目录下的Markdown文件进行中英文翻译 | +| crazy_functions\Markdown_Translate.py | 将指定目录下的Markdown文件进行中英文翻译 | | crazy_functions\批量总结PDF文档.py | 对PDF文件进行切割和摘要生成 | | crazy_functions\批量总结PDF文档pdfminer.py | 对PDF文件进行文本内容的提取和摘要生成 | | crazy_functions\PDF_Translate.py | 将指定目录下的PDF文件进行中英文翻译 | @@ -175,9 +175,9 @@ toolbox.py是一个工具类库,其中主要包含了一些函数装饰器和 该程序文件包括两个函数:split_audio_file()和AnalyAudio(),并且导入了一些必要的库并定义了一些工具函数。split_audio_file用于将音频文件分割成多个时长相等的片段,返回一个包含所有切割音频片段文件路径的列表,而AnalyAudio用来分析音频文件,通过调用whisper模型进行音频转文字并使用GPT模型对音频内容进行概述,最终将所有总结结果写入结果文件中。 -## [21/48] 请对下面的程序文件做一个概述: crazy_functions\批量Markdown翻译.py +## [21/48] 请对下面的程序文件做一个概述: crazy_functions\Markdown_Translate.py -该程序文件名为`批量Markdown翻译.py`,包含了以下功能:读取Markdown文件,将长文本分离开来,将Markdown文件进行翻译(英译中和中译英),整理结果并退出。程序使用了多线程以提高效率。程序使用了`tiktoken`依赖库,可能需要额外安装。文件中还有一些其他的函数和类,但与文件名所描述的功能无关。 +该程序文件名为`Markdown_Translate.py`,包含了以下功能:读取Markdown文件,将长文本分离开来,将Markdown文件进行翻译(英译中和中译英),整理结果并退出。程序使用了多线程以提高效率。程序使用了`tiktoken`依赖库,可能需要额外安装。文件中还有一些其他的函数和类,但与文件名所描述的功能无关。 ## [22/48] 请对下面的程序文件做一个概述: crazy_functions\批量总结PDF文档.py @@ -331,7 +331,7 @@ check_proxy.py, colorful.py, config.py, config_private.py, core_functional.py, c 这些程序源文件提供了基础的文本和语言处理功能、工具函数和高级插件,使 Chatbot 能够处理各种复杂的学术文本问题,包括润色、翻译、搜索、下载、解析等。 ## 用一张Markdown表格简要描述以下文件的功能: -crazy_functions\代码重写为全英文_多线程.py, crazy_functions\图片生成.py, crazy_functions\对话历史存档.py, crazy_functions\总结word文档.py, crazy_functions\总结音视频.py, crazy_functions\批量Markdown翻译.py, crazy_functions\批量总结PDF文档.py, crazy_functions\批量总结PDF文档pdfminer.py, crazy_functions\PDF_Translate.py, crazy_functions\理解PDF文档内容.py, crazy_functions\生成函数注释.py, crazy_functions\联网的ChatGPT.py, crazy_functions\解析JupyterNotebook.py, crazy_functions\解析项目源代码.py, crazy_functions\询问多个大语言模型.py, crazy_functions\读文章写摘要.py。根据以上分析,用一句话概括程序的整体功能。 +crazy_functions\代码重写为全英文_多线程.py, crazy_functions\图片生成.py, crazy_functions\对话历史存档.py, crazy_functions\总结word文档.py, crazy_functions\总结音视频.py, crazy_functions\Markdown_Translate.py, crazy_functions\批量总结PDF文档.py, crazy_functions\批量总结PDF文档pdfminer.py, crazy_functions\PDF_Translate.py, crazy_functions\理解PDF文档内容.py, crazy_functions\生成函数注释.py, crazy_functions\联网的ChatGPT.py, crazy_functions\解析JupyterNotebook.py, crazy_functions\解析项目源代码.py, crazy_functions\询问多个大语言模型.py, crazy_functions\读文章写摘要.py。根据以上分析,用一句话概括程序的整体功能。 | 文件名 | 功能简述 | | --- | --- | @@ -340,7 +340,7 @@ crazy_functions\代码重写为全英文_多线程.py, crazy_functions\图片生 | 对话历史存档.py | 将每次对话记录写入Markdown格式的文件中 | | 总结word文档.py | 对输入的word文档进行摘要生成 | | 总结音视频.py | 对输入的音视频文件进行摘要生成 | -| 批量Markdown翻译.py | 将指定目录下的Markdown文件进行中英文翻译 | +| Markdown_Translate.py | 将指定目录下的Markdown文件进行中英文翻译 | | 批量总结PDF文档.py | 对PDF文件进行切割和摘要生成 | | 批量总结PDF文档pdfminer.py | 对PDF文件进行文本内容的提取和摘要生成 | | PDF_Translate.py | 将指定目录下的PDF文件进行中英文翻译 | diff --git a/docs/translate_english.json b/docs/translate_english.json index 7669cff3..063c84bd 100644 --- a/docs/translate_english.json +++ b/docs/translate_english.json @@ -40,7 +40,7 @@ "载入对话历史存档": "LoadConversationHistoryArchive", "删除所有本地对话历史记录": "DeleteAllLocalConversationHistoryRecords", "Markdown英译中": "TranslateMarkdownFromEnglishToChinese", - "批量Markdown翻译": "BatchTranslateMarkdown", + "Markdown_Translate": "BatchTranslateMarkdown", "批量总结PDF文档": "BatchSummarizePDFDocuments", "批量总结PDF文档pdfminer": "BatchSummarizePDFDocumentsUsingPdfminer", "批量翻译PDF文档": "BatchTranslatePDFDocuments", diff --git a/docs/translate_japanese.json b/docs/translate_japanese.json index c1df398e..401457a4 100644 --- a/docs/translate_japanese.json +++ b/docs/translate_japanese.json @@ -40,7 +40,7 @@ "载入对话历史存档": "LoadConversationHistoryArchive", "删除所有本地对话历史记录": "DeleteAllLocalChatHistory", "Markdown英译中": "MarkdownTranslateFromEngToChi", - "批量Markdown翻译": "BatchTranslateMarkdown", + "Markdown_Translate": "BatchTranslateMarkdown", "批量总结PDF文档": "BatchSummarizePDFDocuments", "批量总结PDF文档pdfminer": "BatchSummarizePDFDocumentsUsingPDFMiner", "批量翻译PDF文档": "BatchTranslatePDFDocuments", diff --git a/docs/translate_std.json b/docs/translate_std.json index a8b255db..f624a04f 100644 --- a/docs/translate_std.json +++ b/docs/translate_std.json @@ -13,7 +13,7 @@ "代码重写为全英文_多线程": "RewriteCodeToEnglish_MultiThreaded", "解析一个CSharp项目": "ParsingCSharpProject", "删除所有本地对话历史记录": "DeleteAllLocalConversationHistoryRecords", - "批量Markdown翻译": "BatchTranslateMarkdown", + "Markdown_Translate": "BatchTranslateMarkdown", "连接bing搜索回答问题": "ConnectBingSearchAnswerQuestion", "Langchain知识库": "LangchainKnowledgeBase", "Latex输出PDF": "OutputPDFFromLatex", diff --git a/docs/translate_traditionalchinese.json b/docs/translate_traditionalchinese.json index c4e42857..a6bc674d 100644 --- a/docs/translate_traditionalchinese.json +++ b/docs/translate_traditionalchinese.json @@ -39,7 +39,7 @@ "载入对话历史存档": "LoadConversationHistoryArchive", "删除所有本地对话历史记录": "DeleteAllLocalConversationHistoryRecords", "Markdown英译中": "MarkdownEnglishToChinese", - "批量Markdown翻译": "BatchMarkdownTranslation", + "Markdown_Translate": "BatchMarkdownTranslation", "批量总结PDF文档": "BatchSummarizePDFDocuments", "批量总结PDF文档pdfminer": "BatchSummarizePDFDocumentsPdfminer", "批量翻译PDF文档": "BatchTranslatePDFDocuments", diff --git a/colorful.py b/shared_utils/colorful.py similarity index 100% rename from colorful.py rename to shared_utils/colorful.py diff --git a/shared_utils/config_loader.py b/shared_utils/config_loader.py index 1776998e..cf5d58cf 100644 --- a/shared_utils/config_loader.py +++ b/shared_utils/config_loader.py @@ -2,7 +2,7 @@ import importlib import time import os from functools import lru_cache -from colorful import print亮红, print亮绿, print亮蓝 +from shared_utils.colorful import print亮红, print亮绿, print亮蓝 pj = os.path.join default_user_name = 'default_user' diff --git a/shared_utils/connect_void_terminal.py b/shared_utils/connect_void_terminal.py index 3c84e337..8fdb1033 100644 --- a/shared_utils/connect_void_terminal.py +++ b/shared_utils/connect_void_terminal.py @@ -15,13 +15,13 @@ import os def get_plugin_handle(plugin_name): """ - e.g. plugin_name = 'crazy_functions.批量Markdown翻译->Markdown翻译指定语言' + e.g. plugin_name = 'crazy_functions.Markdown_Translate->Markdown翻译指定语言' """ import importlib assert ( "->" in plugin_name - ), "Example of plugin_name: crazy_functions.批量Markdown翻译->Markdown翻译指定语言" + ), "Example of plugin_name: crazy_functions.Markdown_Translate->Markdown翻译指定语言" module, fn_name = plugin_name.split("->") f_hot_reload = getattr(importlib.import_module(module, fn_name), fn_name) return f_hot_reload diff --git a/shared_utils/cookie_manager.py b/shared_utils/cookie_manager.py index c0994324..698e3926 100644 --- a/shared_utils/cookie_manager.py +++ b/shared_utils/cookie_manager.py @@ -90,23 +90,23 @@ def make_history_cache(): -""" -with gr.Row(): - txt = gr.Textbox(show_label=False, placeholder="Input question here.", elem_id='user_input_main').style(container=False) - txtx = gr.Textbox(show_label=False, placeholder="Input question here.", elem_id='user_input_main').style(container=False) -with gr.Row(): - btn_value = "Test" - elem_id = "TestCase" - variant = "primary" - input_list = [txt, txtx] - output_list = [txt, txtx] - input_name_list = ["txt(input)", "txtx(input)"] - output_name_list = ["txt", "txtx"] - js_callback = """(txt, txtx)=>{console.log(txt); console.log(txtx);}""" - def function(txt, txtx): - return "booo", "goooo" - create_button_with_javascript_callback(btn_value, elem_id, variant, js_callback, input_list, output_list, function, input_name_list, output_name_list) -""" +# """ +# with gr.Row(): +# txt = gr.Textbox(show_label=False, placeholder="Input question here.", elem_id='user_input_main').style(container=False) +# txtx = gr.Textbox(show_label=False, placeholder="Input question here.", elem_id='user_input_main').style(container=False) +# with gr.Row(): +# btn_value = "Test" +# elem_id = "TestCase" +# variant = "primary" +# input_list = [txt, txtx] +# output_list = [txt, txtx] +# input_name_list = ["txt(input)", "txtx(input)"] +# output_name_list = ["txt", "txtx"] +# js_callback = """(txt, txtx)=>{console.log(txt); console.log(txtx);}""" +# def function(txt, txtx): +# return "booo", "goooo" +# create_button_with_javascript_callback(btn_value, elem_id, variant, js_callback, input_list, output_list, function, input_name_list, output_name_list) +# """ def create_button_with_javascript_callback(btn_value, elem_id, variant, js_callback, input_list, output_list, function, input_name_list, output_name_list): import gradio as gr middle_ware_component = gr.Textbox(visible=False, elem_id=elem_id+'_buffer') diff --git a/tests/test_plugins.py b/tests/test_plugins.py index cab42fd7..d5f87cfa 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -43,7 +43,7 @@ if __name__ == "__main__": # plugin_test(plugin='crazy_functions.Latex全文润色->Latex英文润色', main_input="crazy_functions/test_project/latex/attention") - # plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown中译英', main_input="README.md") + # plugin_test(plugin='crazy_functions.Markdown_Translate->Markdown中译英', main_input="README.md") # plugin_test(plugin='crazy_functions.PDF_Translate->批量翻译PDF文档', main_input='crazy_functions/test_project/pdf_and_word/aaai.pdf') @@ -60,7 +60,7 @@ if __name__ == "__main__": # plugin_test(plugin='crazy_functions.数学动画生成manim->动画生成', main_input="A ball split into 2, and then split into 4, and finally split into 8.") # for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]: - # plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown翻译指定语言', main_input="README.md", advanced_arg={"advanced_arg": lang}) + # plugin_test(plugin='crazy_functions.Markdown_Translate->Markdown翻译指定语言', main_input="README.md", advanced_arg={"advanced_arg": lang}) # plugin_test(plugin='crazy_functions.知识库文件注入->知识库文件注入', main_input="./") diff --git a/themes/common.js b/themes/common.js index 204a692a..b5c4c5dd 100644 --- a/themes/common.js +++ b/themes/common.js @@ -1551,10 +1551,16 @@ async function generate_menu(guiBase64String, btnName){ hide_all_elem(); // 根据 gui_args, 使得对应参数项显现 let text_cnt = 0; + let dropdown_cnt = 0; + // PLUGIN_ARG_MENU for (const key in gui_args) { if (gui_args.hasOwnProperty(key)) { - const component_name = "plugin_arg_txt_" + text_cnt; - if (gui_args[key].type=='string'){ + + /////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////// Textbox //////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////// + if (gui_args[key].type=='string'){ // PLUGIN_ARG_MENU + const component_name = "plugin_arg_txt_" + text_cnt; push_data_to_gradio_component({ visible: true, label: gui_args[key].title + "(" + gui_args[key].description + ")", @@ -1579,6 +1585,26 @@ async function generate_menu(guiBase64String, btnName){ document.getElementById(component_name).parentNode.parentNode.style.display = ''; text_cnt += 1; } + + /////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////// Dropdown //////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////// + if (gui_args[key].type=='dropdown'){ // PLUGIN_ARG_MENU + const component_name = "plugin_arg_drop_" + dropdown_cnt; + push_data_to_gradio_component({ + visible: true, + choices: gui_args[key].options, + label: gui_args[key].title + "(" + gui_args[key].description + ")", + // label: gui_args[key].title, + placeholder: gui_args[key].description, + __type__: 'update' + }, component_name, "obj"); + push_data_to_gradio_component(gui_args[key].default_value, component_name, "obj"); + document.getElementById(component_name).parentNode.style.display = ''; + dropdown_cnt += 1; + } + + } } } @@ -1599,14 +1625,23 @@ async function execute_current_pop_up_plugin(){ let text_cnt = 0; for (const key in gui_args) { if (gui_args.hasOwnProperty(key)) { - if (gui_args[key].type=='string'){ + if (gui_args[key].type=='string'){ // PLUGIN_ARG_MENU corrisponding_elem_id = "plugin_arg_txt_"+text_cnt gui_args[key].user_confirmed_value = await get_data_from_gradio_component(corrisponding_elem_id); text_cnt += 1; } } } - + let dropdown_cnt = 0; + for (const key in gui_args) { + if (gui_args.hasOwnProperty(key)) { + if (gui_args[key].type=='dropdown'){ // PLUGIN_ARG_MENU + corrisponding_elem_id = "plugin_arg_drop_"+dropdown_cnt + gui_args[key].user_confirmed_value = await get_data_from_gradio_component(corrisponding_elem_id); + dropdown_cnt += 1; + } + } + } // close menu push_data_to_gradio_component({ visible: false, @@ -1621,6 +1656,7 @@ async function execute_current_pop_up_plugin(){ } function hide_all_elem(){ + // PLUGIN_ARG_MENU for (text_cnt = 0; text_cnt < 8; text_cnt++){ push_data_to_gradio_component({ visible: false, @@ -1629,9 +1665,19 @@ function hide_all_elem(){ }, "plugin_arg_txt_"+text_cnt, "obj"); document.getElementById("plugin_arg_txt_"+text_cnt).parentNode.parentNode.style.display = 'none'; } + for (dropdown_cnt = 0; dropdown_cnt < 8; dropdown_cnt++){ + push_data_to_gradio_component({ + visible: false, + choices: [], + label: "", + __type__: 'update' + }, "plugin_arg_drop_"+dropdown_cnt, "obj"); + document.getElementById("plugin_arg_drop_"+dropdown_cnt).parentNode.style.display = 'none'; + } } function close_current_pop_up_plugin(){ + // PLUGIN_ARG_MENU push_data_to_gradio_component({ visible: false, __type__: 'update' @@ -1639,6 +1685,7 @@ function close_current_pop_up_plugin(){ hide_all_elem(); } +// 生成高级插件的选择菜单 advanced_plugin_init_code_lib = {} function register_advanced_plugin_init_code(key, code){ advanced_plugin_init_code_lib[key] = code; diff --git a/themes/gui_advanced_plugin_class.py b/themes/gui_advanced_plugin_class.py index d72f866e..ac996dd9 100644 --- a/themes/gui_advanced_plugin_class.py +++ b/themes/gui_advanced_plugin_class.py @@ -9,9 +9,10 @@ def define_gui_advanced_plugin_class(plugins): for u in range(8): with gr.Row(): gr.Textbox(show_label=True, label="T1", placeholder="请输入", lines=1, visible=False, elem_id=f"plugin_arg_txt_{u}").style(container=False) - # for u in range(8): - # with gr.Row(): - # gr.Dropdown(label="T1", value="请选择", visible=False, elem_id=f"plugin_arg_drop_{u}").style(container=False) + for u in range(8): + with gr.Row(): # PLUGIN_ARG_MENU + gr.Dropdown(label="T1", value="请选择", choices=[], visible=True, elem_id=f"plugin_arg_drop_{u}", interactive=True) + with gr.Row(): # 这个隐藏textbox负责装入当前弹出插件的属性 gr.Textbox(show_label=False, placeholder="请输入", lines=1, visible=False,