改进Doc2X请求,并增加xelatex编译的支持 (#2058)
* doc2x请求函数格式清理 * 更新中间部分 * 添加doc2x超时设置并添加对xelatex编译的支持 * Bug修复以及增加对xelatex安装的检测 * 增强弱网环境下的稳定性 * 修复模型中_无法显示的问题 * add xelatex logs --------- Co-authored-by: binary-husky <qingxu.fu@outlook.com>
This commit is contained in:
@@ -47,7 +47,7 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
|
|||||||
yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
|
yield from 解析PDF_基于DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
|
||||||
return
|
return
|
||||||
except:
|
except:
|
||||||
chatbot.append([None, f"DOC2X服务不可用,现在将执行效果稍差的旧版代码。{trimmed_format_exc_markdown()}"])
|
chatbot.append([None, f"DOC2X服务不可用,请检查报错详细。{trimmed_format_exc_markdown()}"])
|
||||||
yield from update_ui(chatbot=chatbot, history=history)
|
yield from update_ui(chatbot=chatbot, history=history)
|
||||||
|
|
||||||
if method == "GROBID":
|
if method == "GROBID":
|
||||||
|
|||||||
@@ -300,7 +300,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
|
|||||||
write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder)
|
write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder)
|
||||||
|
|
||||||
# <-------- 写出文件 ---------->
|
# <-------- 写出文件 ---------->
|
||||||
msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。"
|
model_name = llm_kwargs['llm_model'].replace('_', '\\_') # 替换LLM模型名称中的下划线为转义字符
|
||||||
|
msg = f"当前大语言模型: {model_name},当前语言模型温度设定: {llm_kwargs['temperature']}。"
|
||||||
final_tex = lps.merge_result(pfg.file_result, mode, msg)
|
final_tex = lps.merge_result(pfg.file_result, mode, msg)
|
||||||
objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl'))
|
objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl'))
|
||||||
|
|
||||||
@@ -351,6 +352,41 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
|
|||||||
chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
|
chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
|
||||||
chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
|
chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
|
||||||
yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
|
||||||
|
# 检查是否需要使用xelatex
|
||||||
|
def check_if_need_xelatex(tex_path):
    """Decide whether the LaTeX source at ``tex_path`` should be compiled with xelatex.

    Only the first 5000 characters of the file are inspected (undecodable
    bytes are replaced instead of raising), and the check is a plain
    substring search for a fixed list of package names.

    Args:
        tex_path: Path of the ``.tex`` file to inspect.

    Returns:
        True if any of the listed package names occurs in the inspected
        text; False otherwise, including when the file cannot be read.
    """
    # Names whose presence is taken as a sign that xelatex is required.
    xelatex_markers = ('fontspec', 'xeCJK', 'xetex', 'unicode-math', 'xltxtra', 'xunicode')

    try:
        with open(tex_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read(5000)
    except Exception as e:
        # Best effort: an unreadable file still falls back to pdflatex,
        # but the reason is logged instead of being silently discarded.
        logger.warning(f"读取tex文件失败, 无法检测xelatex宏包, 使用pdflatex编译: {tex_path} ({e})")
        return False

    need_xelatex = any(pkg in content for pkg in xelatex_markers)
    if need_xelatex:
        logger.info("检测到宏包需要xelatex编译, 切换至xelatex编译")
    else:
        logger.info("未检测到宏包需要xelatex编译, 使用pdflatex编译")
    return need_xelatex
|
||||||
|
|
||||||
|
# 根据编译器类型返回编译命令
|
||||||
|
def get_compile_command(compiler, filename):
    """Build the LaTeX compile command line for ``filename`` and log it.

    Args:
        compiler: Name of the LaTeX engine executable, e.g. ``pdflatex``
            or ``xelatex``.
        filename: Name of the main file without the ``.tex`` extension.

    Returns:
        The command string
        ``<compiler> -interaction=batchmode -file-line-error <filename>.tex``.
    """
    compile_command = f'{compiler} -interaction=batchmode -file-line-error {filename}.tex'
    # The command must be part of the message itself: passing it as an extra
    # positional argument to a message without a `{}` placeholder drops it
    # from the log output.
    logger.info(f'Latex 编译指令: {compile_command}')
    return compile_command
|
||||||
|
|
||||||
|
# 确定使用的编译器
|
||||||
|
compiler = 'pdflatex'
|
||||||
|
if check_if_need_xelatex(pj(work_folder_modified, f'{main_file_modified}.tex')):
|
||||||
|
logger.info("检测到宏包需要xelatex编译,切换至xelatex编译")
|
||||||
|
# Check if xelatex is installed
|
||||||
|
try:
|
||||||
|
import subprocess
|
||||||
|
subprocess.run(['xelatex', '--version'], capture_output=True, check=True)
|
||||||
|
compiler = 'xelatex'
|
||||||
|
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||||
|
raise RuntimeError("检测到需要使用xelatex编译,但系统中未安装xelatex。请先安装texlive或其他提供xelatex的LaTeX发行版。")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
import os
|
import os
|
||||||
@@ -361,10 +397,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
|
|||||||
|
|
||||||
# https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
|
# https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
|
||||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
|
||||||
|
|
||||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
|
||||||
|
|
||||||
if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
|
if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
|
||||||
# 只有第二步成功,才能继续下面的步骤
|
# 只有第二步成功,才能继续下面的步骤
|
||||||
@@ -375,10 +411,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
|
|||||||
ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
|
ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
|
||||||
|
|
||||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_original), work_folder_original)
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, main_file_modified), work_folder_modified)
|
||||||
|
|
||||||
if mode!='translate_zh':
|
if mode!='translate_zh':
|
||||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
|
||||||
@@ -386,10 +422,10 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f
|
|||||||
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
|
ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
|
||||||
|
|
||||||
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
|
yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
|
||||||
ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
|
ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
|
||||||
ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
|
ok = compile_latex_with_timeout(get_compile_command(compiler, 'merge_diff'), work_folder)
|
||||||
|
|
||||||
# <---------- 检查结果 ----------->
|
# <---------- 检查结果 ----------->
|
||||||
results_ = ""
|
results_ = ""
|
||||||
|
|||||||
@@ -6,75 +6,128 @@ from crazy_functions.crazy_utils import get_files_from_everything
|
|||||||
from shared_utils.colorful import *
|
from shared_utils.colorful import *
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import os
|
import os
|
||||||
|
import requests
|
||||||
import time
|
import time
|
||||||
|
|
||||||
def refresh_key(doc2x_api_key):
    """Exchange a Doc2x API key for a refreshed token.

    Sends a POST to the Doc2x token-refresh endpoint with the key as a
    bearer token and reads the new token from ``data.token`` of the JSON
    response.

    Args:
        doc2x_api_key: The current Doc2x API key.

    Returns:
        The refreshed token taken from the response body.

    Raises:
        RuntimeError: If the endpoint answers with a status code other
            than 200.
    """
    import json
    import requests

    url = "https://api.doc2x.noedgeai.com/api/token/refresh"
    res = requests.post(
        url,
        headers={"Authorization": "Bearer " + doc2x_api_key},
        # Without a timeout a stalled connection would block forever.
        timeout=15,
    )
    if res.status_code != 200:
        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))

    res_json = json.loads(res.content.decode("utf-8"))
    return res_json['data']['token']
|
|
||||||
|
|
||||||
|
def retry_request(max_retries=3, delay=3):
    """Build a decorator that retries the wrapped callable when it raises.

    The wrapped callable is invoked up to ``max_retries`` times. After each
    failed attempt except the last one, the error is logged and the wrapper
    sleeps for ``delay`` seconds. The exception of the last attempt is
    re-raised unchanged.

    Args:
        max_retries: Maximum number of attempts. With a value of 0 or less
            the wrapped callable is never invoked and the wrapper returns
            None.
        delay: Delay between attempts in seconds.

    Returns:
        A decorator that applies the retry behaviour to a callable.
    """
    import functools

    def decorator(func):
        # Preserve the wrapped function's name and docstring.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt >= max_retries - 1:
                        # Last attempt: propagate with the original traceback.
                        raise
                    logger.error(
                        f"Request failed, retrying... ({attempt + 1}/{max_retries}) Error: {e}"
                    )
                    time.sleep(delay)
            # Only reached when the loop body never ran (max_retries <= 0).
            return None

        return wrapper

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
@retry_request()
def make_request(method, url, **kwargs):
    """Send one HTTP request through ``requests.request``.

    The call is wrapped by ``retry_request`` with its default settings, so
    an attempt that raises is repeated by that decorator.

    Args:
        method: HTTP method name passed to ``requests.request``.
        url: Target URL.
        **kwargs: Forwarded unchanged to ``requests.request``.

    Returns:
        The response object returned by ``requests.request``.
    """
    # NOTE(review): the same kwargs are reused on every retry; a file object
    # passed as `data` may already be consumed after a failed attempt -
    # confirm against callers that upload streams.
    response = requests.request(method, url, **kwargs)
    return response
|
||||||
|
|
||||||
|
|
||||||
|
def doc2x_api_response_status(response, uid=""):
    """Validate a Doc2x API response and return its ``data`` payload.

    Args:
        response: Response object exposing ``status_code``, ``headers``,
            ``text`` and ``json()``.
        uid: Optional text appended after the trace id in error messages.

    Returns:
        The ``data`` field of the JSON body, or an empty dict when the
        field is missing or null.

    Raises:
        RuntimeError: If the HTTP status is not 200, if the body is not a
            JSON object, if the ``code`` field reports a page or
            concurrency limit, or if ``code`` is neither ``ok`` nor
            ``success``.
    """
    trace_id = response.headers.get("trace-id", "Failed to get trace-id")

    # Error responses are not guaranteed to carry JSON, so a decode failure
    # must not mask the HTTP status reported below.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None

    if response.status_code != 200:
        body = response_json if response_json is not None else getattr(response, "text", "")
        raise RuntimeError(
            f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{response.status_code} - {body}"
        )
    if not isinstance(response_json, dict):
        raise RuntimeError(
            f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n"
            f"Invalid JSON body - {getattr(response, 'text', '')}"
        )

    # A null `data` field would otherwise break the `.get` lookup below.
    response_data = response_json.get("data")
    if response_data is None:
        response_data = {}
    code = response_json.get("code", "Unknown")
    if isinstance(response_data, dict):
        meg = response_data.get("message", response_json)
    else:
        meg = response_json

    if code in ["parse_page_limit_exceeded", "parse_concurrency_limit"]:
        raise RuntimeError(
            f"Reached the limit of Doc2x:\nTrace ID: {trace_id} {uid}\n{code} - {meg}"
        )
    if code not in ["ok", "success"]:
        raise RuntimeError(
            f"Doc2x return an error:\nTrace ID: {trace_id} {uid}\n{code} - {meg}"
        )
    return response_data
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X_转Latex(pdf_file_path):
    """Convert a PDF to LaTeX through ``解析PDF_DOC2X`` and return the extraction folder.

    Args:
        pdf_file_path: Path of the PDF file to convert.

    Returns:
        The second element of the pair returned by ``解析PDF_DOC2X`` when it
        is called with ``format="tex"`` (the unzipped folder); the zip path
        is discarded.
    """
    _, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format="tex")
    return unzipped_folder
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X(pdf_file_path, format='tex'):
|
def 解析PDF_DOC2X(pdf_file_path, format="tex"):
|
||||||
"""
|
"""
|
||||||
format: 'tex', 'md', 'docx'
|
format: 'tex', 'md', 'docx'
|
||||||
"""
|
"""
|
||||||
import requests, json, os
|
|
||||||
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
|
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
|
||||||
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
|
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
|
||||||
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
|
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
|
||||||
doc2x_api_key = DOC2X_API_KEY
|
doc2x_api_key = DOC2X_API_KEY
|
||||||
|
|
||||||
|
# < ------ 第1步:预上传获取URL,然后上传文件 ------ >
|
||||||
|
logger.info("Doc2x 上传文件:预上传获取URL")
|
||||||
|
res = make_request(
|
||||||
|
"POST",
|
||||||
|
"https://v2.doc2x.noedgeai.com/api/v2/parse/preupload",
|
||||||
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
|
timeout=15,
|
||||||
|
)
|
||||||
|
res_data = doc2x_api_response_status(res)
|
||||||
|
upload_url = res_data["url"]
|
||||||
|
uuid = res_data["uid"]
|
||||||
|
|
||||||
# < ------ 第1步:上传 ------ >
|
logger.info("Doc2x 上传文件:上传文件")
|
||||||
logger.info("Doc2x 第1步:上传")
|
with open(pdf_file_path, "rb") as file:
|
||||||
with open(pdf_file_path, 'rb') as file:
|
res = make_request("PUT", upload_url, data=file, timeout=60)
|
||||||
res = requests.post(
|
res.raise_for_status()
|
||||||
"https://v2.doc2x.noedgeai.com/api/v2/parse/pdf",
|
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key},
|
|
||||||
data=file
|
|
||||||
)
|
|
||||||
# res_json = []
|
|
||||||
if res.status_code == 200:
|
|
||||||
res_json = res.json()
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
|
||||||
uuid = res_json['data']['uid']
|
|
||||||
|
|
||||||
# < ------ 第2步:轮询等待 ------ >
|
# < ------ 第2步:轮询等待 ------ >
|
||||||
logger.info("Doc2x 第2步:轮询等待")
|
logger.info("Doc2x 处理文件中:轮询等待")
|
||||||
params = {'uid': uuid}
|
params = {"uid": uuid}
|
||||||
while True:
|
max_attempts = 60
|
||||||
res = requests.get(
|
attempt = 0
|
||||||
'https://v2.doc2x.noedgeai.com/api/v2/parse/status',
|
while attempt < max_attempts:
|
||||||
|
res = make_request(
|
||||||
|
"GET",
|
||||||
|
"https://v2.doc2x.noedgeai.com/api/v2/parse/status",
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key},
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
params=params
|
params=params,
|
||||||
|
timeout=15,
|
||||||
)
|
)
|
||||||
res_json = res.json()
|
res_data = doc2x_api_response_status(res)
|
||||||
if res_json['data']['status'] == "success":
|
if res_data["status"] == "success":
|
||||||
break
|
break
|
||||||
elif res_json['data']['status'] == "processing":
|
elif res_data["status"] == "processing":
|
||||||
time.sleep(3)
|
time.sleep(5)
|
||||||
logger.info(f"Doc2x is processing at {res_json['data']['progress']}%")
|
logger.info(f"Doc2x is processing at {res_data['progress']}%")
|
||||||
elif res_json['data']['status'] == "failed":
|
attempt += 1
|
||||||
raise RuntimeError(f"Doc2x return an error: {res_json}")
|
else:
|
||||||
|
raise RuntimeError(f"Doc2x return an error: {res_data}")
|
||||||
|
if attempt >= max_attempts:
|
||||||
|
raise RuntimeError("Doc2x processing timeout after maximum attempts")
|
||||||
|
|
||||||
# < ------ 第3步:提交转化 ------ >
|
# < ------ 第3步:提交转化 ------ >
|
||||||
logger.info("Doc2x 第3步:提交转化")
|
logger.info("Doc2x 第3步:提交转化")
|
||||||
@@ -84,42 +137,44 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'):
|
|||||||
"formula_mode": "dollar",
|
"formula_mode": "dollar",
|
||||||
"filename": "output"
|
"filename": "output"
|
||||||
}
|
}
|
||||||
res = requests.post(
|
res = make_request(
|
||||||
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse',
|
"POST",
|
||||||
|
"https://v2.doc2x.noedgeai.com/api/v2/convert/parse",
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key},
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
json=data
|
json=data,
|
||||||
|
timeout=15,
|
||||||
)
|
)
|
||||||
if res.status_code == 200:
|
doc2x_api_response_status(res, uid=f"uid: {uuid}")
|
||||||
res_json = res.json()
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
|
||||||
|
|
||||||
|
|
||||||
# < ------ 第4步:等待结果 ------ >
|
# < ------ 第4步:等待结果 ------ >
|
||||||
logger.info("Doc2x 第4步:等待结果")
|
logger.info("Doc2x 第4步:等待结果")
|
||||||
params = {'uid': uuid}
|
params = {"uid": uuid}
|
||||||
while True:
|
max_attempts = 36
|
||||||
res = requests.get(
|
attempt = 0
|
||||||
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result',
|
while attempt < max_attempts:
|
||||||
|
res = make_request(
|
||||||
|
"GET",
|
||||||
|
"https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result",
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key},
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
params=params
|
params=params,
|
||||||
|
timeout=15,
|
||||||
)
|
)
|
||||||
res_json = res.json()
|
res_data = doc2x_api_response_status(res, uid=f"uid: {uuid}")
|
||||||
if res_json['data']['status'] == "success":
|
if res_data["status"] == "success":
|
||||||
break
|
break
|
||||||
elif res_json['data']['status'] == "processing":
|
elif res_data["status"] == "processing":
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
logger.info(f"Doc2x still processing")
|
logger.info("Doc2x still processing to convert file")
|
||||||
elif res_json['data']['status'] == "failed":
|
attempt += 1
|
||||||
raise RuntimeError(f"Doc2x return an error: {res_json}")
|
if attempt >= max_attempts:
|
||||||
|
raise RuntimeError("Doc2x conversion timeout after maximum attempts")
|
||||||
|
|
||||||
# < ------ 第5步:最后的处理 ------ >
|
# < ------ 第5步:最后的处理 ------ >
|
||||||
logger.info("Doc2x 第5步:最后的处理")
|
logger.info("Doc2x 第5步:下载转换后的文件")
|
||||||
|
|
||||||
if format=='tex':
|
if format == "tex":
|
||||||
target_path = latex_dir
|
target_path = latex_dir
|
||||||
if format=='md':
|
if format == "md":
|
||||||
target_path = markdown_dir
|
target_path = markdown_dir
|
||||||
os.makedirs(target_path, exist_ok=True)
|
os.makedirs(target_path, exist_ok=True)
|
||||||
|
|
||||||
@@ -127,17 +182,18 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'):
|
|||||||
# < ------ 下载 ------ >
|
# < ------ 下载 ------ >
|
||||||
for attempt in range(max_attempt):
|
for attempt in range(max_attempt):
|
||||||
try:
|
try:
|
||||||
result_url = res_json['data']['url']
|
result_url = res_data["url"]
|
||||||
res = requests.get(result_url)
|
res = make_request("GET", result_url, timeout=60)
|
||||||
zip_path = os.path.join(target_path, gen_time_str() + '.zip')
|
zip_path = os.path.join(target_path, gen_time_str() + ".zip")
|
||||||
unzip_path = os.path.join(target_path, gen_time_str())
|
unzip_path = os.path.join(target_path, gen_time_str())
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
with open(zip_path, "wb") as f: f.write(res.content)
|
with open(zip_path, "wb") as f:
|
||||||
|
f.write(res.content)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if attempt < max_attempt - 1:
|
if attempt < max_attempt - 1:
|
||||||
logger.error(f"Failed to download latex file, retrying... {e}")
|
logger.error(f"Failed to download uid = {uuid} file, retrying... {e}")
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
@@ -145,22 +201,31 @@ def 解析PDF_DOC2X(pdf_file_path, format='tex'):
|
|||||||
|
|
||||||
# < ------ 解压 ------ >
|
# < ------ 解压 ------ >
|
||||||
import zipfile
|
import zipfile
|
||||||
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||||
zip_ref.extractall(unzip_path)
|
zip_ref.extractall(unzip_path)
|
||||||
return zip_path, unzip_path
|
return zip_path, unzip_path
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
|
def 解析PDF_DOC2X_单文件(
|
||||||
|
fp,
|
||||||
|
project_folder,
|
||||||
|
llm_kwargs,
|
||||||
|
plugin_kwargs,
|
||||||
|
chatbot,
|
||||||
|
history,
|
||||||
|
system_prompt,
|
||||||
|
DOC2X_API_KEY,
|
||||||
|
user_request,
|
||||||
|
):
|
||||||
def pdf2markdown(filepath):
|
def pdf2markdown(filepath):
|
||||||
chatbot.append((None, f"Doc2x 解析中"))
|
chatbot.append((None, f"Doc2x 解析中"))
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md')
|
md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format="md")
|
||||||
|
|
||||||
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
|
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
|
||||||
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
|
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
return md_zip_path
|
return md_zip_path
|
||||||
|
|
||||||
def deliver_to_markdown_plugin(md_zip_path, user_request):
|
def deliver_to_markdown_plugin(md_zip_path, user_request):
|
||||||
@@ -174,77 +239,97 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
os.makedirs(target_path_base, exist_ok=True)
|
os.makedirs(target_path_base, exist_ok=True)
|
||||||
shutil.copyfile(md_zip_path, this_file_path)
|
shutil.copyfile(md_zip_path, this_file_path)
|
||||||
ex_folder = this_file_path + ".extract"
|
ex_folder = this_file_path + ".extract"
|
||||||
extract_archive(
|
extract_archive(file_path=this_file_path, dest_dir=ex_folder)
|
||||||
file_path=this_file_path, dest_dir=ex_folder
|
|
||||||
)
|
|
||||||
|
|
||||||
# edit markdown files
|
# edit markdown files
|
||||||
success, file_manifest, project_folder = get_files_from_everything(ex_folder, type='.md')
|
success, file_manifest, project_folder = get_files_from_everything(
|
||||||
|
ex_folder, type=".md"
|
||||||
|
)
|
||||||
for generated_fp in file_manifest:
|
for generated_fp in file_manifest:
|
||||||
# 修正一些公式问题
|
# 修正一些公式问题
|
||||||
with open(generated_fp, 'r', encoding='utf8') as f:
|
with open(generated_fp, "r", encoding="utf8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
# 将公式中的\[ \]替换成$$
|
# 将公式中的\[ \]替换成$$
|
||||||
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
|
content = content.replace(r"\[", r"$$").replace(r"\]", r"$$")
|
||||||
# 将公式中的\( \)替换成$
|
# 将公式中的\( \)替换成$
|
||||||
content = content.replace(r'\(', r'$').replace(r'\)', r'$')
|
content = content.replace(r"\(", r"$").replace(r"\)", r"$")
|
||||||
content = content.replace('```markdown', '\n').replace('```', '\n')
|
content = content.replace("```markdown", "\n").replace("```", "\n")
|
||||||
with open(generated_fp, 'w', encoding='utf8') as f:
|
with open(generated_fp, "w", encoding="utf8") as f:
|
||||||
f.write(content)
|
f.write(content)
|
||||||
promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
|
promote_file_to_downloadzone(generated_fp, chatbot=chatbot)
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
# 生成在线预览html
|
# 生成在线预览html
|
||||||
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
|
file_name = "在线预览翻译(原文)" + gen_time_str() + ".html"
|
||||||
preview_fp = os.path.join(ex_folder, file_name)
|
preview_fp = os.path.join(ex_folder, file_name)
|
||||||
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
from shared_utils.advanced_markdown_format import (
|
||||||
|
markdown_convertion_for_file,
|
||||||
|
)
|
||||||
|
|
||||||
with open(generated_fp, "r", encoding="utf-8") as f:
|
with open(generated_fp, "r", encoding="utf-8") as f:
|
||||||
md = f.read()
|
md = f.read()
|
||||||
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
||||||
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
|
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
|
||||||
html = markdown_convertion_for_file(md)
|
html = markdown_convertion_for_file(md)
|
||||||
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
|
with open(preview_fp, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html)
|
||||||
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
|
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
|
||||||
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
|
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
|
||||||
plugin_kwargs['markdown_expected_output_dir'] = ex_folder
|
plugin_kwargs["markdown_expected_output_dir"] = ex_folder
|
||||||
|
|
||||||
translated_f_name = 'translated_markdown.md'
|
translated_f_name = "translated_markdown.md"
|
||||||
generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name)
|
generated_fp = plugin_kwargs["markdown_expected_output_path"] = os.path.join(
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
ex_folder, translated_f_name
|
||||||
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
)
|
||||||
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
yield from Markdown英译中(
|
||||||
|
ex_folder,
|
||||||
|
llm_kwargs,
|
||||||
|
plugin_kwargs,
|
||||||
|
chatbot,
|
||||||
|
history,
|
||||||
|
system_prompt,
|
||||||
|
user_request,
|
||||||
|
)
|
||||||
if os.path.exists(generated_fp):
|
if os.path.exists(generated_fp):
|
||||||
# 修正一些公式问题
|
# 修正一些公式问题
|
||||||
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
|
with open(generated_fp, "r", encoding="utf8") as f:
|
||||||
content = content.replace('```markdown', '\n').replace('```', '\n')
|
content = f.read()
|
||||||
|
content = content.replace("```markdown", "\n").replace("```", "\n")
|
||||||
# Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
# Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
||||||
# content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
|
# content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
|
||||||
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
|
with open(generated_fp, "w", encoding="utf8") as f:
|
||||||
|
f.write(content)
|
||||||
# 生成在线预览html
|
# 生成在线预览html
|
||||||
file_name = '在线预览翻译' + gen_time_str() + '.html'
|
file_name = "在线预览翻译" + gen_time_str() + ".html"
|
||||||
preview_fp = os.path.join(ex_folder, file_name)
|
preview_fp = os.path.join(ex_folder, file_name)
|
||||||
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
from shared_utils.advanced_markdown_format import (
|
||||||
|
markdown_convertion_for_file,
|
||||||
|
)
|
||||||
|
|
||||||
with open(generated_fp, "r", encoding="utf-8") as f:
|
with open(generated_fp, "r", encoding="utf-8") as f:
|
||||||
md = f.read()
|
md = f.read()
|
||||||
html = markdown_convertion_for_file(md)
|
html = markdown_convertion_for_file(md)
|
||||||
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
|
with open(preview_fp, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html)
|
||||||
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
|
||||||
# 生成包含图片的压缩包
|
# 生成包含图片的压缩包
|
||||||
dest_folder = get_log_folder(chatbot.get_user())
|
dest_folder = get_log_folder(chatbot.get_user())
|
||||||
zip_name = '翻译后的带图文档.zip'
|
zip_name = "翻译后的带图文档.zip"
|
||||||
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
|
zip_folder(
|
||||||
|
source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name
|
||||||
|
)
|
||||||
zip_fp = os.path.join(dest_folder, zip_name)
|
zip_fp = os.path.join(dest_folder, zip_name)
|
||||||
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
|
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
md_zip_path = yield from pdf2markdown(fp)
|
md_zip_path = yield from pdf2markdown(fp)
|
||||||
yield from deliver_to_markdown_plugin(md_zip_path, user_request)
|
yield from deliver_to_markdown_plugin(md_zip_path, user_request)
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_基于DOC2X(file_manifest, *args):
    """Run the single-file Doc2x pipeline on every entry of ``file_manifest``, in order.

    This is a generator: everything yielded by ``解析PDF_DOC2X_单文件`` for
    each file is passed through to the caller.

    Args:
        file_manifest: Iterable of PDF file paths.
        *args: Remaining positional arguments, forwarded unchanged to
            ``解析PDF_DOC2X_单文件`` after the file path.
    """
    for fp in file_manifest:
        yield from 解析PDF_DOC2X_单文件(fp, *args)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
2
main.py
2
main.py
@@ -186,7 +186,7 @@ def main():
|
|||||||
define_gui_floating_menu(customize_btns, functional, predefined_btns, cookies, web_cookie_cache)
|
define_gui_floating_menu(customize_btns, functional, predefined_btns, cookies, web_cookie_cache)
|
||||||
|
|
||||||
# 浮动时间线定义
|
# 浮动时间线定义
|
||||||
gr.Spark(label="", value="")
|
gr.Spark()
|
||||||
|
|
||||||
# 插件二级菜单的实现
|
# 插件二级菜单的实现
|
||||||
from themes.gui_advanced_plugin_class import define_gui_advanced_plugin_class
|
from themes.gui_advanced_plugin_class import define_gui_advanced_plugin_class
|
||||||
|
|||||||
Reference in New Issue
Block a user