From 85dbe4a4bf1278e375a53e90776000f7d1843adc Mon Sep 17 00:00:00 2001 From: binary-husky Date: Fri, 7 Jun 2024 15:53:08 +0000 Subject: [PATCH] pdf processing improvement --- .../pdf_fns/parse_pdf_via_doc2x.py | 10 +-- shared_utils/advanced_markdown_format.py | 78 +++++++++++++++---- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index f67e79fe..bae4d951 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -159,10 +159,10 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' preview_fp = os.path.join(ex_folder, file_name) from shared_utils.advanced_markdown_format import markdown_convertion_for_file - with open(generated_fp, "r", encoding="utf-8") as f: - md = f.read() - # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 - md = re.sub(r'^', r'😃
', md, flags=re.MULTILINE) + # with open(generated_fp, "r", encoding="utf-8") as f: + # md = f.read() + # # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 + # md = re.sub(r'^
', r'.
', md, flags=re.MULTILINE) html = markdown_convertion_for_file(md) with open(preview_fp, "w", encoding="utf-8") as f: f.write(html) chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"]) @@ -182,7 +182,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha with open(generated_fp, 'r', encoding='utf8') as f: content = f.read() content = content.replace('```markdown', '\n').replace('```', '\n') # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 - content = re.sub(r'^
', r'😃
', content, flags=re.MULTILINE) + # content = re.sub(r'^
', r'.
', content, flags=re.MULTILINE) with open(generated_fp, 'w', encoding='utf8') as f: f.write(content) # 生成在线预览html file_name = '在线预览翻译' + gen_time_str() + '.html' diff --git a/shared_utils/advanced_markdown_format.py b/shared_utils/advanced_markdown_format.py index 5674e1da..e5295c1d 100644 --- a/shared_utils/advanced_markdown_format.py +++ b/shared_utils/advanced_markdown_format.py @@ -46,6 +46,16 @@ code_highlight_configs_block_mermaid = { }, } + +mathpatterns = { + r"(?(.*?)' txt = fix_markdown_indent(txt) # convert everything to html format - split = markdown.markdown(text="---") convert_stage_1 = markdown.markdown( text=txt, extensions=[ @@ -245,14 +286,25 @@ def markdown_convertion_for_file(txt): ], extension_configs={**markdown_extension_configs, **code_highlight_configs}, ) - convert_stage_1 = markdown_bug_hunt(convert_stage_1) + + + convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1) + def repl_fn(match): + content = match.group(2) + return f'' + + pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]]) + pattern = re.compile(pattern, flags=re.ASCII) + convert_stage_2 = pattern.sub(repl_fn, convert_stage_1) + + convert_stage_4 = markdown_bug_hunt(convert_stage_2) # 2. convert to rendered equation - convert_stage_2_2, n = re.subn( - find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL + convert_stage_5, n = re.subn( + find_equation_pattern, replace_math_render, convert_stage_4, flags=re.DOTALL ) # cat them together - return pre + convert_stage_2_2 + suf + return pre + convert_stage_5 + suf @lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度 def markdown_convertion(txt):