From 85dbe4a4bf1278e375a53e90776000f7d1843adc Mon Sep 17 00:00:00 2001
From: binary-husky <qingxu.fu@outlook.com>
Date: Fri, 7 Jun 2024 15:53:08 +0000
Subject: [PATCH] pdf processing improvement

---
 .../pdf_fns/parse_pdf_via_doc2x.py            | 10 +--
 shared_utils/advanced_markdown_format.py      | 78 +++++++++++++++----
 2 files changed, 70 insertions(+), 18 deletions(-)
diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
index f67e79fe..bae4d951 100644
--- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
+++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
@@ -159,10 +159,10 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
             file_name = '在线预览翻译（原文）' + gen_time_str() + '.html'
             preview_fp = os.path.join(ex_folder, file_name)
             from shared_utils.advanced_markdown_format import markdown_convertion_for_file
-            with open(generated_fp, "r", encoding="utf-8") as f:
-                md = f.read()
-                # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
-                md = re.sub(r'^<table>', r'😃<table>', md, flags=re.MULTILINE)
+            # with open(generated_fp, "r", encoding="utf-8") as f:
+            #     md = f.read()
+            #     # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
+            #     md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
             html = markdown_convertion_for_file(md)
             with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
             chatbot.append([None, f"生成在线预览：{generate_file_link([preview_fp])}"])
@@ -182,7 +182,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
             with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
             content = content.replace('```markdown', '\n').replace('```', '\n')
             # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
-            content = re.sub(r'^<table>', r'😃<table>', content, flags=re.MULTILINE)
+            # content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
             with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
             # 生成在线预览html
             file_name = '在线预览翻译' + gen_time_str() + '.html'
diff --git a/shared_utils/advanced_markdown_format.py b/shared_utils/advanced_markdown_format.py
index 5674e1da..e5295c1d 100644
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@@ -46,6 +46,16 @@ code_highlight_configs_block_mermaid = {
     },
 }
 
+
+mathpatterns = {  # regex -> options; every regex captures (opening delimiter)(formula body)(closing delimiter)
+    r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$ (opening $ must not follow a backslash or another $)
+    r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$ (allow_multi_lines=True: is_equation compiles it with re.DOTALL)
+    r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
+    r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
+    # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
+    # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
+}
+
 def tex2mathml_catch_exception(content, *args, **kwargs):
     try:
         content = tex2mathml(content, *args, **kwargs)
@@ -96,14 +106,7 @@ def is_equation(txt):
         return False
     if "$" not in txt and "\\[" not in txt:
         return False
-    mathpatterns = {
-        r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$
-        r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$
-        r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
-        # r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
-        # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
-        # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
-    }
+
     matches = []
     for pattern, property in mathpatterns.items():
         flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
@@ -207,6 +210,45 @@ def fix_code_segment_indent(txt):
         return txt
 
 
+def fix_dollar_sticking_bug(txt):
+    """
+    Insert a space between inline formulas whose dollar delimiters touch.
+
+    The text is scanned left to right while tracking whether the scan is inside
+    an open ``$...$`` or ``$$...$$`` span. A ``$$`` met while a ``$...$`` span is
+    open is read as a closing ``$`` directly followed by an opening ``$``, so one
+    space is placed between them: ``$a$$b$`` becomes ``$a$ $b$``.
+
+    Args:
+        txt: Text to scan; backslash-escaped dollars get no special treatment.
+
+    Returns:
+        ``txt`` with those spaces inserted; every other character is unchanged.
+    """
+    pieces = []  # joined once at the end instead of repeated string +=
+    in_single = False  # True while a $...$ span is open
+    in_double = False  # True while a $$...$$ span is open
+    pos = 0  # start of the part of txt not yet copied into pieces
+    while True:
+        index = txt.find('$', pos)
+        if index == -1:
+            pieces.append(txt[pos:])
+            return "".join(pieces)
+        # startswith() is bounds-safe: a '$' ending txt used to raise IndexError.
+        is_double = txt.startswith('$$', index)
+        if is_double and not in_single:
+            in_double = not in_double
+            end = index + 2
+        else:
+            if in_double:
+                print('Fatal')  # lone '$' inside $$...$$; scanning continues anyway
+            in_single = not in_single
+            end = index + 1
+        # Reaching the one-'$' branch on a '$$' means it closed a $...$ span: pad it.
+        pieces.append(txt[pos:end] + (" " if is_double and end == index + 1 else ""))
+        pos = end
+
+
 def markdown_convertion_for_file(txt):
     """
     将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
@@ -233,7 +275,6 @@ def markdown_convertion_for_file(txt):
     find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
     txt = fix_markdown_indent(txt)
     # convert everything to html format
-    split = markdown.markdown(text="---")
     convert_stage_1 = markdown.markdown(
         text=txt,
         extensions=[
@@ -245,14 +286,25 @@ def markdown_convertion_for_file(txt):
         ],
         extension_configs={**markdown_extension_configs, **code_highlight_configs},
     )
-    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
+
+
+    convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1)
+    def repl_fn(match):
+        # Alternatives are joined with "|" and each owns 3 groups; the formula body is the middle group of the triple that matched.
+        return f'<script type="math/tex">{match.group(match.lastindex - 1)}</script>'
+
+    pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
+    pattern = re.compile(pattern, flags=re.ASCII)
+    convert_stage_2 = pattern.sub(repl_fn, convert_stage_1)
+
+    convert_stage_4 = markdown_bug_hunt(convert_stage_2)
 
     # 2. convert to rendered equation
-    convert_stage_2_2, n = re.subn(
-        find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+    convert_stage_5, n = re.subn(
+        find_equation_pattern, replace_math_render, convert_stage_4, flags=re.DOTALL
     )
     # cat them together
-    return pre + convert_stage_2_2 + suf
+    return pre + convert_stage_5 + suf
 
 @lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):