update

2024-02-25 22:16:46 +08:00
parent 47289f863d
commit d0703ef32d
96 changed files with 7507 additions and 2453 deletions
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@@ -4,62 +4,47 @@ import os
 import math
 from textwrap import dedent
 from functools import lru_cache
-from pymdownx.superfences import fence_div_format, fence_code_format
+from pymdownx.superfences import fence_code_format
 from latex2mathml.converter import convert as tex2mathml
 from shared_utils.config_loader import get_conf as get_conf
-
-pj = os.path.join
-default_user_name = 'default_user'
+from shared_utils.text_mask import apply_gpt_academic_string_mask

 markdown_extension_configs = {
-    'mdx_math': {
-        'enable_dollar_delimiter': True,
-        'use_gitlab_delimiters': False,
+    "mdx_math": {
+        "enable_dollar_delimiter": True,
+        "use_gitlab_delimiters": False,
    },
 }

 code_highlight_configs = {
    "pymdownx.superfences": {
-        'css_class': 'codehilite',
+        "css_class": "codehilite",
        "custom_fences": [
-            {
-                'name': 'mermaid',
-                'class': 'mermaid',
-                'format': fence_code_format
-            }
-        ]
+            {"name": "mermaid", "class": "mermaid", "format": fence_code_format}
+        ],
    },
    "pymdownx.highlight": {
-        'css_class': 'codehilite',
-        'guess_lang': True,
+        "css_class": "codehilite",
+        "guess_lang": True,
        # 'auto_title': True,
        # 'linenums': True
-    }
+    },
 }

-def text_divide_paragraph(text):
-    """
-    将文本按照段落分隔符分割开，生成带有段落标签的HTML代码。
-    """
-    pre = '<div class="markdown-body">'
-    suf = '</div>'
-    if text.startswith(pre) and text.endswith(suf):
-        return text
-
-    if '```' in text:
-        # careful input
-        return text
-    elif '</div>' in text:
-        # careful input
-        return text
-    else:
-        # whatever input
-        lines = text.split("\n")
-        for i, line in enumerate(lines):
-            lines[i] = lines[i].replace(" ", "&nbsp;")
-        text = "</br>".join(lines)
-        return pre + text + suf
-
+code_highlight_configs_block_mermaid = {
+    "pymdownx.superfences": {
+        "css_class": "codehilite",
+        # "custom_fences": [
+        #     {"name": "mermaid", "class": "mermaid", "format": fence_code_format}
+        # ],
+    },
+    "pymdownx.highlight": {
+        "css_class": "codehilite",
+        "guess_lang": True,
+        # 'auto_title': True,
+        # 'linenums': True
+    },
+}

 def tex2mathml_catch_exception(content, *args, **kwargs):
    try:
@@ -71,20 +56,20 @@ def tex2mathml_catch_exception(content, *args, **kwargs):

 def replace_math_no_render(match):
    content = match.group(1)
-    if 'mode=display' in match.group(0):
-        content = content.replace('\n', '</br>')
-        return f"<font color=\"#00FF00\">$$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$$</font>"
+    if "mode=display" in match.group(0):
+        content = content.replace("\n", "</br>")
+        return f'<font color="#00FF00">$$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$$</font>'
    else:
-        return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"
+        return f'<font color="#00FF00">$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$</font>'


 def replace_math_render(match):
    content = match.group(1)
-    if 'mode=display' in match.group(0):
-        if '\\begin{aligned}' in content:
-            content = content.replace('\\begin{aligned}', '\\begin{array}')
-            content = content.replace('\\end{aligned}', '\\end{array}')
-            content = content.replace('&', ' ')
+    if "mode=display" in match.group(0):
+        if "\\begin{aligned}" in content:
+            content = content.replace("\\begin{aligned}", "\\begin{array}")
+            content = content.replace("\\end{aligned}", "\\end{array}")
+            content = content.replace("&", " ")
        content = tex2mathml_catch_exception(content, display="block")
        return content
    else:
@@ -95,9 +80,11 @@ def markdown_bug_hunt(content):
    """
    解决一个mdx_math的bug（单$包裹begin命令时多余<script>）
    """
-    content = content.replace('<script type="math/tex">\n<script type="math/tex; mode=display">',
-                                '<script type="math/tex; mode=display">')
-    content = content.replace('</script>\n</script>', '</script>')
+    content = content.replace(
+        '<script type="math/tex">\n<script type="math/tex; mode=display">',
+        '<script type="math/tex; mode=display">',
+    )
+    content = content.replace("</script>\n</script>", "</script>")
    return content


@@ -105,25 +92,29 @@ def is_equation(txt):
    """
    判定是否为公式 | 测试1 写出洛伦兹定律，使用tex格式公式 测试2 给出柯西不等式，使用latex格式 测试3 写出麦克斯韦方程组
    """
-    if '```' in txt and '```reference' not in txt: return False
-    if '$' not in txt and '\\[' not in txt: return False
+    if "```" in txt and "```reference" not in txt:
+        return False
+    if "$" not in txt and "\\[" not in txt:
+        return False
    mathpatterns = {
-        r'(?<!\\|\$)(\$)([^\$]+)(\$)': {'allow_multi_lines': False},                       #  $...$
-        r'(?<!\\)(\$\$)([^\$]+)(\$\$)': {'allow_multi_lines': True},                       # $$...$$
-        r'(?<!\\)(\\\[)(.+?)(\\\])': {'allow_multi_lines': False},                         # \[...\]
+        r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$
+        r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$
+        r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
        # r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
        # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
        # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
    }
    matches = []
    for pattern, property in mathpatterns.items():
-        flags = re.ASCII | re.DOTALL if property['allow_multi_lines'] else re.ASCII
+        flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
        matches.extend(re.findall(pattern, txt, flags))
-    if len(matches) == 0: return False
+    if len(matches) == 0:
+        return False
    contain_any_eq = False
-    illegal_pattern = re.compile(r'[^\x00-\x7F]|echo')
+    illegal_pattern = re.compile(r"[^\x00-\x7F]|echo")
    for match in matches:
-        if len(match) != 3: return False
+        if len(match) != 3:
+            return False
        eq_canidate = match[1]
        if illegal_pattern.search(eq_canidate):
            return False
@@ -134,27 +125,28 @@ def is_equation(txt):

 def fix_markdown_indent(txt):
    # fix markdown indent
-    if (' - ' not in txt) or ('. ' not in txt):
+    if (" - " not in txt) or (". " not in txt):
        # do not need to fix, fast escape
        return txt
    # walk through the lines and fix non-standard indentation
    lines = txt.split("\n")
-    pattern = re.compile(r'^\s+-')
+    pattern = re.compile(r"^\s+-")
    activated = False
    for i, line in enumerate(lines):
-        if line.startswith('- ') or line.startswith('1. '):
+        if line.startswith("- ") or line.startswith("1. "):
            activated = True
        if activated and pattern.match(line):
            stripped_string = line.lstrip()
            num_spaces = len(line) - len(stripped_string)
            if (num_spaces % 4) == 3:
                num_spaces_should_be = math.ceil(num_spaces / 4) * 4
-                lines[i] = ' ' * num_spaces_should_be + stripped_string
-    return '\n'.join(lines)
+                lines[i] = " " * num_spaces_should_be + stripped_string
+    return "\n".join(lines)


 FENCED_BLOCK_RE = re.compile(
-    dedent(r'''
+    dedent(
+        r"""
        (?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]*                      # opening fence
        ((\{(?P<attrs>[^\}\n]*)\})|                              # (optional {attrs} or
        (\.?(?P<lang>[\w#.+-]*)[ ]*)?                            # optional (.)lang
@@ -162,16 +154,17 @@ FENCED_BLOCK_RE = re.compile(
        \n                                                       # newline (end of opening fence)
        (?P<code>.*?)(?<=\n)                                     # the code block
        (?P=fence)[ ]*$                                          # closing fence
-    '''),
-    re.MULTILINE | re.DOTALL | re.VERBOSE
+    """
+    ),
+    re.MULTILINE | re.DOTALL | re.VERBOSE,
 )


 def get_line_range(re_match_obj, txt):
    start_pos, end_pos = re_match_obj.regs[0]
-    num_newlines_before = txt[:start_pos+1].count('\n')
+    num_newlines_before = txt[: start_pos + 1].count("\n")
    line_start = num_newlines_before
-    line_end = num_newlines_before + txt[start_pos:end_pos].count('\n')+1
+    line_end = num_newlines_before + txt[start_pos:end_pos].count("\n") + 1
    return line_start, line_end


@@ -181,14 +174,16 @@ def fix_code_segment_indent(txt):
    txt_tmp = txt
    while True:
        re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
-        if not re_match_obj: break
-        if len(lines) == 0: lines = txt.split("\n")
-        
+        if not re_match_obj:
+            break
+        if len(lines) == 0:
+            lines = txt.split("\n")
+
        # 清空 txt_tmp 对应的位置方便下次搜索
        start_pos, end_pos = re_match_obj.regs[0]
-        txt_tmp = txt_tmp[:start_pos] + ' '*(end_pos-start_pos) + txt_tmp[end_pos:]
+        txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:]
        line_start, line_end = get_line_range(re_match_obj, txt)
-        
+
        # 获取公共缩进
        shared_indent_cnt = 1e5
        for i in range(line_start, line_end):
@@ -202,26 +197,26 @@ def fix_code_segment_indent(txt):
            num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4
            for i in range(line_start, line_end):
                add_n = num_spaces_should_be - shared_indent_cnt
-                lines[i] = ' ' * add_n + lines[i]
-            if not change_any: # 遇到第一个
+                lines[i] = " " * add_n + lines[i]
+            if not change_any:  # 遇到第一个
                change_any = True

    if change_any:
-        return '\n'.join(lines)
+        return "\n".join(lines)
    else:
        return txt
-    
-    
-@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
+
+
+@lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):
    """
    将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
    """
    pre = '<div class="markdown-body">'
-    suf = '</div>'
+    suf = "</div>"
    if txt.startswith(pre) and txt.endswith(suf):
        # print('警告，输入了已经经过转化的字符串，二次转化可能出问题')
-        return txt # 已经被转化过，不需要再次转化
+        return txt  # 已经被转化过，不需要再次转化

    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'

@@ -229,18 +224,47 @@ def markdown_convertion(txt):
    # txt = fix_code_segment_indent(txt)
    if is_equation(txt):  # 有$标识的公式符号，且没有代码段```的标识
        # convert everything to html format
-        split = markdown.markdown(text='---')
-        convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences', 'pymdownx.highlight'],
-                                            extension_configs={**markdown_extension_configs, **code_highlight_configs})
+        split = markdown.markdown(text="---")
+        convert_stage_1 = markdown.markdown(
+            text=txt,
+            extensions=[
+                "sane_lists",
+                "tables",
+                "mdx_math",
+                "pymdownx.superfences",
+                "pymdownx.highlight",
+            ],
+            extension_configs={**markdown_extension_configs, **code_highlight_configs},
+        )
        convert_stage_1 = markdown_bug_hunt(convert_stage_1)
        # 1. convert to easy-to-copy tex (do not render math)
-        convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
+        convert_stage_2_1, n = re.subn(
+            find_equation_pattern,
+            replace_math_no_render,
+            convert_stage_1,
+            flags=re.DOTALL,
+        )
        # 2. convert to rendered equation
-        convert_stage_2_2, n = re.subn(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL)
+        convert_stage_2_2, n = re.subn(
+            find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+        )
        # cat them together
-        return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
+        return pre + convert_stage_2_1 + f"{split}" + convert_stage_2_2 + suf
    else:
-        return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'pymdownx.highlight'], extension_configs=code_highlight_configs) + suf
+        return (
+            pre
+            + markdown.markdown(
+                txt,
+                extensions=[
+                    "sane_lists",
+                    "tables",
+                    "pymdownx.superfences",
+                    "pymdownx.highlight",
+                ],
+                extension_configs=code_highlight_configs,
+            )
+            + suf
+        )


 def close_up_code_segment_during_stream(gpt_reply):
@@ -254,20 +278,67 @@ def close_up_code_segment_during_stream(gpt_reply):
        str: 返回一个新的字符串，将输出代码片段的“后面的```”补上。

    """
-    if '```' not in gpt_reply:
+    if "```" not in gpt_reply:
        return gpt_reply
-    if gpt_reply.endswith('```'):
+    if gpt_reply.endswith("```"):
        return gpt_reply

    # 排除了以上两个情况，我们
-    segments = gpt_reply.split('```')
+    segments = gpt_reply.split("```")
    n_mark = len(segments) - 1
    if n_mark % 2 == 1:
-        return gpt_reply + '\n```' # 输出代码片段中！
+        return gpt_reply + "\n```"  # 输出代码片段中！
    else:
        return gpt_reply


+def special_render_issues_for_mermaid(text):
+    # 用不太优雅的方式处理一个core_functional.py中出现的mermaid渲染特例：
+    # 我不希望"总结绘制脑图"prompt中的mermaid渲染出来
+    @lru_cache(maxsize=1)
+    def get_special_case():
+        from core_functional import get_core_functions
+        special_case = get_core_functions()["总结绘制脑图"]["Suffix"]
+        return special_case
+    if text.endswith(get_special_case()): text = text.replace("```mermaid", "```")
+    return text
+
+
+def compat_non_markdown_input(text):
+    """
+    改善非markdown输入的显示效果，例如将空格转换为&nbsp;，将换行符转换为</br>等。
+    """
+    if "```" in text:
+        # careful input：markdown输入
+        text = special_render_issues_for_mermaid(text)  # 处理特殊的渲染问题
+        return text
+    elif "</div>" in text:
+        # careful input：html输入
+        return text
+    else:
+        # whatever input：非markdown输入
+        lines = text.split("\n")
+        for i, line in enumerate(lines):
+            lines[i] = lines[i].replace(" ", "&nbsp;")  # 空格转换为&nbsp;
+        text = "</br>".join(lines)  # 换行符转换为</br>
+        return text
+
+
+@lru_cache(maxsize=128)  # 使用lru缓存
+def simple_markdown_convertion(text):
+    pre = '<div class="markdown-body">'
+    suf = "</div>"
+    if text.startswith(pre) and text.endswith(suf):
+        return text  # 已经被转化过，不需要再次转化
+    text = compat_non_markdown_input(text)    # 兼容非markdown输入
+    text = markdown.markdown(
+        text,
+        extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"],
+        extension_configs=code_highlight_configs,
+    )
+    return pre + text + suf
+
+
 def format_io(self, y):
    """
    将输入和输出解析为HTML格式。将y中最后一项的输入部分段落化，并将输出部分的Markdown和数学公式转换为HTML格式。
@@ -275,13 +346,16 @@ def format_io(self, y):
    if y is None or y == []:
        return []
    i_ask, gpt_reply = y[-1]
-    # 输入部分太自由，预处理一波
-    if i_ask is not None: i_ask = text_divide_paragraph(i_ask)
+    i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render")
+    gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render")
    # 当代码输出半截的时候，试着补上后个```
-    if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply)
-    # process
+    if gpt_reply is not None:
+        gpt_reply = close_up_code_segment_during_stream(gpt_reply)
+    # 处理提问与输出
    y[-1] = (
-        None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables', 'pymdownx.highlight'], extension_configs=code_highlight_configs),
-        None if gpt_reply is None else markdown_convertion(gpt_reply)
+        # 输入部分
+        None if i_ask is None else simple_markdown_convertion(i_ask),
+        # 输出部分
+        None if gpt_reply is None else markdown_convertion(gpt_reply),
    )
    return y
--- a/shared_utils/connect_void_terminal.py
+++ b/shared_utils/connect_void_terminal.py
@@ -52,7 +52,7 @@ def get_plugin_default_kwargs():
    }
    chatbot = ChatBotWithCookies(llm_kwargs)

-    # txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port
+    # txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request
    DEFAULT_FN_GROUPS_kwargs = {
        "main_input": "./README.md",
        "llm_kwargs": llm_kwargs,
@@ -60,7 +60,7 @@ def get_plugin_default_kwargs():
        "chatbot_with_cookie": chatbot,
        "history": [],
        "system_prompt": "You are a good AI.",
-        "web_port": None,
+        "user_request": None,
    }
    return DEFAULT_FN_GROUPS_kwargs

--- a/shared_utils/handle_upload.py
+++ b/shared_utils/handle_upload.py
@@ -0,0 +1,137 @@
+import importlib
+import time
+import inspect
+import re
+import os
+import base64
+import gradio
+import shutil
+import glob
+from shared_utils.config_loader import get_conf
+
+def html_local_file(file):
+    base_path = os.path.dirname(__file__)  # 项目目录
+    if os.path.exists(str(file)):
+        file = f'file={file.replace(base_path, ".")}'
+    return file
+
+
+def html_local_img(__file, layout="left", max_width=None, max_height=None, md=True):
+    style = ""
+    if max_width is not None:
+        style += f"max-width: {max_width};"
+    if max_height is not None:
+        style += f"max-height: {max_height};"
+    __file = html_local_file(__file)
+    a = f'<div align="{layout}"><img src="{__file}" style="{style}"></div>'
+    if md:
+        a = f"![{__file}]({__file})"
+    return a
+
+
+def file_manifest_filter_type(file_list, filter_: list = None):
+    new_list = []
+    if not filter_:
+        filter_ = ["png", "jpg", "jpeg"]
+    for file in file_list:
+        if str(os.path.basename(file)).split(".")[-1] in filter_:
+            new_list.append(html_local_img(file, md=False))
+        else:
+            new_list.append(file)
+    return new_list
+
+
+def zip_extract_member_new(self, member, targetpath, pwd):
+    # 修复中文乱码的问题
+    """Extract the ZipInfo object 'member' to a physical
+        file on the path targetpath.
+    """
+    import zipfile
+    if not isinstance(member, zipfile.ZipInfo):
+        member = self.getinfo(member)
+
+    # build the destination pathname, replacing
+    # forward slashes to platform specific separators.
+    arcname = member.filename.replace('/', os.path.sep)
+    arcname = arcname.encode('cp437', errors='replace').decode('gbk', errors='replace')
+
+    if os.path.altsep:
+        arcname = arcname.replace(os.path.altsep, os.path.sep)
+    # interpret absolute pathname as relative, remove drive letter or
+    # UNC path, redundant separators, "." and ".." components.
+    arcname = os.path.splitdrive(arcname)[1]
+    invalid_path_parts = ('', os.path.curdir, os.path.pardir)
+    arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
+                                if x not in invalid_path_parts)
+    if os.path.sep == '\\':
+        # filter illegal characters on Windows
+        arcname = self._sanitize_windows_name(arcname, os.path.sep)
+
+    targetpath = os.path.join(targetpath, arcname)
+    targetpath = os.path.normpath(targetpath)
+
+    # Create all upper directories if necessary.
+    upperdirs = os.path.dirname(targetpath)
+    if upperdirs and not os.path.exists(upperdirs):
+        os.makedirs(upperdirs)
+
+    if member.is_dir():
+        if not os.path.isdir(targetpath):
+            os.mkdir(targetpath)
+        return targetpath
+
+    with self.open(member, pwd=pwd) as source, \
+            open(targetpath, "wb") as target:
+        shutil.copyfileobj(source, target)
+
+    return targetpath
+
+
+def extract_archive(file_path, dest_dir):
+    import zipfile
+    import tarfile
+    import os
+
+    # Get the file extension of the input file
+    file_extension = os.path.splitext(file_path)[1]
+
+    # Extract the archive based on its extension
+    if file_extension == ".zip":
+        with zipfile.ZipFile(file_path, "r") as zipobj:
+            zipobj._extract_member = lambda a,b,c: zip_extract_member_new(zipobj, a,b,c)    # 修复中文乱码的问题
+            zipobj.extractall(path=dest_dir)
+            print("Successfully extracted zip archive to {}".format(dest_dir))
+
+    elif file_extension in [".tar", ".gz", ".bz2"]:
+        with tarfile.open(file_path, "r:*") as tarobj:
+            tarobj.extractall(path=dest_dir)
+            print("Successfully extracted tar archive to {}".format(dest_dir))
+
+    # 第三方库，需要预先pip install rarfile
+    # 此外，Windows上还需要安装winrar软件，配置其Path环境变量，如"C:\Program Files\WinRAR"才可以
+    elif file_extension == ".rar":
+        try:
+            import rarfile
+
+            with rarfile.RarFile(file_path) as rf:
+                rf.extractall(path=dest_dir)
+                print("Successfully extracted rar archive to {}".format(dest_dir))
+        except:
+            print("Rar format requires additional dependencies to install")
+            return "\n\n解压失败! 需要安装pip install rarfile来解压rar文件。建议：使用zip压缩格式。"
+
+    # 第三方库，需要预先pip install py7zr
+    elif file_extension == ".7z":
+        try:
+            import py7zr
+
+            with py7zr.SevenZipFile(file_path, mode="r") as f:
+                f.extractall(path=dest_dir)
+                print("Successfully extracted 7z archive to {}".format(dest_dir))
+        except:
+            print("7z format requires additional dependencies to install")
+            return "\n\n解压失败! 需要安装pip install py7zr来解压7z文件"
+    else:
+        return ""
+    return ""
+
--- a/shared_utils/key_pattern_manager.py
+++ b/shared_utils/key_pattern_manager.py
@@ -14,7 +14,7 @@ def is_openai_api_key(key):
    if len(CUSTOM_API_KEY_PATTERN) != 0:
        API_MATCH_ORIGINAL = re.match(CUSTOM_API_KEY_PATTERN, key)
    else:
-        API_MATCH_ORIGINAL = re.match(r"sk-[a-zA-Z0-9]{48}$", key)
+        API_MATCH_ORIGINAL = re.match(r"sk-[a-zA-Z0-9]{48}$|sess-[a-zA-Z0-9]{40}$", key)
    return bool(API_MATCH_ORIGINAL)


--- a/shared_utils/text_mask.py
+++ b/shared_utils/text_mask.py
@@ -0,0 +1,107 @@
+import re
+from functools import lru_cache
+
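+# The code below uses Python's re module (the regular-expression library) to define regular-expression patterns.
+# Each pattern is compiled into a regex object and stored in a module-level variable (const_extract_re and const_extract_langbased_re) so that later matching and lookup operations are fast.
+# A short explanation of the special characters used in these patterns:
+# - . matches any single character (a newline only when re.DOTALL is set, which is the case for const_extract_langbased_re but not for const_extract_re).
+# - * means the preceding character may appear zero or more times.
+# - ? is used here for non-greedy matching, i.e. it matches as few characters as possible. In (.*?) it keeps the matched text as short as possible, so matching stops right before the </show_llm> and </show_render> tags.
+# - () parentheses denote a capture group in a regular expression.
+# - In this example, (.*?) captures text of any length up to the nearest delimiter outside the parentheses, i.e. </show_llm> and </show_render>.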
+
+# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+const_extract_re = re.compile(
+    r"<gpt_academic_string_mask><show_llm>(.*?)</show_llm><show_render>(.*?)</show_render></gpt_academic_string_mask>"
+)
+# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+const_extract_langbased_re = re.compile(
+    r"<gpt_academic_string_mask><lang_english>(.*?)</lang_english><lang_chinese>(.*?)</lang_chinese></gpt_academic_string_mask>",
+    flags=re.DOTALL,
+)
+
+@lru_cache(maxsize=128)
+def apply_gpt_academic_string_mask(string, mode="show_all"):
+    """
+    当字符串中有掩码tag时（<gpt_academic_string_mask><show_...>），根据字符串要给谁看（大模型，还是web渲染），对字符串进行处理，返回处理后的字符串
+    示意图：https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg
+    """
+    if "<gpt_academic_string_mask>" not in string: # No need to process
+        return string
+
+    if mode == "show_all":
+        return string
+    if mode == "show_llm":
+        string = const_extract_re.sub(r"\1", string)
+    elif mode == "show_render":
+        string = const_extract_re.sub(r"\2", string)
+    else:
+        raise ValueError("Invalid mode")
+    return string
+
+
+@lru_cache(maxsize=128)
+def build_gpt_academic_masked_string(text_show_llm="", text_show_render=""):
+    """
+    根据字符串要给谁看（大模型，还是web渲染），生成带掩码tag的字符串
+    """
+    return f"<gpt_academic_string_mask><show_llm>{text_show_llm}</show_llm><show_render>{text_show_render}</show_render></gpt_academic_string_mask>"
+
+
+@lru_cache(maxsize=128)
+def apply_gpt_academic_string_mask_langbased(string, lang_reference):
+    """
+    当字符串中有掩码tag时（<gpt_academic_string_mask><lang_...>），根据语言，选择提示词，对字符串进行处理，返回处理后的字符串
+    例如，如果lang_reference是英文，那么就只显示英文提示词，中文提示词就不显示了
+    举例：
+        输入1
+            string = "注意，lang_reference这段文字是：<gpt_academic_string_mask><lang_english>英语</lang_english><lang_chinese>中文</lang_chinese></gpt_academic_string_mask>"
+            lang_reference = "hello world"
+        输出1
+            "注意，lang_reference这段文字是：英语"
+            
+        输入2
+            string = "注意，lang_reference这段文字是中文"   # 注意这里没有掩码tag，所以不会被处理
+            lang_reference = "hello world"
+        输出2
+            "注意，lang_reference这段文字是中文"            # 原样返回
+    """
+
+    if "<gpt_academic_string_mask>" not in string: # No need to process
+        return string
+
+    def contains_chinese(string):
+        chinese_regex = re.compile(u'[\u4e00-\u9fff]+')
+        return chinese_regex.search(string) is not None
+
+    mode = "english" if not contains_chinese(lang_reference) else "chinese"
+    if mode == "english":
+        string = const_extract_langbased_re.sub(r"\1", string)
+    elif mode == "chinese":
+        string = const_extract_langbased_re.sub(r"\2", string)
+    else:
+        raise ValueError("Invalid mode")
+    return string
+
+
+@lru_cache(maxsize=128)
+def build_gpt_academic_masked_string_langbased(text_show_english="", text_show_chinese=""):
+    """
+    根据语言，选择提示词，对字符串进行处理，返回处理后的字符串
+    """
+    return f"<gpt_academic_string_mask><lang_english>{text_show_english}</lang_english><lang_chinese>{text_show_chinese}</lang_chinese></gpt_academic_string_mask>"
+
+
+if __name__ == "__main__":
+    # Test
+    input_string = (
+        "你好\n"
+        + build_gpt_academic_masked_string(text_show_llm="mermaid", text_show_render="")
+        + "你好\n"
+    )
+    print(
+        apply_gpt_academic_string_mask(input_string, "show_llm")
+    )  # Should print the string with 'mermaid' (the show_llm text) in place of the academic mask tags
+    print(
+        apply_gpt_academic_string_mask(input_string, "show_render")
+    )  # Should print the string with the academic mask tags removed (the show_render text is empty)