"version": 3.48
@@ -16,18 +16,9 @@ from toolbox import get_conf, trimmed_format_exc
|
||||
from .bridge_chatgpt import predict_no_ui_long_connection as chatgpt_noui
|
||||
from .bridge_chatgpt import predict as chatgpt_ui
|
||||
|
||||
from .bridge_azure_test import predict_no_ui_long_connection as azure_noui
|
||||
from .bridge_azure_test import predict as azure_ui
|
||||
|
||||
from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
|
||||
from .bridge_chatglm import predict as chatglm_ui
|
||||
|
||||
from .bridge_newbing import predict_no_ui_long_connection as newbing_noui
|
||||
from .bridge_newbing import predict as newbing_ui
|
||||
|
||||
# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui
|
||||
# from .bridge_tgui import predict as tgui_ui
|
||||
|
||||
colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
|
||||
|
||||
class LazyloadTiktoken(object):
|
||||
@@ -51,10 +42,11 @@ class LazyloadTiktoken(object):
|
||||
return encoder.decode(*args, **kwargs)
|
||||
|
||||
# Endpoint 重定向
|
||||
API_URL_REDIRECT, = get_conf("API_URL_REDIRECT")
|
||||
API_URL_REDIRECT, AZURE_ENDPOINT, AZURE_ENGINE = get_conf("API_URL_REDIRECT", "AZURE_ENDPOINT", "AZURE_ENGINE")
|
||||
openai_endpoint = "https://api.openai.com/v1/chat/completions"
|
||||
api2d_endpoint = "https://openai.api2d.net/v1/chat/completions"
|
||||
newbing_endpoint = "wss://sydney.bing.com/sydney/ChatHub"
|
||||
azure_endpoint = AZURE_ENDPOINT + f'openai/deployments/{AZURE_ENGINE}/chat/completions?api-version=2023-05-15'
|
||||
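# Illustrative sketch only: the two config values below are hypothetical placeholders, not project defaults.
_example_AZURE_ENDPOINT = "https://my-resource.openai.azure.com/"      # must end with a trailing slash
_example_AZURE_ENGINE = "my-gpt35-deployment"                          # name of the Azure deployment
_example_url = _example_AZURE_ENDPOINT + f'openai/deployments/{_example_AZURE_ENGINE}/chat/completions?api-version=2023-05-15'
# -> "https://my-resource.openai.azure.com/openai/deployments/my-gpt35-deployment/chat/completions?api-version=2023-05-15"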
# 兼容旧版的配置
|
||||
try:
|
||||
API_URL, = get_conf("API_URL")
|
||||
@@ -76,6 +68,10 @@ get_token_num_gpt35 = lambda txt: len(tokenizer_gpt35.encode(txt, disallowed_spe
|
||||
get_token_num_gpt4 = lambda txt: len(tokenizer_gpt4.encode(txt, disallowed_special=()))
|
||||
|
||||
|
||||
# 开始初始化模型
|
||||
AVAIL_LLM_MODELS, LLM_MODEL = get_conf("AVAIL_LLM_MODELS", "LLM_MODEL")
|
||||
AVAIL_LLM_MODELS = AVAIL_LLM_MODELS + [LLM_MODEL]
|
||||
# -=-=-=-=-=-=- 以下这部分是最早加入的最稳定的模型 -=-=-=-=-=-=-
|
||||
model_info = {
|
||||
# openai
|
||||
"gpt-3.5-turbo": {
|
||||
@@ -124,10 +120,10 @@ model_info = {
|
||||
},
|
||||
|
||||
# azure openai
|
||||
"azure-gpt35":{
|
||||
"fn_with_ui": azure_ui,
|
||||
"fn_without_ui": azure_noui,
|
||||
"endpoint": get_conf("AZURE_ENDPOINT"),
|
||||
"azure-gpt-3.5":{
|
||||
"fn_with_ui": chatgpt_ui,
|
||||
"fn_without_ui": chatgpt_noui,
|
||||
"endpoint": azure_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
@@ -169,21 +165,33 @@ model_info = {
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
|
||||
# newbing
|
||||
"newbing": {
|
||||
"fn_with_ui": newbing_ui,
|
||||
"fn_without_ui": newbing_noui,
|
||||
"endpoint": newbing_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
|
||||
AVAIL_LLM_MODELS, = get_conf("AVAIL_LLM_MODELS")
|
||||
# -=-=-=-=-=-=- 以下部分是新加入的模型,可能附带额外依赖 -=-=-=-=-=-=-
|
||||
if "claude-1-100k" in AVAIL_LLM_MODELS or "claude-2" in AVAIL_LLM_MODELS:
|
||||
from .bridge_claude import predict_no_ui_long_connection as claude_noui
|
||||
from .bridge_claude import predict as claude_ui
|
||||
model_info.update({
|
||||
"claude-1-100k": {
|
||||
"fn_with_ui": claude_ui,
|
||||
"fn_without_ui": claude_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 8196,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
})
|
||||
model_info.update({
|
||||
"claude-2": {
|
||||
"fn_with_ui": claude_ui,
|
||||
"fn_without_ui": claude_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 8196,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
})
|
||||
if "jittorllms_rwkv" in AVAIL_LLM_MODELS:
|
||||
from .bridge_jittorllms_rwkv import predict_no_ui_long_connection as rwkv_noui
|
||||
from .bridge_jittorllms_rwkv import predict as rwkv_ui
|
||||
@@ -239,7 +247,6 @@ if "moss" in AVAIL_LLM_MODELS:
|
||||
if "stack-claude" in AVAIL_LLM_MODELS:
|
||||
from .bridge_stackclaude import predict_no_ui_long_connection as claude_noui
|
||||
from .bridge_stackclaude import predict as claude_ui
|
||||
# claude
|
||||
model_info.update({
|
||||
"stack-claude": {
|
||||
"fn_with_ui": claude_ui,
|
||||
@@ -254,7 +261,6 @@ if "newbing-free" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_newbingfree import predict_no_ui_long_connection as newbingfree_noui
|
||||
from .bridge_newbingfree import predict as newbingfree_ui
|
||||
# claude
|
||||
model_info.update({
|
||||
"newbing-free": {
|
||||
"fn_with_ui": newbingfree_ui,
|
||||
@@ -267,6 +273,120 @@ if "newbing-free" in AVAIL_LLM_MODELS:
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "newbing" in AVAIL_LLM_MODELS: # same with newbing-free
|
||||
try:
|
||||
from .bridge_newbingfree import predict_no_ui_long_connection as newbingfree_noui
|
||||
from .bridge_newbingfree import predict as newbingfree_ui
|
||||
model_info.update({
|
||||
"newbing": {
|
||||
"fn_with_ui": newbingfree_ui,
|
||||
"fn_without_ui": newbingfree_noui,
|
||||
"endpoint": newbing_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatglmft" in AVAIL_LLM_MODELS: # same with newbing-free
|
||||
try:
|
||||
from .bridge_chatglmft import predict_no_ui_long_connection as chatglmft_noui
|
||||
from .bridge_chatglmft import predict as chatglmft_ui
|
||||
model_info.update({
|
||||
"chatglmft": {
|
||||
"fn_with_ui": chatglmft_ui,
|
||||
"fn_without_ui": chatglmft_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "internlm" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_internlm import predict_no_ui_long_connection as internlm_noui
|
||||
from .bridge_internlm import predict as internlm_ui
|
||||
model_info.update({
|
||||
"internlm": {
|
||||
"fn_with_ui": internlm_ui,
|
||||
"fn_without_ui": internlm_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatglm_onnx" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui
|
||||
from .bridge_chatglmonnx import predict as chatglm_onnx_ui
|
||||
model_info.update({
|
||||
"chatglm_onnx": {
|
||||
"fn_with_ui": chatglm_onnx_ui,
|
||||
"fn_without_ui": chatglm_onnx_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "qwen" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_qwen import predict_no_ui_long_connection as qwen_noui
|
||||
from .bridge_qwen import predict as qwen_ui
|
||||
model_info.update({
|
||||
"qwen": {
|
||||
"fn_with_ui": qwen_ui,
|
||||
"fn_without_ui": qwen_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatgpt_website" in AVAIL_LLM_MODELS: # 接入一些逆向工程https://github.com/acheong08/ChatGPT-to-API/
|
||||
try:
|
||||
from .bridge_chatgpt_website import predict_no_ui_long_connection as chatgpt_website_noui
|
||||
from .bridge_chatgpt_website import predict as chatgpt_website_ui
|
||||
model_info.update({
|
||||
"chatgpt_website": {
|
||||
"fn_with_ui": chatgpt_website_ui,
|
||||
"fn_without_ui": chatgpt_website_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "spark" in AVAIL_LLM_MODELS: # 讯飞星火认知大模型
|
||||
try:
|
||||
from .bridge_spark import predict_no_ui_long_connection as spark_noui
|
||||
from .bridge_spark import predict as spark_ui
|
||||
model_info.update({
|
||||
"spark": {
|
||||
"fn_with_ui": spark_ui,
|
||||
"fn_without_ui": spark_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
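# Illustrative sketch of the registration pattern used throughout this block; "my-model" and
# .bridge_my_model are hypothetical names, not part of this commit.
if "my-model" in AVAIL_LLM_MODELS:
    try:
        from .bridge_my_model import predict_no_ui_long_connection as my_model_noui   # hypothetical bridge module
        from .bridge_my_model import predict as my_model_ui
        model_info.update({
            "my-model": {
                "fn_with_ui": my_model_ui,        # streaming, UI-facing entry point
                "fn_without_ui": my_model_noui,   # thread-safe entry point without UI
                "endpoint": None,                 # local backends need no HTTP endpoint
                "max_token": 4096,
                "tokenizer": tokenizer_gpt35,     # reused only for token counting
                "token_cnt": get_token_num_gpt35,
            }
        })
    except:
        print(trimmed_format_exc())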
|
||||
|
||||
|
||||
|
||||
def LLM_CATCH_EXCEPTION(f):
|
||||
"""
|
||||
@@ -307,7 +427,8 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, obser
|
||||
method = model_info[model]["fn_without_ui"]
|
||||
return method(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience)
|
||||
else:
|
||||
# 如果同时询问多个大语言模型:
|
||||
|
||||
# 如果同时询问多个大语言模型,这个稍微啰嗦一点,但思路相同,您不必读这个else分支
|
||||
executor = ThreadPoolExecutor(max_workers=4)
|
||||
models = model.split('&')
|
||||
n_model = len(models)
|
||||
@@ -370,6 +491,6 @@ def predict(inputs, llm_kwargs, *args, **kwargs):
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
|
||||
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"]
|
||||
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"] # 如果这里报错,检查config中的AVAIL_LLM_MODELS选项
|
||||
yield from method(inputs, llm_kwargs, *args, **kwargs)
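# Minimal sketch of the dispatch pattern used above; _demo_dispatch and its arguments are illustrative only.
# Every registered backend exposes the same pair of callables, so routing is a plain dict lookup on model_info.
def _demo_dispatch(model_name, inputs, llm_kwargs, history, sys_prompt):
    entry = model_info[model_name]        # KeyError here means the model is missing from AVAIL_LLM_MODELS
    fn = entry["fn_without_ui"]           # thread-safe variant; "fn_with_ui" is the streaming, UI-facing one
    return fn(inputs, llm_kwargs, history, sys_prompt, observe_window=[], console_slience=True)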
|
||||
|
||||
|
||||
@@ -37,15 +37,23 @@ class GetGLMHandle(Process):
|
||||
# 子进程执行
|
||||
# 第一次运行,加载参数
|
||||
retry = 0
|
||||
LOCAL_MODEL_QUANT, device = get_conf('LOCAL_MODEL_QUANT', 'LOCAL_MODEL_DEVICE')
|
||||
|
||||
if LOCAL_MODEL_QUANT == "INT4": # INT4
|
||||
_model_name_ = "THUDM/chatglm2-6b-int4"
|
||||
elif LOCAL_MODEL_QUANT == "INT8": # INT8
|
||||
_model_name_ = "THUDM/chatglm2-6b-int8"
|
||||
else:
|
||||
_model_name_ = "THUDM/chatglm2-6b" # FP16
|
||||
|
||||
while True:
|
||||
try:
|
||||
if self.chatglm_model is None:
|
||||
self.chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
|
||||
device, = get_conf('LOCAL_MODEL_DEVICE')
|
||||
self.chatglm_tokenizer = AutoTokenizer.from_pretrained(_model_name_, trust_remote_code=True)
|
||||
if device=='cpu':
|
||||
self.chatglm_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).float()
|
||||
self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).float()
|
||||
else:
|
||||
self.chatglm_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).half().cuda()
|
||||
self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).half().cuda()
|
||||
self.chatglm_model = self.chatglm_model.eval()
|
||||
break
|
||||
else:
|
||||
@@ -136,11 +144,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
import core_functional
|
||||
importlib.reload(core_functional) # 热更新prompt
|
||||
core_functional = core_functional.get_core_functions()
|
||||
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
||||
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
# 处理历史信息
|
||||
history_feedin = []
|
||||
|
||||
207  request_llm/bridge_chatglmft.py  Normal file
@@ -0,0 +1,207 @@
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
load_message = "ChatGLMFT尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLMFT消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"
|
||||
|
||||
def string_to_options(arguments):
|
||||
import argparse
|
||||
import shlex
|
||||
# Create an argparse.ArgumentParser instance
|
||||
parser = argparse.ArgumentParser()
|
||||
# Add command-line arguments
|
||||
parser.add_argument("--llm_to_learn", type=str, help="LLM model to learn", default="gpt-3.5-turbo")
|
||||
parser.add_argument("--prompt_prefix", type=str, help="Prompt prefix", default='')
|
||||
parser.add_argument("--system_prompt", type=str, help="System prompt", default='')
|
||||
parser.add_argument("--batch", type=int, help="System prompt", default=50)
|
||||
# Parse the arguments
|
||||
args = parser.parse_args(shlex.split(arguments))
|
||||
return args
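# Hypothetical invocation for illustration (the argument string is made up):
# string_to_options('--llm_to_learn gpt-3.5-turbo --batch 32') returns an argparse.Namespace with
# .llm_to_learn == 'gpt-3.5-turbo' and .batch == 32, while --prompt_prefix and --system_prompt keep their '' defaults.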
|
||||
|
||||
|
||||
#################################################################################
|
||||
class GetGLMFTHandle(Process):
|
||||
def __init__(self):
|
||||
super().__init__(daemon=True)
|
||||
self.parent, self.child = Pipe()
|
||||
self.chatglmft_model = None
|
||||
self.chatglmft_tokenizer = None
|
||||
self.info = ""
|
||||
self.success = True
|
||||
self.check_dependency()
|
||||
self.start()
|
||||
self.threadLock = threading.Lock()
|
||||
|
||||
def check_dependency(self):
|
||||
try:
|
||||
import sentencepiece
|
||||
self.info = "依赖检测通过"
|
||||
self.success = True
|
||||
except:
|
||||
self.info = "缺少ChatGLMFT的依赖,如果要使用ChatGLMFT,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_chatglm.txt`安装ChatGLM的依赖。"
|
||||
self.success = False
|
||||
|
||||
def ready(self):
|
||||
return self.chatglmft_model is not None
|
||||
|
||||
def run(self):
|
||||
# 子进程执行
|
||||
# 第一次运行,加载参数
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
if self.chatglmft_model is None:
|
||||
from transformers import AutoConfig
|
||||
import torch
|
||||
# conf = 'request_llm/current_ptune_model.json'
|
||||
# if not os.path.exists(conf): raise RuntimeError('找不到微调模型信息')
|
||||
# with open(conf, 'r', encoding='utf8') as f:
|
||||
# model_args = json.loads(f.read())
|
||||
ChatGLM_PTUNING_CHECKPOINT, = get_conf('ChatGLM_PTUNING_CHECKPOINT')
|
||||
assert os.path.exists(ChatGLM_PTUNING_CHECKPOINT), "找不到微调模型检查点"
|
||||
conf = os.path.join(ChatGLM_PTUNING_CHECKPOINT, "config.json")
|
||||
with open(conf, 'r', encoding='utf8') as f:
|
||||
model_args = json.loads(f.read())
|
||||
if 'model_name_or_path' not in model_args:
|
||||
model_args['model_name_or_path'] = model_args['_name_or_path']
|
||||
self.chatglmft_tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args['model_name_or_path'], trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args['model_name_or_path'], trust_remote_code=True)
|
||||
|
||||
config.pre_seq_len = model_args['pre_seq_len']
|
||||
config.prefix_projection = model_args['prefix_projection']
|
||||
|
||||
print(f"Loading prefix_encoder weight from {ChatGLM_PTUNING_CHECKPOINT}")
|
||||
model = AutoModel.from_pretrained(model_args['model_name_or_path'], config=config, trust_remote_code=True)
|
||||
prefix_state_dict = torch.load(os.path.join(ChatGLM_PTUNING_CHECKPOINT, "pytorch_model.bin"))
|
||||
new_prefix_state_dict = {}
|
||||
for k, v in prefix_state_dict.items():
|
||||
if k.startswith("transformer.prefix_encoder."):
|
||||
new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
|
||||
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
|
||||
|
||||
if model_args['quantization_bit'] is not None:
|
||||
print(f"Quantized to {model_args['quantization_bit']} bit")
|
||||
model = model.quantize(model_args['quantization_bit'])
|
||||
model = model.cuda()
|
||||
if model_args['pre_seq_len'] is not None:
|
||||
# P-tuning v2
|
||||
model.transformer.prefix_encoder.float()
|
||||
self.chatglmft_model = model.eval()
|
||||
|
||||
break
|
||||
else:
|
||||
break
|
||||
except Exception as e:
|
||||
retry += 1
|
||||
if retry > 3:
|
||||
self.child.send('[Local Message] Call ChatGLMFT fail 不能正常加载ChatGLMFT的参数。')
|
||||
raise RuntimeError("不能正常加载ChatGLMFT的参数!")
|
||||
|
||||
while True:
|
||||
# 进入任务等待状态
|
||||
kwargs = self.child.recv()
|
||||
# 收到消息,开始请求
|
||||
try:
|
||||
for response, history in self.chatglmft_model.stream_chat(self.chatglmft_tokenizer, **kwargs):
|
||||
self.child.send(response)
|
||||
# # 中途接收可能的终止指令(如果有的话)
|
||||
# if self.child.poll():
|
||||
# command = self.child.recv()
|
||||
# if command == '[Terminate]': break
|
||||
except:
|
||||
from toolbox import trimmed_format_exc
|
||||
self.child.send('[Local Message] Call ChatGLMFT fail.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
|
||||
# 请求处理结束,开始下一个循环
|
||||
self.child.send('[Finish]')
|
||||
|
||||
def stream_chat(self, **kwargs):
|
||||
# 主进程执行
|
||||
self.threadLock.acquire()
|
||||
self.parent.send(kwargs)
|
||||
while True:
|
||||
res = self.parent.recv()
|
||||
if res != '[Finish]':
|
||||
yield res
|
||||
else:
|
||||
break
|
||||
self.threadLock.release()
|
||||
|
||||
global glmft_handle
|
||||
glmft_handle = None
|
||||
#################################################################################
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
|
||||
"""
|
||||
多线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
global glmft_handle
|
||||
if glmft_handle is None:
|
||||
glmft_handle = GetGLMFTHandle()
|
||||
if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glmft_handle.info
|
||||
if not glmft_handle.success:
|
||||
error = glmft_handle.info
|
||||
glmft_handle = None
|
||||
raise RuntimeError(error)
|
||||
|
||||
# chatglmft 没有 sys_prompt 接口,因此把prompt加入 history
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", sys_prompt])
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
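# Illustration (contents made up): with sys_prompt == "Be brief" and history == ["Q1", "A1", "Q2", "A2"],
# history_feedin becomes [["What can I do?", "Be brief"], ["Q1", "A1"], ["Q2", "A2"]], i.e. the system
# prompt is injected as a fake first exchange because ChatGLMFT exposes no dedicated sys_prompt slot.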
|
||||
|
||||
watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
|
||||
response = ""
|
||||
for response in glmft_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
|
||||
if len(observe_window) >= 1: observe_window[0] = response
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("程序终止。")
|
||||
return response
|
||||
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
单线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
chatbot.append((inputs, ""))
|
||||
|
||||
global glmft_handle
|
||||
if glmft_handle is None:
|
||||
glmft_handle = GetGLMFTHandle()
|
||||
chatbot[-1] = (inputs, load_message + "\n\n" + glmft_handle.info)
|
||||
yield from update_ui(chatbot=chatbot, history=[])
|
||||
if not glmft_handle.success:
|
||||
glmft_handle = None
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
# 处理历史信息
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", system_prompt] )
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
|
||||
|
||||
# 开始接收chatglmft的回复
|
||||
response = "[Local Message]: 等待ChatGLMFT响应中 ..."
|
||||
for response in glmft_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
|
||||
chatbot[-1] = (inputs, response)
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
|
||||
# 总结输出
|
||||
if response == "[Local Message]: 等待ChatGLMFT响应中 ...":
|
||||
response = "[Local Message]: ChatGLMFT响应异常 ..."
|
||||
history.extend([inputs, response])
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
73  request_llm/bridge_chatglmonnx.py  Normal file
@@ -0,0 +1,73 @@
|
||||
model_name = "ChatGLM-ONNX"
|
||||
cmd_to_install = "`pip install -r request_llm/requirements_chatglm_onnx.txt`"
|
||||
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
|
||||
|
||||
from .chatglmoonx import ChatGLMModel, chat_template
|
||||
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
@SingletonLocalLLM
|
||||
class GetONNXGLMHandle(LocalLLMHandle):
|
||||
|
||||
def load_model_info(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
self.model_name = model_name
|
||||
self.cmd_to_install = cmd_to_install
|
||||
|
||||
def load_model_and_tokenizer(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
import os, glob
|
||||
if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件
|
||||
from huggingface_hub import snapshot_download
|
||||
snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8")
|
||||
def create_model():
|
||||
return ChatGLMModel(
|
||||
tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
|
||||
onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
|
||||
)
|
||||
self._model = create_model()
|
||||
return self._model, None
|
||||
|
||||
def llm_stream_generator(self, **kwargs):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
def adaptor(kwargs):
|
||||
query = kwargs['query']
|
||||
max_length = kwargs['max_length']
|
||||
top_p = kwargs['top_p']
|
||||
temperature = kwargs['temperature']
|
||||
history = kwargs['history']
|
||||
return query, max_length, top_p, temperature, history
|
||||
|
||||
query, max_length, top_p, temperature, history = adaptor(kwargs)
|
||||
|
||||
prompt = chat_template(history, query)
|
||||
for answer in self._model.generate_iterate(
|
||||
prompt,
|
||||
max_generated_tokens=max_length,
|
||||
top_k=1,
|
||||
top_p=top_p,
|
||||
temperature=temperature,
|
||||
):
|
||||
yield answer
|
||||
|
||||
def try_to_import_special_deps(self, **kwargs):
|
||||
# import something that will raise error if the user does not install requirement_*.txt
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
pass
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 GPT-Academic Interface
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
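# Hedged sketch of what the factory call above is expected to provide: two entry points whose signatures
# follow the other bridges in this commit, both driving the singleton GetONNXGLMHandle subprocess.
# predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None)
# predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False)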
|
||||
@@ -22,8 +22,8 @@ import importlib
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, is_any_api_key, select_api_key, what_keys, clip_history, trimmed_format_exc
|
||||
proxies, API_KEY, TIMEOUT_SECONDS, MAX_RETRY = \
|
||||
get_conf('proxies', 'API_KEY', 'TIMEOUT_SECONDS', 'MAX_RETRY')
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, API_ORG = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'API_ORG')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
@@ -101,6 +101,8 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
else: raise RuntimeError("意外Json结构:"+delta)
|
||||
if json_data['finish_reason'] == 'content_filter':
|
||||
raise RuntimeError("由于提问含不合规内容被Azure过滤。")
|
||||
if json_data['finish_reason'] == 'length':
|
||||
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
||||
return result
|
||||
@@ -127,11 +129,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
import core_functional
|
||||
importlib.reload(core_functional) # 热更新prompt
|
||||
core_functional = core_functional.get_core_functions()
|
||||
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
||||
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
@@ -172,9 +171,10 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
chunk = next(stream_response)
|
||||
except StopIteration:
|
||||
# 非OpenAI官方接口的出现这样的报错,OpenAI和API2D不会走这里
|
||||
from toolbox import regular_txt_to_markdown; tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 远程返回错误: \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk.decode())}")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="远程返回错误:" + chunk.decode()) # 刷新界面
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="非Openai官方接口返回了错误:" + chunk.decode()) # 刷新界面
|
||||
return
|
||||
|
||||
# print(chunk.decode()[6:])
|
||||
@@ -185,7 +185,7 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
if chunk:
|
||||
try:
|
||||
chunk_decoded = chunk.decode()
|
||||
# 前者API2D的
|
||||
# 前者是API2D的结束条件,后者是OPENAI的结束条件
|
||||
if ('data: [DONE]' in chunk_decoded) or (len(json.loads(chunk_decoded[6:])['choices'][0]["delta"]) == 0):
|
||||
# 判定为数据流的结束,gpt_replying_buffer也写完了
|
||||
logging.info(f'[response] {gpt_replying_buffer}')
|
||||
@@ -198,36 +198,45 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg=status_text) # 刷新界面
|
||||
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json解析不合常规") # 刷新界面
|
||||
chunk = get_full_error(chunk, stream_response)
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务.")
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务.")
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + error_msg) # 刷新界面
|
||||
print(error_msg)
|
||||
return
|
||||
|
||||
def handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg):
|
||||
from .bridge_all import model_info
|
||||
openai_website = ' 请登录OpenAI查看详情 https://platform.openai.com/signup'
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务. " + openai_website)
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务." + openai_website)
|
||||
elif "account is not active" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Your account is not active. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "associated with a deactivated account" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You are associated with a deactivated account. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
return chatbot, history
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
@@ -241,6 +250,8 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
if API_ORG.startswith('org-'): headers.update({"OpenAI-Organization": API_ORG})
|
||||
if llm_kwargs['llm_model'].startswith('azure-'): headers.update({"api-key": api_key})
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
|
||||
297  request_llm/bridge_chatgpt_website.py  Normal file
@@ -0,0 +1,297 @@
|
||||
# 借鉴了 https://github.com/GaiZhenbiao/ChuanhuChatGPT 项目
|
||||
|
||||
"""
|
||||
该文件中主要包含三个函数
|
||||
|
||||
不具备多线程能力的函数:
|
||||
1. predict: 正常对话时使用,具备完备的交互功能,不可多线程
|
||||
|
||||
具备多线程调用能力的函数
|
||||
2. predict_no_ui:高级实验性功能模块调用,不会实时显示在界面上,参数简单,可以多线程并行,方便实现复杂的功能逻辑
|
||||
3. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import gradio as gr
|
||||
import logging
|
||||
import traceback
|
||||
import requests
|
||||
import importlib
|
||||
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, is_any_api_key, select_api_key, what_keys, clip_history, trimmed_format_exc
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, API_ORG = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'API_ORG')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
|
||||
def get_full_error(chunk, stream_response):
|
||||
"""
|
||||
获取完整的从Openai返回的报错
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
chunk += next(stream_response)
|
||||
except:
|
||||
break
|
||||
return chunk
|
||||
|
||||
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=None, console_slience=False):
|
||||
"""
|
||||
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免中途网线被掐。
|
||||
inputs:
|
||||
是本次问询的输入
|
||||
sys_prompt:
|
||||
系统静默prompt
|
||||
llm_kwargs:
|
||||
chatGPT的内部调优参数
|
||||
history:
|
||||
是之前的对话列表
|
||||
observe_window = None:
|
||||
用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可。observe_window[0]:观测窗。observe_window[1]:看门狗
|
||||
"""
|
||||
watch_dog_patience = 5 # 看门狗的耐心, 设置5秒即可
|
||||
headers, payload = generate_payload(inputs, llm_kwargs, history, system_prompt=sys_prompt, stream=True)
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=False
|
||||
from .bridge_all import model_info
|
||||
endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
response = requests.post(endpoint, headers=headers, proxies=proxies,
|
||||
json=payload, stream=True, timeout=TIMEOUT_SECONDS); break
|
||||
except requests.exceptions.ReadTimeout as e:
|
||||
retry += 1
|
||||
traceback.print_exc()
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
if MAX_RETRY!=0: print(f'请求超时,正在重试 ({retry}/{MAX_RETRY}) ……')
|
||||
|
||||
stream_response = response.iter_lines()
|
||||
result = ''
|
||||
while True:
|
||||
try: chunk = next(stream_response).decode()
|
||||
except StopIteration:
|
||||
break
|
||||
except requests.exceptions.ConnectionError:
|
||||
chunk = next(stream_response).decode() # 失败了,重试一次?再失败就没办法了。
|
||||
if len(chunk)==0: continue
|
||||
if not chunk.startswith('data:'):
|
||||
error_msg = get_full_error(chunk.encode('utf8'), stream_response).decode()
|
||||
if "reduce the length" in error_msg:
|
||||
raise ConnectionAbortedError("OpenAI拒绝了请求:" + error_msg)
|
||||
else:
|
||||
raise RuntimeError("OpenAI拒绝了请求:" + error_msg)
|
||||
if ('data: [DONE]' in chunk): break # api2d 正常完成
|
||||
json_data = json.loads(chunk.lstrip('data:'))['choices'][0]
|
||||
delta = json_data["delta"]
|
||||
if len(delta) == 0: break
|
||||
if "role" in delta: continue
|
||||
if "content" in delta:
|
||||
result += delta["content"]
|
||||
if not console_slience: print(delta["content"], end='')
|
||||
if observe_window is not None:
|
||||
# 观测窗,把已经获取的数据显示出去
|
||||
if len(observe_window) >= 1: observe_window[0] += delta["content"]
|
||||
# 看门狗,如果超过期限没有喂狗,则终止
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
else: raise RuntimeError("意外Json结构:"+delta)
|
||||
if json_data['finish_reason'] == 'content_filter':
|
||||
raise RuntimeError("由于提问含不合规内容被Azure过滤。")
|
||||
if json_data['finish_reason'] == 'length':
|
||||
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
||||
return result
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
发送至chatGPT,流式获取输出。
|
||||
用于基础的对话功能。
|
||||
inputs 是本次问询的输入
|
||||
top_p, temperature是chatGPT的内部调优参数
|
||||
history 是之前的对话列表(注意无论是inputs还是history,内容太长了都会触发token数量溢出的错误)
|
||||
chatbot 为WebUI中显示的对话列表,修改它,然后yield出去,可以直接修改对话界面内容
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
if is_any_api_key(inputs):
|
||||
chatbot._cookies['api_key'] = inputs
|
||||
chatbot.append(("输入已识别为openai的api_key", what_keys(inputs)))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api_key已导入") # 刷新界面
|
||||
return
|
||||
elif not is_any_api_key(chatbot._cookies['api_key']):
|
||||
chatbot.append((inputs, "缺少api_key。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。"))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="缺少api_key") # 刷新界面
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
chatbot.append((inputs, ""))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
|
||||
try:
|
||||
headers, payload = generate_payload(inputs, llm_kwargs, history, system_prompt, stream)
|
||||
except RuntimeError as e:
|
||||
chatbot[-1] = (inputs, f"您提供的api-key不满足要求,不包含任何可用于{llm_kwargs['llm_model']}的api-key。您可能选择了错误的模型或请求源。")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api-key不满足要求") # 刷新界面
|
||||
return
|
||||
|
||||
history.append(inputs); history.append("")
|
||||
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=True
|
||||
from .bridge_all import model_info
|
||||
endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
response = requests.post(endpoint, headers=headers, proxies=proxies,
|
||||
json=payload, stream=True, timeout=TIMEOUT_SECONDS);break
|
||||
except:
|
||||
retry += 1
|
||||
chatbot[-1] = ((chatbot[-1][0], timeout_bot_msg))
|
||||
retry_msg = f",正在重试 ({retry}/{MAX_RETRY}) ……" if MAX_RETRY > 0 else ""
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="请求超时"+retry_msg) # 刷新界面
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
|
||||
gpt_replying_buffer = ""
|
||||
|
||||
is_head_of_the_stream = True
|
||||
if stream:
|
||||
stream_response = response.iter_lines()
|
||||
while True:
|
||||
try:
|
||||
chunk = next(stream_response)
|
||||
except StopIteration:
|
||||
# 非OpenAI官方接口的出现这样的报错,OpenAI和API2D不会走这里
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="非Openai官方接口返回了错误:" + chunk.decode()) # 刷新界面
|
||||
return
|
||||
|
||||
# print(chunk.decode()[6:])
|
||||
if is_head_of_the_stream and (r'"object":"error"' not in chunk.decode()):
|
||||
# 数据流的第一帧不携带content
|
||||
is_head_of_the_stream = False; continue
|
||||
|
||||
if chunk:
|
||||
try:
|
||||
chunk_decoded = chunk.decode()
|
||||
# 前者是API2D的结束条件,后者是OPENAI的结束条件
|
||||
if 'data: [DONE]' in chunk_decoded:
|
||||
# 判定为数据流的结束,gpt_replying_buffer也写完了
|
||||
logging.info(f'[response] {gpt_replying_buffer}')
|
||||
break
|
||||
# 处理数据流的主体
|
||||
chunkjson = json.loads(chunk_decoded[6:])
|
||||
status_text = f"finish_reason: {chunkjson['choices'][0]['finish_reason']}"
|
||||
delta = chunkjson['choices'][0]["delta"]
|
||||
if "content" in delta:
|
||||
gpt_replying_buffer = gpt_replying_buffer + delta["content"]
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg=status_text) # 刷新界面
|
||||
except Exception as e:
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json解析不合常规") # 刷新界面
|
||||
chunk = get_full_error(chunk, stream_response)
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + error_msg) # 刷新界面
|
||||
print(error_msg)
|
||||
return
|
||||
|
||||
def handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg):
|
||||
from .bridge_all import model_info
|
||||
openai_website = ' 请登录OpenAI查看详情 https://platform.openai.com/signup'
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务. " + openai_website)
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务." + openai_website)
|
||||
elif "account is not active" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Your account is not active. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "associated with a deactivated account" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You are associated with a deactivated account. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
return chatbot, history
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
"""
|
||||
if not is_any_api_key(llm_kwargs['api_key']):
|
||||
raise AssertionError("你提供了错误的API_KEY。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。")
|
||||
|
||||
api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
if API_ORG.startswith('org-'): headers.update({"OpenAI-Organization": API_ORG})
|
||||
if llm_kwargs['llm_model'].startswith('azure-'): headers.update({"api-key": api_key})
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
messages = [{"role": "system", "content": system_prompt}]
|
||||
if conversation_cnt:
|
||||
for index in range(0, 2*conversation_cnt, 2):
|
||||
what_i_have_asked = {}
|
||||
what_i_have_asked["role"] = "user"
|
||||
what_i_have_asked["content"] = history[index]
|
||||
what_gpt_answer = {}
|
||||
what_gpt_answer["role"] = "assistant"
|
||||
what_gpt_answer["content"] = history[index+1]
|
||||
if what_i_have_asked["content"] != "":
|
||||
if what_gpt_answer["content"] == "": continue
|
||||
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
||||
messages.append(what_i_have_asked)
|
||||
messages.append(what_gpt_answer)
|
||||
else:
|
||||
messages[-1]['content'] = what_gpt_answer['content']
|
||||
|
||||
what_i_ask_now = {}
|
||||
what_i_ask_now["role"] = "user"
|
||||
what_i_ask_now["content"] = inputs
|
||||
messages.append(what_i_ask_now)
|
||||
|
||||
payload = {
|
||||
"model": llm_kwargs['llm_model'].strip('api2d-'),
|
||||
"messages": messages,
|
||||
"temperature": llm_kwargs['temperature'], # 1.0,
|
||||
"top_p": llm_kwargs['top_p'], # 1.0,
|
||||
"n": 1,
|
||||
"stream": stream,
|
||||
"presence_penalty": 0,
|
||||
"frequency_penalty": 0,
|
||||
}
|
||||
try:
|
||||
print(f" {llm_kwargs['llm_model']} : {conversation_cnt} : {inputs[:100]} ..........")
|
||||
except:
|
||||
print('输入中可能存在乱码。')
|
||||
return headers,payload
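# Shape of the returned values, for illustration (the key and sampling values are placeholders):
# headers -> {"Content-Type": "application/json", "Authorization": "Bearer sk-..."}   plus OpenAI-Organization / api-key when configured
# payload -> {"model": "gpt-3.5-turbo", "messages": [...], "temperature": 1.0, "top_p": 1.0,
#             "n": 1, "stream": True, "presence_penalty": 0, "frequency_penalty": 0}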
|
||||
|
||||
|
||||
228  request_llm/bridge_claude.py  Normal file
@@ -0,0 +1,228 @@
|
||||
# 借鉴了 https://github.com/GaiZhenbiao/ChuanhuChatGPT 项目
|
||||
|
||||
"""
|
||||
该文件中主要包含2个函数
|
||||
|
||||
不具备多线程能力的函数:
|
||||
1. predict: 正常对话时使用,具备完备的交互功能,不可多线程
|
||||
|
||||
具备多线程调用能力的函数
|
||||
2. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import gradio as gr
|
||||
import logging
|
||||
import traceback
|
||||
import requests
|
||||
import importlib
|
||||
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, trimmed_format_exc, ProxyNetworkActivate
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, ANTHROPIC_API_KEY = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'ANTHROPIC_API_KEY')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
|
||||
def get_full_error(chunk, stream_response):
|
||||
"""
|
||||
获取完整的从Openai返回的报错
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
chunk += next(stream_response)
|
||||
except:
|
||||
break
|
||||
return chunk
|
||||
|
||||
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=None, console_slience=False):
|
||||
"""
|
||||
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免中途网线被掐。
|
||||
inputs:
|
||||
是本次问询的输入
|
||||
sys_prompt:
|
||||
系统静默prompt
|
||||
llm_kwargs:
|
||||
chatGPT的内部调优参数
|
||||
history:
|
||||
是之前的对话列表
|
||||
observe_window = None:
|
||||
用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可。observe_window[0]:观测窗。observe_window[1]:看门狗
|
||||
"""
|
||||
from anthropic import Anthropic
|
||||
watch_dog_patience = 5 # 看门狗的耐心, 设置5秒即可
|
||||
prompt = generate_payload(inputs, llm_kwargs, history, system_prompt=sys_prompt, stream=True)
|
||||
retry = 0
|
||||
if len(ANTHROPIC_API_KEY) == 0:
|
||||
raise RuntimeError("没有设置ANTHROPIC_API_KEY选项")
|
||||
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=False
|
||||
from .bridge_all import model_info
|
||||
anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
|
||||
# endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
# with ProxyNetworkActivate()
|
||||
stream = anthropic.completions.create(
|
||||
prompt=prompt,
|
||||
max_tokens_to_sample=4096, # The maximum number of tokens to generate before stopping.
|
||||
model=llm_kwargs['llm_model'],
|
||||
stream=True,
|
||||
temperature = llm_kwargs['temperature']
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
retry += 1
|
||||
traceback.print_exc()
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
if MAX_RETRY!=0: print(f'请求超时,正在重试 ({retry}/{MAX_RETRY}) ……')
|
||||
result = ''
|
||||
try:
|
||||
for completion in stream:
|
||||
result += completion.completion
|
||||
if not console_slience: print(completion.completion, end='')
|
||||
if observe_window is not None:
|
||||
# 观测窗,把已经获取的数据显示出去
|
||||
if len(observe_window) >= 1: observe_window[0] += completion.completion
|
||||
# 看门狗,如果超过期限没有喂狗,则终止
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
发送至chatGPT,流式获取输出。
|
||||
用于基础的对话功能。
|
||||
inputs 是本次问询的输入
|
||||
top_p, temperature是chatGPT的内部调优参数
|
||||
history 是之前的对话列表(注意无论是inputs还是history,内容太长了都会触发token数量溢出的错误)
|
||||
chatbot 为WebUI中显示的对话列表,修改它,然后yield出去,可以直接修改对话界面内容
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
from anthropic import Anthropic
|
||||
if len(ANTHROPIC_API_KEY) == 0:
|
||||
chatbot.append((inputs, "没有设置ANTHROPIC_API_KEY"))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
chatbot.append((inputs, ""))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
|
||||
try:
|
||||
prompt = generate_payload(inputs, llm_kwargs, history, system_prompt, stream)
|
||||
except RuntimeError as e:
|
||||
chatbot[-1] = (inputs, f"您提供的api-key不满足要求,不包含任何可用于{llm_kwargs['llm_model']}的api-key。您可能选择了错误的模型或请求源。")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api-key不满足要求") # 刷新界面
|
||||
return
|
||||
|
||||
history.append(inputs); history.append("")
|
||||
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=True
|
||||
from .bridge_all import model_info
|
||||
anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
|
||||
# endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
# with ProxyNetworkActivate()
|
||||
stream = anthropic.completions.create(
|
||||
prompt=prompt,
|
||||
max_tokens_to_sample=4096, # The maximum number of tokens to generate before stopping.
|
||||
model=llm_kwargs['llm_model'],
|
||||
stream=True,
|
||||
temperature = llm_kwargs['temperature']
|
||||
)
|
||||
|
||||
break
|
||||
except:
|
||||
retry += 1
|
||||
chatbot[-1] = ((chatbot[-1][0], timeout_bot_msg))
|
||||
retry_msg = f",正在重试 ({retry}/{MAX_RETRY}) ……" if MAX_RETRY > 0 else ""
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="请求超时"+retry_msg) # 刷新界面
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
|
||||
gpt_replying_buffer = ""
|
||||
|
||||
for completion in stream:
|
||||
try:
|
||||
gpt_replying_buffer = gpt_replying_buffer + completion.completion
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg='正常') # 刷新界面
|
||||
|
||||
except Exception as e:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str}")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + tb_str) # 刷新界面
|
||||
return
|
||||
|
||||
|
||||
|
||||
|
||||
# https://github.com/jtsang4/claude-to-chatgpt/blob/main/claude_to_chatgpt/adapter.py
|
||||
def convert_messages_to_prompt(messages):
|
||||
prompt = ""
|
||||
role_map = {
|
||||
"system": "Human",
|
||||
"user": "Human",
|
||||
"assistant": "Assistant",
|
||||
}
|
||||
for message in messages:
|
||||
role = message["role"]
|
||||
content = message["content"]
|
||||
transformed_role = role_map[role]
|
||||
prompt += f"\n\n{transformed_role.capitalize()}: {content}"
|
||||
prompt += "\n\nAssistant: "
|
||||
return prompt
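# Worked example (messages are made up) of the Claude-style prompt produced above:
# convert_messages_to_prompt([
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello"},
# ])
# == "\n\nHuman: You are a helpful assistant.\n\nHuman: Hello\n\nAssistant: "
# Both "system" and "user" map to the Human turn; only "assistant" becomes the Assistant turn.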
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
"""
|
||||
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
messages = [{"role": "system", "content": system_prompt}]
|
||||
if conversation_cnt:
|
||||
for index in range(0, 2*conversation_cnt, 2):
|
||||
what_i_have_asked = {}
|
||||
what_i_have_asked["role"] = "user"
|
||||
what_i_have_asked["content"] = history[index]
|
||||
what_gpt_answer = {}
|
||||
what_gpt_answer["role"] = "assistant"
|
||||
what_gpt_answer["content"] = history[index+1]
|
||||
if what_i_have_asked["content"] != "":
|
||||
if what_gpt_answer["content"] == "": continue
|
||||
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
||||
messages.append(what_i_have_asked)
|
||||
messages.append(what_gpt_answer)
|
||||
else:
|
||||
messages[-1]['content'] = what_gpt_answer['content']
|
||||
|
||||
what_i_ask_now = {}
|
||||
what_i_ask_now["role"] = "user"
|
||||
what_i_ask_now["content"] = inputs
|
||||
messages.append(what_i_ask_now)
|
||||
prompt = convert_messages_to_prompt(messages)
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
202  request_llm/bridge_internlm.py  Normal file
@@ -0,0 +1,202 @@
|
||||
model_name = "InternLM"
|
||||
cmd_to_install = "`pip install -r request_llm/requirements_chatglm.txt`"
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model Utils
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
def try_to_import_special_deps():
|
||||
import sentencepiece
|
||||
|
||||
def combine_history(prompt, hist):
|
||||
user_prompt = "<|User|>:{user}<eoh>\n"
|
||||
robot_prompt = "<|Bot|>:{robot}<eoa>\n"
|
||||
cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
|
||||
messages = hist
|
||||
total_prompt = ""
|
||||
for message in messages:
|
||||
cur_content = message
|
||||
cur_prompt = user_prompt.replace("{user}", cur_content[0])
|
||||
total_prompt += cur_prompt
|
||||
cur_prompt = robot_prompt.replace("{robot}", cur_content[1])
|
||||
total_prompt += cur_prompt
|
||||
total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt)
|
||||
return total_prompt
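# Worked example (contents made up): combine_history("How are you?", [["Hi", "Hello!"]]) returns
# "<|User|>:Hi<eoh>\n<|Bot|>:Hello!<eoa>\n<|User|>:How are you?<eoh>\n<|Bot|>:"
# Each past exchange is wrapped in <|User|>/<|Bot|> tags and the new query ends with an open <|Bot|>: turn.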
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
@SingletonLocalLLM
|
||||
class GetInternlmHandle(LocalLLMHandle):
|
||||
|
||||
def load_model_info(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
self.model_name = model_name
|
||||
self.cmd_to_install = cmd_to_install
|
||||
|
||||
def try_to_import_special_deps(self, **kwargs):
|
||||
"""
|
||||
import something that will raise error if the user does not install requirement_*.txt
|
||||
"""
|
||||
import sentencepiece
|
||||
|
||||
def load_model_and_tokenizer(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device, = get_conf('LOCAL_MODEL_DEVICE')
|
||||
if self._model is None:
|
||||
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
|
||||
if device=='cpu':
|
||||
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
|
||||
|
||||
model = model.eval()
|
||||
return model, tokenizer

    def llm_stream_generator(self, **kwargs):
        import torch
        import logging
        import copy
        import warnings
        import torch.nn as nn
        from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig

        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor():
            model = self._model
            tokenizer = self._tokenizer
            prompt = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            real_prompt = combine_history(prompt, history)
            return model, tokenizer, real_prompt, max_length, top_p, temperature

        model, tokenizer, prompt, max_length, top_p, temperature = adaptor()
        prefix_allowed_tokens_fn = None
        logits_processor = None
        stopping_criteria = None
        additional_eos_token_id = 103028
        generation_config = None
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 🏃♂️🏃♂️🏃♂️ https://github.com/InternLM/InternLM/blob/efbf5335709a8c8faeac6eaf07193973ff1d56a1/web_demo.py#L25

        inputs = tokenizer([prompt], padding=True, return_tensors="pt")
        input_length = len(inputs["input_ids"][0])
        for k, v in inputs.items():
            inputs[k] = v.cuda()
        input_ids = inputs["input_ids"]
        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
        if generation_config is None:
            generation_config = model.generation_config
        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)
        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        if additional_eos_token_id is not None:
            eos_token_id.append(additional_eos_token_id)
        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None:
            warnings.warn(
                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
                " recommend using `max_new_tokens` to control the maximum length of the generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
            if not has_default_max_length:
                logging.warn(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
                    UserWarning,
                )

        if input_ids_seq_length >= generation_config.max_length:
            input_ids_string = "input_ids"
            logging.warning(
                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_new_tokens`."
            )

        # 2. Set generation parameters if not already defined
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

        logits_processor = model._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            encoder_input_ids=input_ids,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
            logits_processor=logits_processor,
        )

        stopping_criteria = model._get_stopping_criteria(
            generation_config=generation_config, stopping_criteria=stopping_criteria
        )
        logits_warper = model._get_logits_warper(generation_config)

        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
        scores = None
        while True:
            model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token
            outputs = model(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)

            # sample
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            if generation_config.do_sample:
                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                next_tokens = torch.argmax(probs, dim=-1)

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            model_kwargs = model._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=False
            )
            unfinished_sequences = unfinished_sequences.mul((min(next_tokens != i for i in eos_token_id)).long())

            output_token_ids = input_ids[0].cpu().tolist()
            output_token_ids = output_token_ids[input_length:]
            for each_eos_token_id in eos_token_id:
                if output_token_ids[-1] == each_eos_token_id:
                    output_token_ids = output_token_ids[:-1]
            response = tokenizer.decode(output_token_ids)

            yield response
            # stop when each sentence is finished, or if we exceed the maximum length
            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                return


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name)
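A minimal sketch of driving the two exported entry points. The `llm_kwargs` keys mirror what `llm_stream_generator` reads and the `observe_window` convention comes from `local_llm_class.py` below; the concrete values are placeholders and it assumes the InternLM weights can actually be loaded on this machine:

```python
# Hypothetical usage, not part of the commit
import time
from request_llm.bridge_internlm import predict_no_ui_long_connection

llm_kwargs = {'max_length': 2048, 'top_p': 0.8, 'temperature': 0.95}
observe_window = ["", time.time()]   # [0] receives partial text, [1] is the watchdog timestamp
answer = predict_no_ui_long_connection("你好", llm_kwargs, history=[], sys_prompt="", observe_window=observe_window)
print(answer)
```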
@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -224,11 +224,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        yield from update_ui(chatbot=chatbot, history=history)

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []
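These hunks all make the same change: the per-bridge hot-reload of `core_functional` is replaced by one shared call to `handle_core_functionality`. The body of that helper lives in `core_functional.py` and is not part of this diff; a hedged sketch consistent with the inline code it replaces could look like this:

```python
# Hypothetical sketch only -- the real handle_core_functionality is defined in core_functional.py,
# which is not shown in this commit.
def handle_core_functionality(additional_fn, inputs, history, chatbot):
    import importlib
    import core_functional
    importlib.reload(core_functional)    # 热更新prompt
    core_functional = core_functional.get_core_functions()
    if "PreProcess" in core_functional[additional_fn]:
        inputs = core_functional[additional_fn]["PreProcess"](inputs)
    inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
    return inputs, history
```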

@@ -89,9 +89,6 @@ class NewBingHandle(Process):
                if a not in self.local_history:
                    self.local_history.append(a)
                    prompt += a + '\n'
                # if b not in self.local_history:
                #     self.local_history.append(b)
                #     prompt += b + '\n'

            # 问题
            prompt += question
@@ -101,7 +98,7 @@ class NewBingHandle(Process):
            async for final, response in self.newbing_model.ask_stream(
                prompt=question,
                conversation_style=NEWBING_STYLE,     # ["creative", "balanced", "precise"]
                wss_link=endpoint,                    # "wss://sydney.bing.com/sydney/ChatHub"
                wss_link=endpoint,                    # "wss://sydney.bing.com/sydney/ChatHub"
            ):
                if not final:
                    print(response)
@@ -121,14 +118,26 @@ class NewBingHandle(Process):
            self.local_history = []
        if (self.newbing_model is None) or (not self.success):
            # 代理设置
            proxies, = get_conf('proxies')
            proxies, NEWBING_COOKIES = get_conf('proxies', 'NEWBING_COOKIES')
            if proxies is None:
                self.proxies_https = None
            else:
                self.proxies_https = proxies['https']

            if (NEWBING_COOKIES is not None) and len(NEWBING_COOKIES) > 100:
                try:
                    cookies = json.loads(NEWBING_COOKIES)
                except:
                    self.success = False
                    tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
                    self.child.send(f'[Local Message] NEWBING_COOKIES未填写或有格式错误。')
                    self.child.send('[Fail]'); self.child.send('[Finish]')
                    raise RuntimeError(f"NEWBING_COOKIES未填写或有格式错误。")
            else:
                cookies = None

            try:
                self.newbing_model = NewbingChatbot(proxy=self.proxies_https)
                self.newbing_model = NewbingChatbot(proxy=self.proxies_https, cookies=cookies)
            except:
                self.success = False
                tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
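For orientation, `NEWBING_COOKIES` is parsed with `json.loads` here and later consumed by `_ChatHub` (see the hunk further down) as a list of cookie objects read via `cookie['name']` and `cookie['value']`. A purely illustrative shape, with placeholder values, would be:

```python
# Illustration only -- the cookie names/values are placeholders, not real credentials.
import json

NEWBING_COOKIES = '''
[
    {"name": "_U",   "value": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"},
    {"name": "MUID", "value": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
]
'''
cookies = json.loads(NEWBING_COOKIES)
assert all('name' in c and 'value' in c for c in cookies)
```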
@@ -143,7 +152,7 @@ class NewBingHandle(Process):
            asyncio.run(self.async_run())
        except Exception:
            tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
            self.child.send(f'[Local Message] Newbing失败 {tb_str}.')
            self.child.send(f'[Local Message] Newbing 请求失败,报错信息如下. 如果是与网络相关的问题,建议更换代理协议(推荐http)或代理节点 {tb_str}.')
            self.child.send('[Fail]')
            self.child.send('[Finish]')

@@ -151,18 +160,14 @@ class NewBingHandle(Process):
        """
        这个函数运行在主进程
        """
        self.threadLock.acquire()
        self.parent.send(kwargs)    # 发送请求到子进程
        self.threadLock.acquire()   # 获取线程锁
        self.parent.send(kwargs)    # 请求子进程
        while True:
            res = self.parent.recv()                            # 等待newbing回复的片段
            if res == '[Finish]':
                break               # 结束
            elif res == '[Fail]':
                self.success = False
                break
            else:
                yield res           # newbing回复的片段
        self.threadLock.release()
            res = self.parent.recv()                            # 等待newbing回复的片段
            if res == '[Finish]': break                         # 结束
            elif res == '[Fail]': self.success = False; break   # 失败
            else: yield res                                     # newbing回复的片段
        self.threadLock.release()   # 释放线程锁


    """
@@ -219,11 +224,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    history_feedin = []
    for i in range(len(history)//2):
68  request_llm/bridge_qwen.py  Normal file
@@ -0,0 +1,68 @@
model_name = "Qwen"
cmd_to_install = "`pip install -r request_llm/requirements_qwen.txt`"

from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Local Model
# ------------------------------------------------------------------------------------------------------------------------
@SingletonLocalLLM
class GetONNXGLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        import os, glob
        import os
        import platform
        from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

        model_id = 'qwen/Qwen-7B-Chat'
        revision = 'v1.0.1'
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
        # use fp16
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision, trust_remote_code=True, fp16=True).eval()
        model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # 可指定不同的生成长度、top_p等相关超参
        self._model = model

        return self._model, self._tokenizer

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history

        query, max_length, top_p, temperature, history = adaptor(kwargs)

        for response in self._model.chat(self._tokenizer, query, history=history, stream=True):
            yield response

    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise error if the user does not install requirement_*.txt
        # 🏃♂️🏃♂️🏃♂️ 主进程执行
        import importlib
        importlib.import_module('modelscope')


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
49  request_llm/bridge_spark.py  Normal file
@@ -0,0 +1,49 @@

import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe

model_name = '星火认知大模型'

def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
    """
    ⭐多线程方法
    函数的说明请见 request_llm/bridge_all.py
    """
    watch_dog_patience = 5
    response = ""

    from .com_sparkapi import SparkRequestInstance
    sri = SparkRequestInstance()
    for response in sri.generate(inputs, llm_kwargs, history, sys_prompt):
        if len(observe_window) >= 1:
            observe_window[0] = response
        if len(observe_window) >= 2:
            if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
    return response

def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
    """
    ⭐单线程方法
    函数的说明请见 request_llm/bridge_all.py
    """
    chatbot.append((inputs, ""))

    if additional_fn is not None:
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 开始接收回复
    from .com_sparkapi import SparkRequestInstance
    sri = SparkRequestInstance()
    for response in sri.generate(inputs, llm_kwargs, history, system_prompt):
        chatbot[-1] = (inputs, response)
        yield from update_ui(chatbot=chatbot, history=history)

    # 总结输出
    if response == f"[Local Message]: 等待{model_name}响应中 ...":
        response = f"[Local Message]: {model_name}响应异常 ..."
    history.extend([inputs, response])
    yield from update_ui(chatbot=chatbot, history=history)
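A minimal sketch of driving the multi-threaded Spark entry point. The `observe_window` convention ([0] latest text, [1] watchdog timestamp) follows the code above, and `com_sparkapi` only reads `temperature` from `llm_kwargs`; everything else here is a placeholder and assumes `XFYUN_APPID` / `XFYUN_API_SECRET` / `XFYUN_API_KEY` are configured:

```python
# Hypothetical usage, not part of the commit
import time
from request_llm.bridge_spark import predict_no_ui_long_connection

llm_kwargs = {'temperature': 0.5}
observe_window = ["", time.time()]   # the caller refreshes [1] to keep the watchdog alive
reply = predict_no_ui_long_connection("用一句话介绍讯飞星火", llm_kwargs, history=[], sys_prompt="", observe_window=observe_window)
print(reply)
```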
@@ -1,4 +1,4 @@
from .bridge_newbing import preprocess_newbing_out, preprocess_newbing_out_simple
from .bridge_newbingfree import preprocess_newbing_out, preprocess_newbing_out_simple
from multiprocessing import Process, Pipe
from toolbox import update_ui, get_conf, trimmed_format_exc
import threading
@@ -248,14 +248,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]:
            inputs = core_functional[additional_fn]["PreProcess"](
                inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + \
            inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    history_feedin = []
    for i in range(len(history)//2):

@@ -96,11 +96,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
    additional_fn代表点击的哪个按钮,按钮见functional.py
    """
    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    raw_input = "What I would like to say is the following: " + inputs
    history.extend([inputs, ""])
229  request_llm/chatglmoonx.py  Normal file
@@ -0,0 +1,229 @@



# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
# ------------------------------------------------------------------------------------------------------------------------
import re
import numpy as np
# import torch
from onnxruntime import InferenceSession, SessionOptions


# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
# although they are documented as supported on CUDA.
providers = ["CPUExecutionProvider"]

# if torch.cuda.is_available():
#     providers = ["CUDAExecutionProvider"] + providers


# Default paths
tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"


# input & output names
past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
output_names = ["logits"] + present_names


# default kv_cache for first inference
default_past_key_values = {
    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
}


def chat_template(history: list[tuple[str, str]], current: str):
    prompt = ""
    chat_round = 0
    for question, answer in history:
        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
        chat_round += 1
    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
    return prompt
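For reference, the prompt this template yields for a one-turn history (values are made-up, assuming `chat_template` above is in scope):

```python
print(chat_template([("你好", "你好,有什么可以帮你?")], "介绍一下ONNX"))
# [Round 0]
# 问:你好
# 答:你好,有什么可以帮你?
# [Round 1]
# 问:介绍一下ONNX
# 答:
```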


def process_response(response: str):
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    punkts = [
        [",", ","],
        ["!", "!"],
        [":", ":"],
        [";", ";"],
        ["\?", "?"],
    ]
    for item in punkts:
        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
    return response


class ChatGLMModel():

    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
        options = SessionOptions()
        options.enable_profiling = profile
        self.session = InferenceSession(onnx_model_path, options, providers=providers)
        self.eop_token_id = self.tokenizer["<eop>"]

    def prepare_input(self, prompt: str):
        input_ids, prefix_mask = self.tokenizer.encode(prompt)

        input_ids = np.array([input_ids], dtype=np.longlong)
        prefix_mask = np.array([prefix_mask], dtype=np.longlong)

        return input_ids, prefix_mask, default_past_key_values

    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
        # softmax with temperature
        exp_logits = np.exp(logits / temperature)
        probs = exp_logits / np.sum(exp_logits)

        # top k
        top_k_idx = np.argsort(-probs)[:top_k]
        top_k_probs = probs[top_k_idx]

        # top p
        cumsum_probs = np.cumsum(top_k_probs)
        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
        top_k_probs = top_k_probs / np.sum(top_k_probs)

        # sample
        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
        return next_token[0].item()
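The sampling step above is plain numpy, so it can be checked standalone; a self-contained sketch of the same top-k / top-p filtering on toy logits (no model or tokenizer required):

```python
import numpy as np

logits = np.array([2.0, 1.0, 0.5, 0.1, -1.0])
temperature, top_k, top_p = 1.0, 3, 0.7

probs = np.exp(logits / temperature)
probs /= probs.sum()

top_k_idx = np.argsort(-probs)[:top_k]            # indices of the 3 most likely tokens
top_k_probs = probs[top_k_idx]

cumsum = np.cumsum(top_k_probs)
top_k_probs[(cumsum - top_k_probs) > top_p] = 0.0  # drop tokens once the preceding mass exceeds top_p
top_k_probs /= top_k_probs.sum()

next_token = np.random.choice(top_k_idx, p=top_k_probs)
print(int(next_token))
```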

    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
        output_tokens = []

        while True:
            inputs = {
                "input_ids": input_ids,
                "prefix_mask": prefix_mask,
                "use_past": np.array(len(output_tokens) > 0),
            }
            inputs.update(past_key_values)

            logits, *past_key_values = self.session.run(output_names, inputs)
            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }

            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)

            output_tokens += [next_token]

            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
                break

            input_ids = np.array([[next_token]], dtype=np.longlong)
            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)

            yield process_response(self.tokenizer.decode(output_tokens))

        return process_response(self.tokenizer.decode(output_tokens))




# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
# ------------------------------------------------------------------------------------------------------------------------

import re
from sentencepiece import SentencePieceProcessor


def replace_spaces_with_blank(match: re.Match[str]):
    return f"<|blank_{len(match.group())}|>"


def replace_blank_with_spaces(match: re.Match[str]):
    return " " * int(match.group(1))


class ChatGLMTokenizer:
    def __init__(self, vocab_file):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))

    def __len__(self):
        return len(self.text_tokenizer)

    def __getitem__(self, key: str):
        return self.text_tokenizer[key]

    def preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = text.replace("\t", "<|tab|>")
            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
        return text
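The `decode` method below simply reverses these substitutions; a quick standalone check of what `preprocess` does to newlines, tabs and space runs:

```python
import re

def replace_spaces_with_blank(match):
    return f"<|blank_{len(match.group())}|>"

text = "def f():\n\treturn    1"
text = text.replace("\n", "<n>").replace("\t", "<|tab|>")
text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
print(text)   # def f():<n><|tab|>return<|blank_4|>1
```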

    def encode(
        self, text: str, text_pair: str = None,
        linebreak=True, whitespaces=True,
        add_dummy_prefix=True, special_tokens=True,
    ) -> tuple[list[int], list[int]]:
        """
        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
        text_pair: causal LM part.
        linebreak: Whether to encode newline (\n) in text.
        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self.preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text

        tokens = self.text_tokenizer.encode(text)
        prefix_mask = [1] * len(tokens)
        if special_tokens:
            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
            prefix_mask += [1, 0]

        if text_pair is not None:
            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
            pair_tokens = self.text_tokenizer.encode(text_pair)
            tokens += pair_tokens
            prefix_mask += [0] * len(pair_tokens)
            if special_tokens:
                tokens += [self.text_tokenizer["<eop>"]]
                prefix_mask += [0]

        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask

    def decode(self, text_ids: list[int]) -> str:
        text = self.text_tokenizer.decode(text_ids)
        text = text.replace("<n>", "\n")
        text = text.replace("<|tab|>", "\t")
        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
        return text
184  request_llm/com_sparkapi.py  Normal file
@@ -0,0 +1,184 @@
from toolbox import get_conf
import base64
import datetime
import hashlib
import hmac
import json
from urllib.parse import urlparse
import ssl
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time
import websocket
import threading, time

timeout_bot_msg = '[Local Message] Request timeout. Network error.'

class Ws_Param(object):
    # 初始化
    def __init__(self, APPID, APIKey, APISecret, gpt_url):
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.host = urlparse(gpt_url).netloc
        self.path = urlparse(gpt_url).path
        self.gpt_url = gpt_url

    # 生成url
    def create_url(self):
        # 生成RFC1123格式的时间戳
        now = datetime.now()
        date = format_date_time(mktime(now.timetuple()))

        # 拼接字符串
        signature_origin = "host: " + self.host + "\n"
        signature_origin += "date: " + date + "\n"
        signature_origin += "GET " + self.path + " HTTP/1.1"

        # 进行hmac-sha256进行加密
        signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'), digestmod=hashlib.sha256).digest()
        signature_sha_base64 = base64.b64encode(signature_sha).decode(encoding='utf-8')
        authorization_origin = f'api_key="{self.APIKey}", algorithm="hmac-sha256", headers="host date request-line", signature="{signature_sha_base64}"'
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # 将请求的鉴权参数组合为字典
        v = {
            "authorization": authorization,
            "date": date,
            "host": self.host
        }
        # 拼接鉴权参数,生成url
        url = self.gpt_url + '?' + urlencode(v)
        # 此处打印出建立连接时候的url,参考本demo的时候可取消上方打印的注释,比对相同参数时生成的url与自己代码生成的url是否一致
        return url



class SparkRequestInstance():
    def __init__(self):
        XFYUN_APPID, XFYUN_API_SECRET, XFYUN_API_KEY = get_conf('XFYUN_APPID', 'XFYUN_API_SECRET', 'XFYUN_API_KEY')

        self.appid = XFYUN_APPID
        self.api_secret = XFYUN_API_SECRET
        self.api_key = XFYUN_API_KEY
        self.gpt_url = "ws://spark-api.xf-yun.com/v1.1/chat"
        self.time_to_yield_event = threading.Event()
        self.time_to_exit_event = threading.Event()

        self.result_buf = ""

    def generate(self, inputs, llm_kwargs, history, system_prompt):
        llm_kwargs = llm_kwargs
        history = history
        system_prompt = system_prompt
        import _thread as thread
        thread.start_new_thread(self.create_blocking_request, (inputs, llm_kwargs, history, system_prompt))
        while True:
            self.time_to_yield_event.wait(timeout=1)
            if self.time_to_yield_event.is_set():
                yield self.result_buf
            if self.time_to_exit_event.is_set():
                return self.result_buf


    def create_blocking_request(self, inputs, llm_kwargs, history, system_prompt):
        wsParam = Ws_Param(self.appid, self.api_key, self.api_secret, self.gpt_url)
        websocket.enableTrace(False)
        wsUrl = wsParam.create_url()

        # 收到websocket连接建立的处理
        def on_open(ws):
            import _thread as thread
            thread.start_new_thread(run, (ws,))

        def run(ws, *args):
            data = json.dumps(gen_params(ws.appid, *ws.all_args))
            ws.send(data)

        # 收到websocket消息的处理
        def on_message(ws, message):
            data = json.loads(message)
            code = data['header']['code']
            if code != 0:
                print(f'请求错误: {code}, {data}')
                ws.close()
                self.time_to_exit_event.set()
            else:
                choices = data["payload"]["choices"]
                status = choices["status"]
                content = choices["text"][0]["content"]
                ws.content += content
                self.result_buf += content
                if status == 2:
                    ws.close()
                    self.time_to_exit_event.set()
            self.time_to_yield_event.set()

        # 收到websocket错误的处理
        def on_error(ws, error):
            print("error:", error)
            self.time_to_exit_event.set()

        # 收到websocket关闭的处理
        def on_close(ws, *args):
            self.time_to_exit_event.set()

        # websocket
        ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close, on_open=on_open)
        ws.appid = self.appid
        ws.content = ""
        ws.all_args = (inputs, llm_kwargs, history, system_prompt)
        ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

def generate_message_payload(inputs, llm_kwargs, history, system_prompt):
    conversation_cnt = len(history) // 2
    messages = [{"role": "system", "content": system_prompt}]
    if conversation_cnt:
        for index in range(0, 2*conversation_cnt, 2):
            what_i_have_asked = {}
            what_i_have_asked["role"] = "user"
            what_i_have_asked["content"] = history[index]
            what_gpt_answer = {}
            what_gpt_answer["role"] = "assistant"
            what_gpt_answer["content"] = history[index+1]
            if what_i_have_asked["content"] != "":
                if what_gpt_answer["content"] == "": continue
                if what_gpt_answer["content"] == timeout_bot_msg: continue
                messages.append(what_i_have_asked)
                messages.append(what_gpt_answer)
            else:
                messages[-1]['content'] = what_gpt_answer['content']
    what_i_ask_now = {}
    what_i_ask_now["role"] = "user"
    what_i_ask_now["content"] = inputs
    messages.append(what_i_ask_now)
    return messages


def gen_params(appid, inputs, llm_kwargs, history, system_prompt):
    """
    通过appid和用户的提问来生成请求参数
    """
    data = {
        "header": {
            "app_id": appid,
            "uid": "1234"
        },
        "parameter": {
            "chat": {
                "domain": "general",
                "temperature": llm_kwargs["temperature"],
                "random_threshold": 0.5,
                "max_tokens": 4096,
                "auditing": "default"
            }
        },
        "payload": {
            "message": {
                "text": generate_message_payload(inputs, llm_kwargs, history, system_prompt)
            }
        }
    }
    return data
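For orientation, the request body this builds for a first-turn query (placeholder appid, temperature 0.5), assuming `gen_params` and `generate_message_payload` above are in scope:

```python
import json

payload = gen_params("my_appid", "你好", {"temperature": 0.5}, history=[], system_prompt="You are a helpful assistant.")
print(json.dumps(payload, ensure_ascii=False, indent=2))
# -> header.app_id = "my_appid", parameter.chat.domain = "general",
#    payload.message.text = [{"role": "system", ...}, {"role": "user", "content": "你好"}]
```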

@@ -447,6 +447,15 @@ class _ChatHub:
        """
        Ask a question to the bot
        """
        req_header = HEADERS
        if self.cookies is not None:
            ws_cookies = []
            for cookie in self.cookies:
                ws_cookies.append(f"{cookie['name']}={cookie['value']}")
            req_header.update({
                'Cookie': ';'.join(ws_cookies),
            })

        timeout = aiohttp.ClientTimeout(total=30)
        self.session = aiohttp.ClientSession(timeout=timeout)

@@ -455,7 +464,7 @@
            # Check if websocket is closed
            self.wss = await self.session.ws_connect(
                wss_link,
                headers=HEADERS,
                headers=req_header,
                ssl=ssl_context,
                proxy=self.proxy,
                autoping=False,
@@ -510,7 +519,11 @@
        resp_txt_no_link = ""
        while not final:
            msg = await self.wss.receive()
            objects = msg.data.split(DELIMITER)
            try:
                objects = msg.data.split(DELIMITER)
            except :
                continue

            for obj in objects:
                if obj is None or not obj:
                    continue
@@ -1109,4 +1122,4 @@ class ImageQuery(Query):


if __name__ == "__main__":
    main()
    main()
180  request_llm/local_llm_class.py  Normal file
@@ -0,0 +1,180 @@
from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf, Singleton
from multiprocessing import Process, Pipe

def SingletonLocalLLM(cls):
    """
    一个单实例装饰器
    """
    _instance = {}
    def _singleton(*args, **kargs):
        if cls not in _instance:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        elif _instance[cls].corrupted:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        else:
            return _instance[cls]
    return _singleton
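A small self-contained check of the decorator's behaviour (the `Dummy` class is made up for illustration; it only needs the `corrupted` attribute the decorator inspects):

```python
@SingletonLocalLLM
class Dummy:
    def __init__(self):
        self.corrupted = False

a, b = Dummy(), Dummy()
assert a is b                  # the same instance is reused
a.corrupted = True
c = Dummy()                    # a corrupted instance is rebuilt on the next call
assert c is not a
```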

class LocalLLMHandle(Process):
    def __init__(self):
        # ⭐主进程执行
        super().__init__(daemon=True)
        self.corrupted = False
        self.load_model_info()
        self.parent, self.child = Pipe()
        self.running = True
        self._model = None
        self._tokenizer = None
        self.info = ""
        self.check_dependency()
        self.start()
        self.threadLock = threading.Lock()

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")
        self.model_name = ""
        self.cmd_to_install = ""

    def load_model_and_tokenizer(self):
        """
        This function should return the model and the tokenizer
        """
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def try_to_import_special_deps(self, **kwargs):
        """
        import something that will raise error if the user does not install requirement_*.txt
        """
        # ⭐主进程执行
        raise NotImplementedError("Method not implemented yet")

    def check_dependency(self):
        # ⭐主进程执行
        try:
            self.try_to_import_special_deps()
            self.info = "依赖检测通过"
            self.running = True
        except:
            self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。"
            self.running = False

    def run(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 第一次运行,加载参数
        try:
            self._model, self._tokenizer = self.load_model_and_tokenizer()
        except:
            self.running = False
            from toolbox import trimmed_format_exc
            self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
            self.child.send('[FinishBad]')
            raise RuntimeError(f"不能正常加载{self.model_name}的参数!")

        while True:
            # 进入任务等待状态
            kwargs = self.child.recv()
            # 收到消息,开始请求
            try:
                for response_full in self.llm_stream_generator(**kwargs):
                    self.child.send(response_full)
                self.child.send('[Finish]')
                # 请求处理结束,开始下一个循环
            except:
                from toolbox import trimmed_format_exc
                self.child.send(f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
                self.child.send('[Finish]')

    def stream_chat(self, **kwargs):
        # ⭐主进程执行
        self.threadLock.acquire()
        self.parent.send(kwargs)
        while True:
            res = self.parent.recv()
            if res == '[Finish]':
                break
            if res == '[FinishBad]':
                self.running = False
                self.corrupted = True
                break
            else:
                yield res
        self.threadLock.release()


def get_local_llm_predict_fns(LLMSingletonClass, model_name):
    load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"

    def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
        """
        ⭐多线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        _llm_handle = LLMSingletonClass()
        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info
        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)

        # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
        history_feedin = []
        history_feedin.append(["What can I do?", sys_prompt])
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        watch_dog_patience = 5  # 看门狗 (watchdog) 的耐心, 设置5秒即可
        response = ""
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            if len(observe_window) >= 1:
                observe_window[0] = response
            if len(observe_window) >= 2:
                if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
        return response


    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
        """
        ⭐单线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        chatbot.append((inputs, ""))

        _llm_handle = LLMSingletonClass()
        chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
        yield from update_ui(chatbot=chatbot, history=[])
        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)

        if additional_fn is not None:
            from core_functional import handle_core_functionality
            inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

        # 处理历史信息
        history_feedin = []
        history_feedin.append(["What can I do?", system_prompt] )
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        # 开始接收回复
        response = f"[Local Message]: 等待{model_name}响应中 ..."
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            chatbot[-1] = (inputs, response)
            yield from update_ui(chatbot=chatbot, history=history)

        # 总结输出
        if response == f"[Local Message]: 等待{model_name}响应中 ...":
            response = f"[Local Message]: {model_name}响应异常 ..."
        history.extend([inputs, response])
        yield from update_ui(chatbot=chatbot, history=history)

    return predict_no_ui_long_connection, predict
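This factory is what the new bridges (`bridge_internlm.py`, `bridge_qwen.py`) build on. A hedged skeleton of the minimal subclass a new local-model bridge would need; `MyModel` and its loading code are placeholders, not part of the commit:

```python
# Illustrative skeleton only
@SingletonLocalLLM
class GetMyModelHandle(LocalLLMHandle):
    def load_model_info(self):
        self.model_name = "MyModel"
        self.cmd_to_install = "`pip install -r request_llm/requirements_mymodel.txt`"

    def try_to_import_special_deps(self, **kwargs):
        import some_required_package   # raises if the extra requirements are missing

    def load_model_and_tokenizer(self):
        model, tokenizer = ..., ...    # load weights and tokenizer here
        return model, tokenizer

    def llm_stream_generator(self, **kwargs):
        query, history = kwargs['query'], kwargs['history']
        yield "partial response"       # stream chunks back to the main process

predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetMyModelHandle, "MyModel")
```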
@@ -1,5 +1,5 @@
protobuf
transformers==4.27.1
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
11  request_llm/requirements_chatglm_onnx.txt  Normal file
@@ -0,0 +1,11 @@
protobuf
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
sentencepiece
numpy
onnxruntime
sentencepiece
streamlit
streamlit-chat
2  request_llm/requirements_qwen.txt  Normal file
@@ -0,0 +1,2 @@
modelscope
transformers_stream_generator