"version": 3.48
@@ -16,18 +16,9 @@ from toolbox import get_conf, trimmed_format_exc
|
||||
from .bridge_chatgpt import predict_no_ui_long_connection as chatgpt_noui
|
||||
from .bridge_chatgpt import predict as chatgpt_ui
|
||||
|
||||
from .bridge_azure_test import predict_no_ui_long_connection as azure_noui
|
||||
from .bridge_azure_test import predict as azure_ui
|
||||
|
||||
from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
|
||||
from .bridge_chatglm import predict as chatglm_ui
|
||||
|
||||
from .bridge_newbing import predict_no_ui_long_connection as newbing_noui
|
||||
from .bridge_newbing import predict as newbing_ui
|
||||
|
||||
# from .bridge_tgui import predict_no_ui_long_connection as tgui_noui
|
||||
# from .bridge_tgui import predict as tgui_ui
|
||||
|
||||
colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
|
||||
|
||||
class LazyloadTiktoken(object):
|
||||
@@ -51,10 +42,11 @@ class LazyloadTiktoken(object):
|
||||
return encoder.decode(*args, **kwargs)
|
||||
|
||||
# Endpoint 重定向
|
||||
API_URL_REDIRECT, = get_conf("API_URL_REDIRECT")
|
||||
API_URL_REDIRECT, AZURE_ENDPOINT, AZURE_ENGINE = get_conf("API_URL_REDIRECT", "AZURE_ENDPOINT", "AZURE_ENGINE")
|
||||
openai_endpoint = "https://api.openai.com/v1/chat/completions"
|
||||
api2d_endpoint = "https://openai.api2d.net/v1/chat/completions"
|
||||
newbing_endpoint = "wss://sydney.bing.com/sydney/ChatHub"
|
||||
azure_endpoint = AZURE_ENDPOINT + f'openai/deployments/{AZURE_ENGINE}/chat/completions?api-version=2023-05-15'
|
||||
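# Illustrative sketch only: the two config values below are hypothetical placeholders, not project defaults.
_example_AZURE_ENDPOINT = "https://my-resource.openai.azure.com/"      # must end with a trailing slash
_example_AZURE_ENGINE = "my-gpt35-deployment"                          # name of the Azure deployment
_example_url = _example_AZURE_ENDPOINT + f'openai/deployments/{_example_AZURE_ENGINE}/chat/completions?api-version=2023-05-15'
# -> "https://my-resource.openai.azure.com/openai/deployments/my-gpt35-deployment/chat/completions?api-version=2023-05-15"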
# 兼容旧版的配置
|
||||
try:
|
||||
API_URL, = get_conf("API_URL")
|
||||
@@ -76,6 +68,10 @@ get_token_num_gpt35 = lambda txt: len(tokenizer_gpt35.encode(txt, disallowed_spe
|
||||
get_token_num_gpt4 = lambda txt: len(tokenizer_gpt4.encode(txt, disallowed_special=()))
|
||||
|
||||
|
||||
# 开始初始化模型
|
||||
AVAIL_LLM_MODELS, LLM_MODEL = get_conf("AVAIL_LLM_MODELS", "LLM_MODEL")
|
||||
AVAIL_LLM_MODELS = AVAIL_LLM_MODELS + [LLM_MODEL]
|
||||
# -=-=-=-=-=-=- 以下这部分是最早加入的最稳定的模型 -=-=-=-=-=-=-
|
||||
model_info = {
|
||||
# openai
|
||||
"gpt-3.5-turbo": {
|
||||
@@ -124,10 +120,10 @@ model_info = {
|
||||
},
|
||||
|
||||
# azure openai
|
||||
"azure-gpt35":{
|
||||
"fn_with_ui": azure_ui,
|
||||
"fn_without_ui": azure_noui,
|
||||
"endpoint": get_conf("AZURE_ENDPOINT"),
|
||||
"azure-gpt-3.5":{
|
||||
"fn_with_ui": chatgpt_ui,
|
||||
"fn_without_ui": chatgpt_noui,
|
||||
"endpoint": azure_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
@@ -169,21 +165,33 @@ model_info = {
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
|
||||
# newbing
|
||||
"newbing": {
|
||||
"fn_with_ui": newbing_ui,
|
||||
"fn_without_ui": newbing_noui,
|
||||
"endpoint": newbing_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
|
||||
AVAIL_LLM_MODELS, = get_conf("AVAIL_LLM_MODELS")
|
||||
# -=-=-=-=-=-=- 以下部分是新加入的模型,可能附带额外依赖 -=-=-=-=-=-=-
|
||||
if "claude-1-100k" in AVAIL_LLM_MODELS or "claude-2" in AVAIL_LLM_MODELS:
|
||||
from .bridge_claude import predict_no_ui_long_connection as claude_noui
|
||||
from .bridge_claude import predict as claude_ui
|
||||
model_info.update({
|
||||
"claude-1-100k": {
|
||||
"fn_with_ui": claude_ui,
|
||||
"fn_without_ui": claude_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 8196,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
})
|
||||
model_info.update({
|
||||
"claude-2": {
|
||||
"fn_with_ui": claude_ui,
|
||||
"fn_without_ui": claude_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 8196,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
})
|
||||
if "jittorllms_rwkv" in AVAIL_LLM_MODELS:
|
||||
from .bridge_jittorllms_rwkv import predict_no_ui_long_connection as rwkv_noui
|
||||
from .bridge_jittorllms_rwkv import predict as rwkv_ui
|
||||
@@ -239,7 +247,6 @@ if "moss" in AVAIL_LLM_MODELS:
|
||||
if "stack-claude" in AVAIL_LLM_MODELS:
|
||||
from .bridge_stackclaude import predict_no_ui_long_connection as claude_noui
|
||||
from .bridge_stackclaude import predict as claude_ui
|
||||
# claude
|
||||
model_info.update({
|
||||
"stack-claude": {
|
||||
"fn_with_ui": claude_ui,
|
||||
@@ -254,7 +261,6 @@ if "newbing-free" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_newbingfree import predict_no_ui_long_connection as newbingfree_noui
|
||||
from .bridge_newbingfree import predict as newbingfree_ui
|
||||
# claude
|
||||
model_info.update({
|
||||
"newbing-free": {
|
||||
"fn_with_ui": newbingfree_ui,
|
||||
@@ -267,6 +273,120 @@ if "newbing-free" in AVAIL_LLM_MODELS:
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "newbing" in AVAIL_LLM_MODELS: # same with newbing-free
|
||||
try:
|
||||
from .bridge_newbingfree import predict_no_ui_long_connection as newbingfree_noui
|
||||
from .bridge_newbingfree import predict as newbingfree_ui
|
||||
model_info.update({
|
||||
"newbing": {
|
||||
"fn_with_ui": newbingfree_ui,
|
||||
"fn_without_ui": newbingfree_noui,
|
||||
"endpoint": newbing_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatglmft" in AVAIL_LLM_MODELS: # same with newbing-free
|
||||
try:
|
||||
from .bridge_chatglmft import predict_no_ui_long_connection as chatglmft_noui
|
||||
from .bridge_chatglmft import predict as chatglmft_ui
|
||||
model_info.update({
|
||||
"chatglmft": {
|
||||
"fn_with_ui": chatglmft_ui,
|
||||
"fn_without_ui": chatglmft_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "internlm" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_internlm import predict_no_ui_long_connection as internlm_noui
|
||||
from .bridge_internlm import predict as internlm_ui
|
||||
model_info.update({
|
||||
"internlm": {
|
||||
"fn_with_ui": internlm_ui,
|
||||
"fn_without_ui": internlm_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatglm_onnx" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_chatglmonnx import predict_no_ui_long_connection as chatglm_onnx_noui
|
||||
from .bridge_chatglmonnx import predict as chatglm_onnx_ui
|
||||
model_info.update({
|
||||
"chatglm_onnx": {
|
||||
"fn_with_ui": chatglm_onnx_ui,
|
||||
"fn_without_ui": chatglm_onnx_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "qwen" in AVAIL_LLM_MODELS:
|
||||
try:
|
||||
from .bridge_qwen import predict_no_ui_long_connection as qwen_noui
|
||||
from .bridge_qwen import predict as qwen_ui
|
||||
model_info.update({
|
||||
"qwen": {
|
||||
"fn_with_ui": qwen_ui,
|
||||
"fn_without_ui": qwen_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "chatgpt_website" in AVAIL_LLM_MODELS: # 接入一些逆向工程https://github.com/acheong08/ChatGPT-to-API/
|
||||
try:
|
||||
from .bridge_chatgpt_website import predict_no_ui_long_connection as chatgpt_website_noui
|
||||
from .bridge_chatgpt_website import predict as chatgpt_website_ui
|
||||
model_info.update({
|
||||
"chatgpt_website": {
|
||||
"fn_with_ui": chatgpt_website_ui,
|
||||
"fn_without_ui": chatgpt_website_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "spark" in AVAIL_LLM_MODELS: # 讯飞星火认知大模型
|
||||
try:
|
||||
from .bridge_spark import predict_no_ui_long_connection as spark_noui
|
||||
from .bridge_spark import predict as spark_ui
|
||||
model_info.update({
|
||||
"spark": {
|
||||
"fn_with_ui": spark_ui,
|
||||
"fn_without_ui": spark_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
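# Illustrative sketch of the registration pattern used throughout this block; "my-model" and
# .bridge_my_model are hypothetical names, not part of this commit.
if "my-model" in AVAIL_LLM_MODELS:
    try:
        from .bridge_my_model import predict_no_ui_long_connection as my_model_noui   # hypothetical bridge module
        from .bridge_my_model import predict as my_model_ui
        model_info.update({
            "my-model": {
                "fn_with_ui": my_model_ui,        # streaming, UI-facing entry point
                "fn_without_ui": my_model_noui,   # thread-safe entry point without UI
                "endpoint": None,                 # local backends need no HTTP endpoint
                "max_token": 4096,
                "tokenizer": tokenizer_gpt35,     # reused only for token counting
                "token_cnt": get_token_num_gpt35,
            }
        })
    except:
        print(trimmed_format_exc())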
|
||||
|
||||
|
||||
|
||||
def LLM_CATCH_EXCEPTION(f):
|
||||
"""
|
||||
@@ -307,7 +427,8 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, obser
|
||||
method = model_info[model]["fn_without_ui"]
|
||||
return method(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience)
|
||||
else:
|
||||
# 如果同时询问多个大语言模型:
|
||||
|
||||
# 如果同时询问多个大语言模型,这个稍微啰嗦一点,但思路相同,您不必读这个else分支
|
||||
executor = ThreadPoolExecutor(max_workers=4)
|
||||
models = model.split('&')
|
||||
n_model = len(models)
|
||||
@@ -370,6 +491,6 @@ def predict(inputs, llm_kwargs, *args, **kwargs):
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
|
||||
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"]
|
||||
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"] # 如果这里报错,检查config中的AVAIL_LLM_MODELS选项
|
||||
yield from method(inputs, llm_kwargs, *args, **kwargs)
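# Minimal sketch of the dispatch pattern used above; _demo_dispatch and its arguments are illustrative only.
# Every registered backend exposes the same pair of callables, so routing is a plain dict lookup on model_info.
def _demo_dispatch(model_name, inputs, llm_kwargs, history, sys_prompt):
    entry = model_info[model_name]        # KeyError here means the model is missing from AVAIL_LLM_MODELS
    fn = entry["fn_without_ui"]           # thread-safe variant; "fn_with_ui" is the streaming, UI-facing one
    return fn(inputs, llm_kwargs, history, sys_prompt, observe_window=[], console_slience=True)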
|
||||
|
||||
|
||||
@@ -37,15 +37,23 @@ class GetGLMHandle(Process):
|
||||
# 子进程执行
|
||||
# 第一次运行,加载参数
|
||||
retry = 0
|
||||
LOCAL_MODEL_QUANT, device = get_conf('LOCAL_MODEL_QUANT', 'LOCAL_MODEL_DEVICE')
|
||||
|
||||
if LOCAL_MODEL_QUANT == "INT4": # INT4
|
||||
_model_name_ = "THUDM/chatglm2-6b-int4"
|
||||
elif LOCAL_MODEL_QUANT == "INT8": # INT8
|
||||
_model_name_ = "THUDM/chatglm2-6b-int8"
|
||||
else:
|
||||
_model_name_ = "THUDM/chatglm2-6b" # FP16
|
||||
|
||||
while True:
|
||||
try:
|
||||
if self.chatglm_model is None:
|
||||
self.chatglm_tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
|
||||
device, = get_conf('LOCAL_MODEL_DEVICE')
|
||||
self.chatglm_tokenizer = AutoTokenizer.from_pretrained(_model_name_, trust_remote_code=True)
|
||||
if device=='cpu':
|
||||
self.chatglm_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).float()
|
||||
self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).float()
|
||||
else:
|
||||
self.chatglm_model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).half().cuda()
|
||||
self.chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True).half().cuda()
|
||||
self.chatglm_model = self.chatglm_model.eval()
|
||||
break
|
||||
else:
|
||||
@@ -136,11 +144,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
import core_functional
|
||||
importlib.reload(core_functional) # 热更新prompt
|
||||
core_functional = core_functional.get_core_functions()
|
||||
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
||||
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
# 处理历史信息
|
||||
history_feedin = []
|
||||
|
||||
207  request_llm/bridge_chatglmft.py  Normal file
@@ -0,0 +1,207 @@
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import os
|
||||
import json
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
load_message = "ChatGLMFT尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,ChatGLMFT消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"
|
||||
|
||||
def string_to_options(arguments):
|
||||
import argparse
|
||||
import shlex
|
||||
# Create an argparse.ArgumentParser instance
|
||||
parser = argparse.ArgumentParser()
|
||||
# Add command-line arguments
|
||||
parser.add_argument("--llm_to_learn", type=str, help="LLM model to learn", default="gpt-3.5-turbo")
|
||||
parser.add_argument("--prompt_prefix", type=str, help="Prompt prefix", default='')
|
||||
parser.add_argument("--system_prompt", type=str, help="System prompt", default='')
|
||||
parser.add_argument("--batch", type=int, help="System prompt", default=50)
|
||||
# Parse the arguments
|
||||
args = parser.parse_args(shlex.split(arguments))
|
||||
return args
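# Hypothetical invocation for illustration (the argument string is made up):
# string_to_options('--llm_to_learn gpt-3.5-turbo --batch 32') returns an argparse.Namespace with
# .llm_to_learn == 'gpt-3.5-turbo' and .batch == 32, while --prompt_prefix and --system_prompt keep their '' defaults.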
|
||||
|
||||
|
||||
#################################################################################
|
||||
class GetGLMFTHandle(Process):
|
||||
def __init__(self):
|
||||
super().__init__(daemon=True)
|
||||
self.parent, self.child = Pipe()
|
||||
self.chatglmft_model = None
|
||||
self.chatglmft_tokenizer = None
|
||||
self.info = ""
|
||||
self.success = True
|
||||
self.check_dependency()
|
||||
self.start()
|
||||
self.threadLock = threading.Lock()
|
||||
|
||||
def check_dependency(self):
|
||||
try:
|
||||
import sentencepiece
|
||||
self.info = "依赖检测通过"
|
||||
self.success = True
|
||||
except:
|
||||
self.info = "缺少ChatGLMFT的依赖,如果要使用ChatGLMFT,除了基础的pip依赖以外,您还需要运行`pip install -r request_llm/requirements_chatglm.txt`安装ChatGLM的依赖。"
|
||||
self.success = False
|
||||
|
||||
def ready(self):
|
||||
return self.chatglmft_model is not None
|
||||
|
||||
def run(self):
|
||||
# 子进程执行
|
||||
# 第一次运行,加载参数
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
if self.chatglmft_model is None:
|
||||
from transformers import AutoConfig
|
||||
import torch
|
||||
# conf = 'request_llm/current_ptune_model.json'
|
||||
# if not os.path.exists(conf): raise RuntimeError('找不到微调模型信息')
|
||||
# with open(conf, 'r', encoding='utf8') as f:
|
||||
# model_args = json.loads(f.read())
|
||||
ChatGLM_PTUNING_CHECKPOINT, = get_conf('ChatGLM_PTUNING_CHECKPOINT')
|
||||
assert os.path.exists(ChatGLM_PTUNING_CHECKPOINT), "找不到微调模型检查点"
|
||||
conf = os.path.join(ChatGLM_PTUNING_CHECKPOINT, "config.json")
|
||||
with open(conf, 'r', encoding='utf8') as f:
|
||||
model_args = json.loads(f.read())
|
||||
if 'model_name_or_path' not in model_args:
|
||||
model_args['model_name_or_path'] = model_args['_name_or_path']
|
||||
self.chatglmft_tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args['model_name_or_path'], trust_remote_code=True)
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args['model_name_or_path'], trust_remote_code=True)
|
||||
|
||||
config.pre_seq_len = model_args['pre_seq_len']
|
||||
config.prefix_projection = model_args['prefix_projection']
|
||||
|
||||
print(f"Loading prefix_encoder weight from {ChatGLM_PTUNING_CHECKPOINT}")
|
||||
model = AutoModel.from_pretrained(model_args['model_name_or_path'], config=config, trust_remote_code=True)
|
||||
prefix_state_dict = torch.load(os.path.join(ChatGLM_PTUNING_CHECKPOINT, "pytorch_model.bin"))
|
||||
new_prefix_state_dict = {}
|
||||
for k, v in prefix_state_dict.items():
|
||||
if k.startswith("transformer.prefix_encoder."):
|
||||
new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
|
||||
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
|
||||
|
||||
if model_args['quantization_bit'] is not None:
|
||||
print(f"Quantized to {model_args['quantization_bit']} bit")
|
||||
model = model.quantize(model_args['quantization_bit'])
|
||||
model = model.cuda()
|
||||
if model_args['pre_seq_len'] is not None:
|
||||
# P-tuning v2
|
||||
model.transformer.prefix_encoder.float()
|
||||
self.chatglmft_model = model.eval()
|
||||
|
||||
break
|
||||
else:
|
||||
break
|
||||
except Exception as e:
|
||||
retry += 1
|
||||
if retry > 3:
|
||||
self.child.send('[Local Message] Call ChatGLMFT fail 不能正常加载ChatGLMFT的参数。')
|
||||
raise RuntimeError("不能正常加载ChatGLMFT的参数!")
|
||||
|
||||
while True:
|
||||
# 进入任务等待状态
|
||||
kwargs = self.child.recv()
|
||||
# 收到消息,开始请求
|
||||
try:
|
||||
for response, history in self.chatglmft_model.stream_chat(self.chatglmft_tokenizer, **kwargs):
|
||||
self.child.send(response)
|
||||
# # 中途接收可能的终止指令(如果有的话)
|
||||
# if self.child.poll():
|
||||
# command = self.child.recv()
|
||||
# if command == '[Terminate]': break
|
||||
except:
|
||||
from toolbox import trimmed_format_exc
|
||||
self.child.send('[Local Message] Call ChatGLMFT fail.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
|
||||
# 请求处理结束,开始下一个循环
|
||||
self.child.send('[Finish]')
|
||||
|
||||
def stream_chat(self, **kwargs):
|
||||
# 主进程执行
|
||||
self.threadLock.acquire()
|
||||
self.parent.send(kwargs)
|
||||
while True:
|
||||
res = self.parent.recv()
|
||||
if res != '[Finish]':
|
||||
yield res
|
||||
else:
|
||||
break
|
||||
self.threadLock.release()
|
||||
|
||||
global glmft_handle
|
||||
glmft_handle = None
|
||||
#################################################################################
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
|
||||
"""
|
||||
多线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
global glmft_handle
|
||||
if glmft_handle is None:
|
||||
glmft_handle = GetGLMFTHandle()
|
||||
if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + glmft_handle.info
|
||||
if not glmft_handle.success:
|
||||
error = glmft_handle.info
|
||||
glmft_handle = None
|
||||
raise RuntimeError(error)
|
||||
|
||||
# chatglmft 没有 sys_prompt 接口,因此把prompt加入 history
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", sys_prompt])
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
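# Illustration (contents made up): with sys_prompt == "Be brief" and history == ["Q1", "A1", "Q2", "A2"],
# history_feedin becomes [["What can I do?", "Be brief"], ["Q1", "A1"], ["Q2", "A2"]], i.e. the system
# prompt is injected as a fake first exchange because ChatGLMFT exposes no dedicated sys_prompt slot.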
|
||||
|
||||
watch_dog_patience = 5 # 看门狗 (watchdog) 的耐心, 设置5秒即可
|
||||
response = ""
|
||||
for response in glmft_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
|
||||
if len(observe_window) >= 1: observe_window[0] = response
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("程序终止。")
|
||||
return response
|
||||
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
单线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
chatbot.append((inputs, ""))
|
||||
|
||||
global glmft_handle
|
||||
if glmft_handle is None:
|
||||
glmft_handle = GetGLMFTHandle()
|
||||
chatbot[-1] = (inputs, load_message + "\n\n" + glmft_handle.info)
|
||||
yield from update_ui(chatbot=chatbot, history=[])
|
||||
if not glmft_handle.success:
|
||||
glmft_handle = None
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
# 处理历史信息
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", system_prompt] )
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
|
||||
|
||||
# 开始接收chatglmft的回复
|
||||
response = "[Local Message]: 等待ChatGLMFT响应中 ..."
|
||||
for response in glmft_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
|
||||
chatbot[-1] = (inputs, response)
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
|
||||
# 总结输出
|
||||
if response == "[Local Message]: 等待ChatGLMFT响应中 ...":
|
||||
response = "[Local Message]: ChatGLMFT响应异常 ..."
|
||||
history.extend([inputs, response])
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
73  request_llm/bridge_chatglmonnx.py  Normal file
@@ -0,0 +1,73 @@
|
||||
model_name = "ChatGLM-ONNX"
|
||||
cmd_to_install = "`pip install -r request_llm/requirements_chatglm_onnx.txt`"
|
||||
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
|
||||
|
||||
from .chatglmoonx import ChatGLMModel, chat_template
|
||||
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
@SingletonLocalLLM
|
||||
class GetONNXGLMHandle(LocalLLMHandle):
|
||||
|
||||
def load_model_info(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
self.model_name = model_name
|
||||
self.cmd_to_install = cmd_to_install
|
||||
|
||||
def load_model_and_tokenizer(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
import os, glob
|
||||
if not len(glob.glob("./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/*.bin")) >= 7: # 该模型有七个 bin 文件
|
||||
from huggingface_hub import snapshot_download
|
||||
snapshot_download(repo_id="K024/ChatGLM-6b-onnx-u8s8", local_dir="./request_llm/ChatGLM-6b-onnx-u8s8")
|
||||
def create_model():
|
||||
return ChatGLMModel(
|
||||
tokenizer_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/sentencepiece.model",
|
||||
onnx_model_path = "./request_llm/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"
|
||||
)
|
||||
self._model = create_model()
|
||||
return self._model, None
|
||||
|
||||
def llm_stream_generator(self, **kwargs):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
def adaptor(kwargs):
|
||||
query = kwargs['query']
|
||||
max_length = kwargs['max_length']
|
||||
top_p = kwargs['top_p']
|
||||
temperature = kwargs['temperature']
|
||||
history = kwargs['history']
|
||||
return query, max_length, top_p, temperature, history
|
||||
|
||||
query, max_length, top_p, temperature, history = adaptor(kwargs)
|
||||
|
||||
prompt = chat_template(history, query)
|
||||
for answer in self._model.generate_iterate(
|
||||
prompt,
|
||||
max_generated_tokens=max_length,
|
||||
top_k=1,
|
||||
top_p=top_p,
|
||||
temperature=temperature,
|
||||
):
|
||||
yield answer
|
||||
|
||||
def try_to_import_special_deps(self, **kwargs):
|
||||
# import something that will raise error if the user does not install requirement_*.txt
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
pass
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 GPT-Academic Interface
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
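# Hedged sketch of what the factory call above is expected to provide: two entry points whose signatures
# follow the other bridges in this commit, both driving the singleton GetONNXGLMHandle subprocess.
# predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream=True, additional_fn=None)
# predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False)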
|
||||
@@ -22,8 +22,8 @@ import importlib
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, is_any_api_key, select_api_key, what_keys, clip_history, trimmed_format_exc
|
||||
proxies, API_KEY, TIMEOUT_SECONDS, MAX_RETRY = \
|
||||
get_conf('proxies', 'API_KEY', 'TIMEOUT_SECONDS', 'MAX_RETRY')
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, API_ORG = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'API_ORG')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
@@ -101,6 +101,8 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="",
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
else: raise RuntimeError("意外Json结构:"+delta)
|
||||
if json_data['finish_reason'] == 'content_filter':
|
||||
raise RuntimeError("由于提问含不合规内容被Azure过滤。")
|
||||
if json_data['finish_reason'] == 'length':
|
||||
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
||||
return result
|
||||
@@ -127,11 +129,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
import core_functional
|
||||
importlib.reload(core_functional) # 热更新prompt
|
||||
core_functional = core_functional.get_core_functions()
|
||||
if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话)
|
||||
inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
@@ -172,9 +171,10 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
chunk = next(stream_response)
|
||||
except StopIteration:
|
||||
# 非OpenAI官方接口的出现这样的报错,OpenAI和API2D不会走这里
|
||||
from toolbox import regular_txt_to_markdown; tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 远程返回错误: \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk.decode())}")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="远程返回错误:" + chunk.decode()) # 刷新界面
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="非Openai官方接口返回了错误:" + chunk.decode()) # 刷新界面
|
||||
return
|
||||
|
||||
# print(chunk.decode()[6:])
|
||||
@@ -185,7 +185,7 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
if chunk:
|
||||
try:
|
||||
chunk_decoded = chunk.decode()
|
||||
# 前者API2D的
|
||||
# 前者是API2D的结束条件,后者是OPENAI的结束条件
|
||||
if ('data: [DONE]' in chunk_decoded) or (len(json.loads(chunk_decoded[6:])['choices'][0]["delta"]) == 0):
|
||||
# 判定为数据流的结束,gpt_replying_buffer也写完了
|
||||
logging.info(f'[response] {gpt_replying_buffer}')
|
||||
@@ -198,36 +198,45 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg=status_text) # 刷新界面
|
||||
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json解析不合常规") # 刷新界面
|
||||
chunk = get_full_error(chunk, stream_response)
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务.")
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务.")
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + error_msg) # 刷新界面
|
||||
print(error_msg)
|
||||
return
|
||||
|
||||
def handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg):
|
||||
from .bridge_all import model_info
|
||||
openai_website = ' 请登录OpenAI查看详情 https://platform.openai.com/signup'
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务. " + openai_website)
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务." + openai_website)
|
||||
elif "account is not active" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Your account is not active. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "associated with a deactivated account" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You are associated with a deactivated account. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
return chatbot, history
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
@@ -241,6 +250,8 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
if API_ORG.startswith('org-'): headers.update({"OpenAI-Organization": API_ORG})
|
||||
if llm_kwargs['llm_model'].startswith('azure-'): headers.update({"api-key": api_key})
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
|
||||
297  request_llm/bridge_chatgpt_website.py  Normal file
@@ -0,0 +1,297 @@
|
||||
# 借鉴了 https://github.com/GaiZhenbiao/ChuanhuChatGPT 项目
|
||||
|
||||
"""
|
||||
该文件中主要包含三个函数
|
||||
|
||||
不具备多线程能力的函数:
|
||||
1. predict: 正常对话时使用,具备完备的交互功能,不可多线程
|
||||
|
||||
具备多线程调用能力的函数
|
||||
2. predict_no_ui:高级实验性功能模块调用,不会实时显示在界面上,参数简单,可以多线程并行,方便实现复杂的功能逻辑
|
||||
3. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import gradio as gr
|
||||
import logging
|
||||
import traceback
|
||||
import requests
|
||||
import importlib
|
||||
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, is_any_api_key, select_api_key, what_keys, clip_history, trimmed_format_exc
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, API_ORG = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'API_ORG')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
|
||||
def get_full_error(chunk, stream_response):
|
||||
"""
|
||||
获取完整的从Openai返回的报错
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
chunk += next(stream_response)
|
||||
except:
|
||||
break
|
||||
return chunk
|
||||
|
||||
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=None, console_slience=False):
|
||||
"""
|
||||
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免中途网线被掐。
|
||||
inputs:
|
||||
是本次问询的输入
|
||||
sys_prompt:
|
||||
系统静默prompt
|
||||
llm_kwargs:
|
||||
chatGPT的内部调优参数
|
||||
history:
|
||||
是之前的对话列表
|
||||
observe_window = None:
|
||||
用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可。observe_window[0]:观测窗。observe_window[1]:看门狗
|
||||
"""
|
||||
watch_dog_patience = 5 # 看门狗的耐心, 设置5秒即可
|
||||
headers, payload = generate_payload(inputs, llm_kwargs, history, system_prompt=sys_prompt, stream=True)
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=False
|
||||
from .bridge_all import model_info
|
||||
endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
response = requests.post(endpoint, headers=headers, proxies=proxies,
|
||||
json=payload, stream=True, timeout=TIMEOUT_SECONDS); break
|
||||
except requests.exceptions.ReadTimeout as e:
|
||||
retry += 1
|
||||
traceback.print_exc()
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
if MAX_RETRY!=0: print(f'请求超时,正在重试 ({retry}/{MAX_RETRY}) ……')
|
||||
|
||||
stream_response = response.iter_lines()
|
||||
result = ''
|
||||
while True:
|
||||
try: chunk = next(stream_response).decode()
|
||||
except StopIteration:
|
||||
break
|
||||
except requests.exceptions.ConnectionError:
|
||||
chunk = next(stream_response).decode() # 失败了,重试一次?再失败就没办法了。
|
||||
if len(chunk)==0: continue
|
||||
if not chunk.startswith('data:'):
|
||||
error_msg = get_full_error(chunk.encode('utf8'), stream_response).decode()
|
||||
if "reduce the length" in error_msg:
|
||||
raise ConnectionAbortedError("OpenAI拒绝了请求:" + error_msg)
|
||||
else:
|
||||
raise RuntimeError("OpenAI拒绝了请求:" + error_msg)
|
||||
if ('data: [DONE]' in chunk): break # api2d 正常完成
|
||||
json_data = json.loads(chunk.lstrip('data:'))['choices'][0]
|
||||
delta = json_data["delta"]
|
||||
if len(delta) == 0: break
|
||||
if "role" in delta: continue
|
||||
if "content" in delta:
|
||||
result += delta["content"]
|
||||
if not console_slience: print(delta["content"], end='')
|
||||
if observe_window is not None:
|
||||
# 观测窗,把已经获取的数据显示出去
|
||||
if len(observe_window) >= 1: observe_window[0] += delta["content"]
|
||||
# 看门狗,如果超过期限没有喂狗,则终止
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
else: raise RuntimeError("意外Json结构:"+delta)
|
||||
if json_data['finish_reason'] == 'content_filter':
|
||||
raise RuntimeError("由于提问含不合规内容被Azure过滤。")
|
||||
if json_data['finish_reason'] == 'length':
|
||||
raise ConnectionAbortedError("正常结束,但显示Token不足,导致输出不完整,请削减单次输入的文本量。")
|
||||
return result
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
发送至chatGPT,流式获取输出。
|
||||
用于基础的对话功能。
|
||||
inputs 是本次问询的输入
|
||||
top_p, temperature是chatGPT的内部调优参数
|
||||
history 是之前的对话列表(注意无论是inputs还是history,内容太长了都会触发token数量溢出的错误)
|
||||
chatbot 为WebUI中显示的对话列表,修改它,然后yield出去,可以直接修改对话界面内容
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
if is_any_api_key(inputs):
|
||||
chatbot._cookies['api_key'] = inputs
|
||||
chatbot.append(("输入已识别为openai的api_key", what_keys(inputs)))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api_key已导入") # 刷新界面
|
||||
return
|
||||
elif not is_any_api_key(chatbot._cookies['api_key']):
|
||||
chatbot.append((inputs, "缺少api_key。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。"))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="缺少api_key") # 刷新界面
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
chatbot.append((inputs, ""))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
|
||||
try:
|
||||
headers, payload = generate_payload(inputs, llm_kwargs, history, system_prompt, stream)
|
||||
except RuntimeError as e:
|
||||
chatbot[-1] = (inputs, f"您提供的api-key不满足要求,不包含任何可用于{llm_kwargs['llm_model']}的api-key。您可能选择了错误的模型或请求源。")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api-key不满足要求") # 刷新界面
|
||||
return
|
||||
|
||||
history.append(inputs); history.append("")
|
||||
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=True
|
||||
from .bridge_all import model_info
|
||||
endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
response = requests.post(endpoint, headers=headers, proxies=proxies,
|
||||
json=payload, stream=True, timeout=TIMEOUT_SECONDS);break
|
||||
except:
|
||||
retry += 1
|
||||
chatbot[-1] = ((chatbot[-1][0], timeout_bot_msg))
|
||||
retry_msg = f",正在重试 ({retry}/{MAX_RETRY}) ……" if MAX_RETRY > 0 else ""
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="请求超时"+retry_msg) # 刷新界面
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
|
||||
gpt_replying_buffer = ""
|
||||
|
||||
is_head_of_the_stream = True
|
||||
if stream:
|
||||
stream_response = response.iter_lines()
|
||||
while True:
|
||||
try:
|
||||
chunk = next(stream_response)
|
||||
except StopIteration:
|
||||
# 非OpenAI官方接口的出现这样的报错,OpenAI和API2D不会走这里
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="非Openai官方接口返回了错误:" + chunk.decode()) # 刷新界面
|
||||
return
|
||||
|
||||
# print(chunk.decode()[6:])
|
||||
if is_head_of_the_stream and (r'"object":"error"' not in chunk.decode()):
|
||||
# 数据流的第一帧不携带content
|
||||
is_head_of_the_stream = False; continue
|
||||
|
||||
if chunk:
|
||||
try:
|
||||
chunk_decoded = chunk.decode()
|
||||
# 前者是API2D的结束条件,后者是OPENAI的结束条件
|
||||
if 'data: [DONE]' in chunk_decoded:
|
||||
# 判定为数据流的结束,gpt_replying_buffer也写完了
|
||||
logging.info(f'[response] {gpt_replying_buffer}')
|
||||
break
|
||||
# 处理数据流的主体
|
||||
chunkjson = json.loads(chunk_decoded[6:])
|
||||
status_text = f"finish_reason: {chunkjson['choices'][0]['finish_reason']}"
|
||||
delta = chunkjson['choices'][0]["delta"]
|
||||
if "content" in delta:
|
||||
gpt_replying_buffer = gpt_replying_buffer + delta["content"]
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg=status_text) # 刷新界面
|
||||
except Exception as e:
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json解析不合常规") # 刷新界面
|
||||
chunk = get_full_error(chunk, stream_response)
|
||||
chunk_decoded = chunk.decode()
|
||||
error_msg = chunk_decoded
|
||||
chatbot, history = handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + error_msg) # 刷新界面
|
||||
print(error_msg)
|
||||
return
|
||||
|
||||
def handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg):
|
||||
from .bridge_all import model_info
|
||||
openai_website = ' 请登录OpenAI查看详情 https://platform.openai.com/signup'
|
||||
if "reduce the length" in error_msg:
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Incorrect API key. OpenAI以提供了不正确的API_KEY为由, 拒绝服务. " + openai_website)
|
||||
elif "exceeded your current quota" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You exceeded your current quota. OpenAI以账户额度不足为由, 拒绝服务." + openai_website)
|
||||
elif "account is not active" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Your account is not active. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "associated with a deactivated account" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] You are associated with a deactivated account. OpenAI以账户失效为由, 拒绝服务." + openai_website)
|
||||
elif "bad forward key" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Bad forward key. API2D账户额度不足.")
|
||||
elif "Not enough point" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Not enough point. API2D账户点数不足.")
|
||||
else:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str} \n\n{regular_txt_to_markdown(chunk_decoded)}")
|
||||
return chatbot, history
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
"""
|
||||
if not is_any_api_key(llm_kwargs['api_key']):
|
||||
raise AssertionError("你提供了错误的API_KEY。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。")
|
||||
|
||||
api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
if API_ORG.startswith('org-'): headers.update({"OpenAI-Organization": API_ORG})
|
||||
if llm_kwargs['llm_model'].startswith('azure-'): headers.update({"api-key": api_key})
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
messages = [{"role": "system", "content": system_prompt}]
|
||||
if conversation_cnt:
|
||||
for index in range(0, 2*conversation_cnt, 2):
|
||||
what_i_have_asked = {}
|
||||
what_i_have_asked["role"] = "user"
|
||||
what_i_have_asked["content"] = history[index]
|
||||
what_gpt_answer = {}
|
||||
what_gpt_answer["role"] = "assistant"
|
||||
what_gpt_answer["content"] = history[index+1]
|
||||
if what_i_have_asked["content"] != "":
|
||||
if what_gpt_answer["content"] == "": continue
|
||||
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
||||
messages.append(what_i_have_asked)
|
||||
messages.append(what_gpt_answer)
|
||||
else:
|
||||
messages[-1]['content'] = what_gpt_answer['content']
|
||||
|
||||
what_i_ask_now = {}
|
||||
what_i_ask_now["role"] = "user"
|
||||
what_i_ask_now["content"] = inputs
|
||||
messages.append(what_i_ask_now)
|
||||
|
||||
payload = {
|
||||
"model": llm_kwargs['llm_model'].strip('api2d-'),
|
||||
"messages": messages,
|
||||
"temperature": llm_kwargs['temperature'], # 1.0,
|
||||
"top_p": llm_kwargs['top_p'], # 1.0,
|
||||
"n": 1,
|
||||
"stream": stream,
|
||||
"presence_penalty": 0,
|
||||
"frequency_penalty": 0,
|
||||
}
|
||||
try:
|
||||
print(f" {llm_kwargs['llm_model']} : {conversation_cnt} : {inputs[:100]} ..........")
|
||||
except:
|
||||
print('输入中可能存在乱码。')
|
||||
return headers,payload
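# Shape of the returned values, for illustration (the key and sampling values are placeholders):
# headers -> {"Content-Type": "application/json", "Authorization": "Bearer sk-..."}   plus OpenAI-Organization / api-key when configured
# payload -> {"model": "gpt-3.5-turbo", "messages": [...], "temperature": 1.0, "top_p": 1.0,
#             "n": 1, "stream": True, "presence_penalty": 0, "frequency_penalty": 0}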
|
||||
|
||||
|
||||
228  request_llm/bridge_claude.py  Normal file
@@ -0,0 +1,228 @@
|
||||
# 借鉴了 https://github.com/GaiZhenbiao/ChuanhuChatGPT 项目
|
||||
|
||||
"""
|
||||
该文件中主要包含2个函数
|
||||
|
||||
不具备多线程能力的函数:
|
||||
1. predict: 正常对话时使用,具备完备的交互功能,不可多线程
|
||||
|
||||
具备多线程调用能力的函数
|
||||
2. predict_no_ui_long_connection:在实验过程中发现调用predict_no_ui处理长文档时,和openai的连接容易断掉,这个函数用stream的方式解决这个问题,同样支持多线程
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import gradio as gr
|
||||
import logging
|
||||
import traceback
|
||||
import requests
|
||||
import importlib
|
||||
|
||||
# config_private.py放自己的秘密如API和代理网址
|
||||
# 读取时首先看是否存在私密的config_private配置文件(不受git管控),如果有,则覆盖原config文件
|
||||
from toolbox import get_conf, update_ui, trimmed_format_exc, ProxyNetworkActivate
|
||||
proxies, TIMEOUT_SECONDS, MAX_RETRY, ANTHROPIC_API_KEY = \
|
||||
get_conf('proxies', 'TIMEOUT_SECONDS', 'MAX_RETRY', 'ANTHROPIC_API_KEY')
|
||||
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error. Please check proxy settings in config.py.' + \
|
||||
'网络错误,检查代理服务器是否可用,以及代理设置的格式是否正确,格式须是[协议]://[地址]:[端口],缺一不可。'
|
||||
|
||||
def get_full_error(chunk, stream_response):
|
||||
"""
|
||||
获取完整的从Openai返回的报错
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
chunk += next(stream_response)
|
||||
except:
|
||||
break
|
||||
return chunk
|
||||
|
||||
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=None, console_slience=False):
|
||||
"""
|
||||
发送至chatGPT,等待回复,一次性完成,不显示中间过程。但内部用stream的方法避免中途网线被掐。
|
||||
inputs:
|
||||
是本次问询的输入
|
||||
sys_prompt:
|
||||
系统静默prompt
|
||||
llm_kwargs:
|
||||
chatGPT的内部调优参数
|
||||
history:
|
||||
是之前的对话列表
|
||||
observe_window = None:
|
||||
用于负责跨越线程传递已经输出的部分,大部分时候仅仅为了fancy的视觉效果,留空即可。observe_window[0]:观测窗。observe_window[1]:看门狗
|
||||
"""
|
||||
from anthropic import Anthropic
|
||||
watch_dog_patience = 5 # 看门狗的耐心, 设置5秒即可
|
||||
prompt = generate_payload(inputs, llm_kwargs, history, system_prompt=sys_prompt, stream=True)
|
||||
retry = 0
|
||||
if len(ANTHROPIC_API_KEY) == 0:
|
||||
raise RuntimeError("没有设置ANTHROPIC_API_KEY选项")
|
||||
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=False
|
||||
from .bridge_all import model_info
|
||||
anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
|
||||
# endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
# with ProxyNetworkActivate()
|
||||
stream = anthropic.completions.create(
|
||||
prompt=prompt,
|
||||
max_tokens_to_sample=4096, # The maximum number of tokens to generate before stopping.
|
||||
model=llm_kwargs['llm_model'],
|
||||
stream=True,
|
||||
temperature = llm_kwargs['temperature']
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
retry += 1
|
||||
traceback.print_exc()
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
if MAX_RETRY!=0: print(f'请求超时,正在重试 ({retry}/{MAX_RETRY}) ……')
|
||||
result = ''
|
||||
try:
|
||||
for completion in stream:
|
||||
result += completion.completion
|
||||
if not console_slience: print(completion.completion, end='')
|
||||
if observe_window is not None:
|
||||
# 观测窗,把已经获取的数据显示出去
|
||||
if len(observe_window) >= 1: observe_window[0] += completion.completion
|
||||
# 看门狗,如果超过期限没有喂狗,则终止
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience:
|
||||
raise RuntimeError("用户取消了程序。")
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
发送至chatGPT,流式获取输出。
|
||||
用于基础的对话功能。
|
||||
inputs 是本次问询的输入
|
||||
top_p, temperature是chatGPT的内部调优参数
|
||||
history 是之前的对话列表(注意无论是inputs还是history,内容太长了都会触发token数量溢出的错误)
|
||||
chatbot 为WebUI中显示的对话列表,修改它,然后yield出去,可以直接修改对话界面内容
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
from anthropic import Anthropic
|
||||
if len(ANTHROPIC_API_KEY) == 0:
|
||||
chatbot.append((inputs, "没有设置ANTHROPIC_API_KEY"))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
raw_input = inputs
|
||||
logging.info(f'[raw_input] {raw_input}')
|
||||
chatbot.append((inputs, ""))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="等待响应") # 刷新界面
|
||||
|
||||
try:
|
||||
prompt = generate_payload(inputs, llm_kwargs, history, system_prompt, stream)
|
||||
except RuntimeError as e:
|
||||
chatbot[-1] = (inputs, f"您提供的api-key不满足要求,不包含任何可用于{llm_kwargs['llm_model']}的api-key。您可能选择了错误的模型或请求源。")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api-key不满足要求") # 刷新界面
|
||||
return
|
||||
|
||||
history.append(inputs); history.append("")
|
||||
|
||||
retry = 0
|
||||
while True:
|
||||
try:
|
||||
# make a POST request to the API endpoint, stream=True
|
||||
from .bridge_all import model_info
|
||||
anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
|
||||
# endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
|
||||
# with ProxyNetworkActivate()
|
||||
stream = anthropic.completions.create(
|
||||
prompt=prompt,
|
||||
max_tokens_to_sample=4096, # The maximum number of tokens to generate before stopping.
|
||||
model=llm_kwargs['llm_model'],
|
||||
stream=True,
|
||||
temperature = llm_kwargs['temperature']
|
||||
)
|
||||
|
||||
break
|
||||
except:
|
||||
retry += 1
|
||||
chatbot[-1] = ((chatbot[-1][0], timeout_bot_msg))
|
||||
retry_msg = f",正在重试 ({retry}/{MAX_RETRY}) ……" if MAX_RETRY > 0 else ""
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="请求超时"+retry_msg) # 刷新界面
|
||||
if retry > MAX_RETRY: raise TimeoutError
|
||||
|
||||
gpt_replying_buffer = ""
|
||||
|
||||
for completion in stream:
|
||||
try:
|
||||
gpt_replying_buffer = gpt_replying_buffer + completion.completion
|
||||
history[-1] = gpt_replying_buffer
|
||||
chatbot[-1] = (history[-2], history[-1])
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg='正常') # 刷新界面
|
||||
|
||||
except Exception as e:
|
||||
from toolbox import regular_txt_to_markdown
|
||||
tb_str = '```\n' + trimmed_format_exc() + '```'
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] 异常 \n\n{tb_str}")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="Json异常" + tb_str) # 刷新界面
|
||||
return
|
||||
|
||||
|
||||
|
||||
|
||||
# https://github.com/jtsang4/claude-to-chatgpt/blob/main/claude_to_chatgpt/adapter.py
|
||||
def convert_messages_to_prompt(messages):
|
||||
prompt = ""
|
||||
role_map = {
|
||||
"system": "Human",
|
||||
"user": "Human",
|
||||
"assistant": "Assistant",
|
||||
}
|
||||
for message in messages:
|
||||
role = message["role"]
|
||||
content = message["content"]
|
||||
transformed_role = role_map[role]
|
||||
prompt += f"\n\n{transformed_role.capitalize()}: {content}"
|
||||
prompt += "\n\nAssistant: "
|
||||
return prompt
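# Worked example (messages are made up) of the Claude-style prompt produced above:
# convert_messages_to_prompt([
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello"},
# ])
# == "\n\nHuman: You are a helpful assistant.\n\nHuman: Hello\n\nAssistant: "
# Both "system" and "user" map to the Human turn; only "assistant" becomes the Assistant turn.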
|
||||
|
||||
def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
"""
|
||||
整合所有信息,选择LLM模型,生成http请求,为发送请求做准备
|
||||
"""
|
||||
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
messages = [{"role": "system", "content": system_prompt}]
|
||||
if conversation_cnt:
|
||||
for index in range(0, 2*conversation_cnt, 2):
|
||||
what_i_have_asked = {}
|
||||
what_i_have_asked["role"] = "user"
|
||||
what_i_have_asked["content"] = history[index]
|
||||
what_gpt_answer = {}
|
||||
what_gpt_answer["role"] = "assistant"
|
||||
what_gpt_answer["content"] = history[index+1]
|
||||
if what_i_have_asked["content"] != "":
|
||||
if what_gpt_answer["content"] == "": continue
|
||||
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
||||
messages.append(what_i_have_asked)
|
||||
messages.append(what_gpt_answer)
|
||||
else:
|
||||
messages[-1]['content'] = what_gpt_answer['content']
|
||||
|
||||
what_i_ask_now = {}
|
||||
what_i_ask_now["role"] = "user"
|
||||
what_i_ask_now["content"] = inputs
|
||||
messages.append(what_i_ask_now)
|
||||
prompt = convert_messages_to_prompt(messages)
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
202  request_llm/bridge_internlm.py  Normal file
@@ -0,0 +1,202 @@
|
||||
model_name = "InternLM"
|
||||
cmd_to_install = "`pip install -r request_llm/requirements_chatglm.txt`"
|
||||
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import time
|
||||
import threading
|
||||
import importlib
|
||||
from toolbox import update_ui, get_conf
|
||||
from multiprocessing import Process, Pipe
|
||||
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model Utils
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
def try_to_import_special_deps():
|
||||
import sentencepiece
|
||||
|
||||
def combine_history(prompt, hist):
|
||||
user_prompt = "<|User|>:{user}<eoh>\n"
|
||||
robot_prompt = "<|Bot|>:{robot}<eoa>\n"
|
||||
cur_query_prompt = "<|User|>:{user}<eoh>\n<|Bot|>:"
|
||||
messages = hist
|
||||
total_prompt = ""
|
||||
for message in messages:
|
||||
cur_content = message
|
||||
cur_prompt = user_prompt.replace("{user}", cur_content[0])
|
||||
total_prompt += cur_prompt
|
||||
cur_prompt = robot_prompt.replace("{robot}", cur_content[1])
|
||||
total_prompt += cur_prompt
|
||||
total_prompt = total_prompt + cur_query_prompt.replace("{user}", prompt)
|
||||
return total_prompt
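# Worked example (contents made up): combine_history("How are you?", [["Hi", "Hello!"]]) returns
# "<|User|>:Hi<eoh>\n<|Bot|>:Hello!<eoa>\n<|User|>:How are you?<eoh>\n<|Bot|>:"
# Each past exchange is wrapped in <|User|>/<|Bot|> tags and the new query ends with an open <|Bot|>: turn.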
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
@SingletonLocalLLM
|
||||
class GetInternlmHandle(LocalLLMHandle):
|
||||
|
||||
def load_model_info(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
self.model_name = model_name
|
||||
self.cmd_to_install = cmd_to_install
|
||||
|
||||
def try_to_import_special_deps(self, **kwargs):
|
||||
"""
|
||||
import something that will raise error if the user does not install requirement_*.txt
|
||||
"""
|
||||
import sentencepiece
|
||||
|
||||
def load_model_and_tokenizer(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device, = get_conf('LOCAL_MODEL_DEVICE')
|
||||
if self._model is None:
|
||||
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
|
||||
if device=='cpu':
|
||||
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
|
||||
|
||||
model = model.eval()
|
||||
return model, tokenizer

    def llm_stream_generator(self, **kwargs):
        import torch
        import logging
        import copy
        import warnings
        import torch.nn as nn
        from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig

        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor():
            model = self._model
            tokenizer = self._tokenizer
            prompt = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            real_prompt = combine_history(prompt, history)
            return model, tokenizer, real_prompt, max_length, top_p, temperature

        model, tokenizer, prompt, max_length, top_p, temperature = adaptor()
        prefix_allowed_tokens_fn = None
        logits_processor = None
        stopping_criteria = None
        additional_eos_token_id = 103028
        generation_config = None
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 🏃♂️🏃♂️🏃♂️ https://github.com/InternLM/InternLM/blob/efbf5335709a8c8faeac6eaf07193973ff1d56a1/web_demo.py#L25

        inputs = tokenizer([prompt], padding=True, return_tensors="pt")
        input_length = len(inputs["input_ids"][0])
        for k, v in inputs.items():
            inputs[k] = v.cuda()
        input_ids = inputs["input_ids"]
        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
        if generation_config is None:
            generation_config = model.generation_config
        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)
        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        if additional_eos_token_id is not None:
            eos_token_id.append(additional_eos_token_id)
        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None:
            warnings.warn(
                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
                " recommend using `max_new_tokens` to control the maximum length of the generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
            if not has_default_max_length:
                logging.warn(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
                    UserWarning,
                )

        if input_ids_seq_length >= generation_config.max_length:
            input_ids_string = "input_ids"
            logging.warning(
                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_new_tokens`."
            )

        # 2. Set generation parameters if not already defined
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

        logits_processor = model._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            encoder_input_ids=input_ids,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
            logits_processor=logits_processor,
        )

        stopping_criteria = model._get_stopping_criteria(
            generation_config=generation_config, stopping_criteria=stopping_criteria
        )
        logits_warper = model._get_logits_warper(generation_config)

        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
        scores = None
        while True:
            model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token
            outputs = model(
                **model_inputs,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
            )

            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)

            # sample
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            if generation_config.do_sample:
                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                next_tokens = torch.argmax(probs, dim=-1)

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            model_kwargs = model._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=False
            )
            unfinished_sequences = unfinished_sequences.mul((min(next_tokens != i for i in eos_token_id)).long())

            output_token_ids = input_ids[0].cpu().tolist()
            output_token_ids = output_token_ids[input_length:]
            for each_eos_token_id in eos_token_id:
                if output_token_ids[-1] == each_eos_token_id:
                    output_token_ids = output_token_ids[:-1]
            response = tokenizer.decode(output_token_ids)

            yield response
            # stop when each sentence is finished, or if we exceed the maximum length
            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                return


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetInternlmHandle, model_name)
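A minimal sketch of driving the two exported entry points. The `llm_kwargs` keys mirror what `llm_stream_generator` reads and the `observe_window` convention comes from `local_llm_class.py` below; the concrete values are placeholders and it assumes the InternLM weights can actually be loaded on this machine:

```python
# Hypothetical usage, not part of the commit
import time
from request_llm.bridge_internlm import predict_no_ui_long_connection

llm_kwargs = {'max_length': 2048, 'top_p': 0.8, 'temperature': 0.95}
observe_window = ["", time.time()]   # [0] receives partial text, [1] is the watchdog timestamp
answer = predict_no_ui_long_connection("你好", llm_kwargs, history=[], sys_prompt="", observe_window=observe_window)
print(answer)
```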
@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -154,11 +154,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []

@@ -224,11 +224,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        yield from update_ui(chatbot=chatbot, history=history)

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 处理历史信息
    history_feedin = []
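These hunks all make the same change: the per-bridge hot-reload of `core_functional` is replaced by one shared call to `handle_core_functionality`. The body of that helper lives in `core_functional.py` and is not part of this diff; a hedged sketch consistent with the inline code it replaces could look like this:

```python
# Hypothetical sketch only -- the real handle_core_functionality is defined in core_functional.py,
# which is not shown in this commit.
def handle_core_functionality(additional_fn, inputs, history, chatbot):
    import importlib
    import core_functional
    importlib.reload(core_functional)    # 热更新prompt
    core_functional = core_functional.get_core_functions()
    if "PreProcess" in core_functional[additional_fn]:
        inputs = core_functional[additional_fn]["PreProcess"](inputs)
    inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
    return inputs, history
```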

@@ -89,9 +89,6 @@ class NewBingHandle(Process):
                if a not in self.local_history:
                    self.local_history.append(a)
                    prompt += a + '\n'
                # if b not in self.local_history:
                #     self.local_history.append(b)
                #     prompt += b + '\n'

            # 问题
            prompt += question
@@ -101,7 +98,7 @@ class NewBingHandle(Process):
            async for final, response in self.newbing_model.ask_stream(
                prompt=question,
                conversation_style=NEWBING_STYLE,     # ["creative", "balanced", "precise"]
                wss_link=endpoint,                    # "wss://sydney.bing.com/sydney/ChatHub"
                wss_link=endpoint,                    # "wss://sydney.bing.com/sydney/ChatHub"
            ):
                if not final:
                    print(response)
@@ -121,14 +118,26 @@ class NewBingHandle(Process):
            self.local_history = []
        if (self.newbing_model is None) or (not self.success):
            # 代理设置
            proxies, = get_conf('proxies')
            proxies, NEWBING_COOKIES = get_conf('proxies', 'NEWBING_COOKIES')
            if proxies is None:
                self.proxies_https = None
            else:
                self.proxies_https = proxies['https']

            if (NEWBING_COOKIES is not None) and len(NEWBING_COOKIES) > 100:
                try:
                    cookies = json.loads(NEWBING_COOKIES)
                except:
                    self.success = False
                    tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
                    self.child.send(f'[Local Message] NEWBING_COOKIES未填写或有格式错误。')
                    self.child.send('[Fail]'); self.child.send('[Finish]')
                    raise RuntimeError(f"NEWBING_COOKIES未填写或有格式错误。")
            else:
                cookies = None

            try:
                self.newbing_model = NewbingChatbot(proxy=self.proxies_https)
                self.newbing_model = NewbingChatbot(proxy=self.proxies_https, cookies=cookies)
            except:
                self.success = False
                tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
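For orientation, `NEWBING_COOKIES` is parsed with `json.loads` here and later consumed by `_ChatHub` (see the hunk further down) as a list of cookie objects read via `cookie['name']` and `cookie['value']`. A purely illustrative shape, with placeholder values, would be:

```python
# Illustration only -- the cookie names/values are placeholders, not real credentials.
import json

NEWBING_COOKIES = '''
[
    {"name": "_U",   "value": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"},
    {"name": "MUID", "value": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
]
'''
cookies = json.loads(NEWBING_COOKIES)
assert all('name' in c and 'value' in c for c in cookies)
```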
@@ -143,7 +152,7 @@ class NewBingHandle(Process):
            asyncio.run(self.async_run())
        except Exception:
            tb_str = '\n```\n' + trimmed_format_exc() + '\n```\n'
            self.child.send(f'[Local Message] Newbing失败 {tb_str}.')
            self.child.send(f'[Local Message] Newbing 请求失败,报错信息如下. 如果是与网络相关的问题,建议更换代理协议(推荐http)或代理节点 {tb_str}.')
            self.child.send('[Fail]')
            self.child.send('[Finish]')

@@ -151,18 +160,14 @@ class NewBingHandle(Process):
        """
        这个函数运行在主进程
        """
        self.threadLock.acquire()
        self.parent.send(kwargs)    # 发送请求到子进程
        self.threadLock.acquire()   # 获取线程锁
        self.parent.send(kwargs)    # 请求子进程
        while True:
            res = self.parent.recv()                            # 等待newbing回复的片段
            if res == '[Finish]':
                break               # 结束
            elif res == '[Fail]':
                self.success = False
                break
            else:
                yield res           # newbing回复的片段
        self.threadLock.release()
            res = self.parent.recv()                            # 等待newbing回复的片段
            if res == '[Finish]': break                         # 结束
            elif res == '[Fail]': self.success = False; break   # 失败
            else: yield res                                     # newbing回复的片段
        self.threadLock.release()   # 释放线程锁


    """
@@ -219,11 +224,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    history_feedin = []
    for i in range(len(history)//2):
68  request_llm/bridge_qwen.py  Normal file
@@ -0,0 +1,68 @@
model_name = "Qwen"
cmd_to_install = "`pip install -r request_llm/requirements_qwen.txt`"

from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Local Model
# ------------------------------------------------------------------------------------------------------------------------
@SingletonLocalLLM
class GetONNXGLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        import os, glob
        import os
        import platform
        from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

        model_id = 'qwen/Qwen-7B-Chat'
        revision = 'v1.0.1'
        self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
        # use fp16
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision, trust_remote_code=True, fp16=True).eval()
        model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # 可指定不同的生成长度、top_p等相关超参
        self._model = model

        return self._model, self._tokenizer

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history

        query, max_length, top_p, temperature, history = adaptor(kwargs)

        for response in self._model.chat(self._tokenizer, query, history=history, stream=True):
            yield response

    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise error if the user does not install requirement_*.txt
        # 🏃♂️🏃♂️🏃♂️ 主进程执行
        import importlib
        importlib.import_module('modelscope')


# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
49  request_llm/bridge_spark.py  Normal file
@@ -0,0 +1,49 @@

import time
import threading
import importlib
from toolbox import update_ui, get_conf
from multiprocessing import Process, Pipe

model_name = '星火认知大模型'

def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
    """
    ⭐多线程方法
    函数的说明请见 request_llm/bridge_all.py
    """
    watch_dog_patience = 5
    response = ""

    from .com_sparkapi import SparkRequestInstance
    sri = SparkRequestInstance()
    for response in sri.generate(inputs, llm_kwargs, history, sys_prompt):
        if len(observe_window) >= 1:
            observe_window[0] = response
        if len(observe_window) >= 2:
            if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
    return response

def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
    """
    ⭐单线程方法
    函数的说明请见 request_llm/bridge_all.py
    """
    chatbot.append((inputs, ""))

    if additional_fn is not None:
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    # 开始接收回复
    from .com_sparkapi import SparkRequestInstance
    sri = SparkRequestInstance()
    for response in sri.generate(inputs, llm_kwargs, history, system_prompt):
        chatbot[-1] = (inputs, response)
        yield from update_ui(chatbot=chatbot, history=history)

    # 总结输出
    if response == f"[Local Message]: 等待{model_name}响应中 ...":
        response = f"[Local Message]: {model_name}响应异常 ..."
    history.extend([inputs, response])
    yield from update_ui(chatbot=chatbot, history=history)
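A minimal sketch of driving the multi-threaded Spark entry point. The `observe_window` convention ([0] latest text, [1] watchdog timestamp) follows the code above, and `com_sparkapi` only reads `temperature` from `llm_kwargs`; everything else here is a placeholder and assumes `XFYUN_APPID` / `XFYUN_API_SECRET` / `XFYUN_API_KEY` are configured:

```python
# Hypothetical usage, not part of the commit
import time
from request_llm.bridge_spark import predict_no_ui_long_connection

llm_kwargs = {'temperature': 0.5}
observe_window = ["", time.time()]   # the caller refreshes [1] to keep the watchdog alive
reply = predict_no_ui_long_connection("用一句话介绍讯飞星火", llm_kwargs, history=[], sys_prompt="", observe_window=observe_window)
print(reply)
```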
@@ -1,4 +1,4 @@
from .bridge_newbing import preprocess_newbing_out, preprocess_newbing_out_simple
from .bridge_newbingfree import preprocess_newbing_out, preprocess_newbing_out_simple
from multiprocessing import Process, Pipe
from toolbox import update_ui, get_conf, trimmed_format_exc
import threading
@@ -248,14 +248,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
        return

    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]:
            inputs = core_functional[additional_fn]["PreProcess"](
                inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + \
            inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    history_feedin = []
    for i in range(len(history)//2):

@@ -96,11 +96,8 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
    additional_fn代表点击的哪个按钮,按钮见functional.py
    """
    if additional_fn is not None:
        import core_functional
        importlib.reload(core_functional)    # 热更新prompt
        core_functional = core_functional.get_core_functions()
        if "PreProcess" in core_functional[additional_fn]: inputs = core_functional[additional_fn]["PreProcess"](inputs)  # 获取预处理函数(如果有的话)
        inputs = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"]
        from core_functional import handle_core_functionality
        inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

    raw_input = "What I would like to say is the following: " + inputs
    history.extend([inputs, ""])
229  request_llm/chatglmoonx.py  Normal file
@@ -0,0 +1,229 @@



# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/model.py
# ------------------------------------------------------------------------------------------------------------------------
import re
import numpy as np
# import torch
from onnxruntime import InferenceSession, SessionOptions


# Currently `MatMulInteger` and `DynamicQuantizeLinear` are only supported on CPU,
# although they are documented as supported on CUDA.
providers = ["CPUExecutionProvider"]

# if torch.cuda.is_available():
#     providers = ["CUDAExecutionProvider"] + providers


# Default paths
tokenizer_path = "chatglm-6b-int8-onnx-merged/sentencepiece.model"
onnx_model_path = "chatglm-6b-int8-onnx-merged/chatglm-6b-int8.onnx"


# input & output names
past_names = [f"past_{name}_{i}" for i in range(28) for name in ["key", "value"]]
present_names = [f"present_{name}_{i}" for i in range(28) for name in ["key", "value"]]
output_names = ["logits"] + present_names


# default kv_cache for first inference
default_past_key_values = {
    k: np.zeros((1, 0, 32, 128), dtype=np.float32) for k in past_names
}


def chat_template(history: list[tuple[str, str]], current: str):
    prompt = ""
    chat_round = 0
    for question, answer in history:
        prompt += f"[Round {chat_round}]\n问:{question}\n答:{answer}\n"
        chat_round += 1
    prompt += f"[Round {chat_round}]\n问:{current}\n答:"
    return prompt
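For reference, the prompt this template yields for a one-turn history (values are made-up, assuming `chat_template` above is in scope):

```python
print(chat_template([("你好", "你好,有什么可以帮你?")], "介绍一下ONNX"))
# [Round 0]
# 问:你好
# 答:你好,有什么可以帮你?
# [Round 1]
# 问:介绍一下ONNX
# 答:
```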


def process_response(response: str):
    response = response.strip()
    response = response.replace("[[训练时间]]", "2023年")
    punkts = [
        [",", ","],
        ["!", "!"],
        [":", ":"],
        [";", ";"],
        ["\?", "?"],
    ]
    for item in punkts:
        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
    return response


class ChatGLMModel():

    def __init__(self, onnx_model_path=onnx_model_path, tokenizer_path=tokenizer_path, profile=False) -> None:
        self.tokenizer = ChatGLMTokenizer(tokenizer_path)
        options = SessionOptions()
        options.enable_profiling = profile
        self.session = InferenceSession(onnx_model_path, options, providers=providers)
        self.eop_token_id = self.tokenizer["<eop>"]

    def prepare_input(self, prompt: str):
        input_ids, prefix_mask = self.tokenizer.encode(prompt)

        input_ids = np.array([input_ids], dtype=np.longlong)
        prefix_mask = np.array([prefix_mask], dtype=np.longlong)

        return input_ids, prefix_mask, default_past_key_values

    def sample_next_token(self, logits: np.ndarray, top_k=50, top_p=0.7, temperature=1):
        # softmax with temperature
        exp_logits = np.exp(logits / temperature)
        probs = exp_logits / np.sum(exp_logits)

        # top k
        top_k_idx = np.argsort(-probs)[:top_k]
        top_k_probs = probs[top_k_idx]

        # top p
        cumsum_probs = np.cumsum(top_k_probs)
        top_k_probs[(cumsum_probs - top_k_probs) > top_p] = 0.0
        top_k_probs = top_k_probs / np.sum(top_k_probs)

        # sample
        next_token = np.random.choice(top_k_idx, size=1, p=top_k_probs)
        return next_token[0].item()
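The sampling step above is plain numpy, so it can be checked standalone; a self-contained sketch of the same top-k / top-p filtering on toy logits (no model or tokenizer required):

```python
import numpy as np

logits = np.array([2.0, 1.0, 0.5, 0.1, -1.0])
temperature, top_k, top_p = 1.0, 3, 0.7

probs = np.exp(logits / temperature)
probs /= probs.sum()

top_k_idx = np.argsort(-probs)[:top_k]            # indices of the 3 most likely tokens
top_k_probs = probs[top_k_idx]

cumsum = np.cumsum(top_k_probs)
top_k_probs[(cumsum - top_k_probs) > top_p] = 0.0  # drop tokens once the preceding mass exceeds top_p
top_k_probs /= top_k_probs.sum()

next_token = np.random.choice(top_k_idx, p=top_k_probs)
print(int(next_token))
```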

    def generate_iterate(self, prompt: str, max_generated_tokens=100, top_k=50, top_p=0.7, temperature=1):
        input_ids, prefix_mask, past_key_values = self.prepare_input(prompt)
        output_tokens = []

        while True:
            inputs = {
                "input_ids": input_ids,
                "prefix_mask": prefix_mask,
                "use_past": np.array(len(output_tokens) > 0),
            }
            inputs.update(past_key_values)

            logits, *past_key_values = self.session.run(output_names, inputs)
            past_key_values = { k: v for k, v in zip(past_names, past_key_values) }

            next_token = self.sample_next_token(logits[0, -1], top_k=top_k, top_p=top_p, temperature=temperature)

            output_tokens += [next_token]

            if next_token == self.eop_token_id or len(output_tokens) > max_generated_tokens:
                break

            input_ids = np.array([[next_token]], dtype=np.longlong)
            prefix_mask = np.concatenate([prefix_mask, np.array([[0]], dtype=np.longlong)], axis=1)

            yield process_response(self.tokenizer.decode(output_tokens))

        return process_response(self.tokenizer.decode(output_tokens))




# ------------------------------------------------------------------------------------------------------------------------
# 🔌💻 Source Code From https://huggingface.co/K024/ChatGLM-6b-onnx-u8s8/blob/main/tokenizer.py
# ------------------------------------------------------------------------------------------------------------------------

import re
from sentencepiece import SentencePieceProcessor


def replace_spaces_with_blank(match: re.Match[str]):
    return f"<|blank_{len(match.group())}|>"


def replace_blank_with_spaces(match: re.Match[str]):
    return " " * int(match.group(1))


class ChatGLMTokenizer:
    def __init__(self, vocab_file):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.text_tokenizer = SentencePieceProcessor(str(vocab_file))

    def __len__(self):
        return len(self.text_tokenizer)

    def __getitem__(self, key: str):
        return self.text_tokenizer[key]

    def preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = text.replace("\t", "<|tab|>")
            text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
        return text
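The `decode` method below simply reverses these substitutions; a quick standalone check of what `preprocess` does to newlines, tabs and space runs:

```python
import re

def replace_spaces_with_blank(match):
    return f"<|blank_{len(match.group())}|>"

text = "def f():\n\treturn    1"
text = text.replace("\n", "<n>").replace("\t", "<|tab|>")
text = re.sub(r" {2,80}", replace_spaces_with_blank, text)
print(text)   # def f():<n><|tab|>return<|blank_4|>1
```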

    def encode(
        self, text: str, text_pair: str = None,
        linebreak=True, whitespaces=True,
        add_dummy_prefix=True, special_tokens=True,
    ) -> tuple[list[int], list[int]]:
        """
        text: Text to encode. Bidirectional part with a [gMASK] and an <sop> for causal LM.
        text_pair: causal LM part.
        linebreak: Whether to encode newline (\n) in text.
        whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self.preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text

        tokens = self.text_tokenizer.encode(text)
        prefix_mask = [1] * len(tokens)
        if special_tokens:
            tokens += [self.text_tokenizer["[gMASK]"], self.text_tokenizer["<sop>"]]
            prefix_mask += [1, 0]

        if text_pair is not None:
            text_pair = self.preprocess(text_pair, linebreak, whitespaces)
            pair_tokens = self.text_tokenizer.encode(text_pair)
            tokens += pair_tokens
            prefix_mask += [0] * len(pair_tokens)
            if special_tokens:
                tokens += [self.text_tokenizer["<eop>"]]
                prefix_mask += [0]

        return (tokens if add_dummy_prefix else tokens[2:]), prefix_mask

    def decode(self, text_ids: list[int]) -> str:
        text = self.text_tokenizer.decode(text_ids)
        text = text.replace("<n>", "\n")
        text = text.replace("<|tab|>", "\t")
        text = re.sub(r"<\|blank_(\d\d?)\|>", replace_blank_with_spaces, text)
        return text
184  request_llm/com_sparkapi.py  Normal file
@@ -0,0 +1,184 @@
from toolbox import get_conf
import base64
import datetime
import hashlib
import hmac
import json
from urllib.parse import urlparse
import ssl
from datetime import datetime
from time import mktime
from urllib.parse import urlencode
from wsgiref.handlers import format_date_time
import websocket
import threading, time

timeout_bot_msg = '[Local Message] Request timeout. Network error.'

class Ws_Param(object):
    # 初始化
    def __init__(self, APPID, APIKey, APISecret, gpt_url):
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.host = urlparse(gpt_url).netloc
        self.path = urlparse(gpt_url).path
        self.gpt_url = gpt_url

    # 生成url
    def create_url(self):
        # 生成RFC1123格式的时间戳
        now = datetime.now()
        date = format_date_time(mktime(now.timetuple()))

        # 拼接字符串
        signature_origin = "host: " + self.host + "\n"
        signature_origin += "date: " + date + "\n"
        signature_origin += "GET " + self.path + " HTTP/1.1"

        # 进行hmac-sha256进行加密
        signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'), digestmod=hashlib.sha256).digest()
        signature_sha_base64 = base64.b64encode(signature_sha).decode(encoding='utf-8')
        authorization_origin = f'api_key="{self.APIKey}", algorithm="hmac-sha256", headers="host date request-line", signature="{signature_sha_base64}"'
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # 将请求的鉴权参数组合为字典
        v = {
            "authorization": authorization,
            "date": date,
            "host": self.host
        }
        # 拼接鉴权参数,生成url
        url = self.gpt_url + '?' + urlencode(v)
        # 此处打印出建立连接时候的url,参考本demo的时候可取消上方打印的注释,比对相同参数时生成的url与自己代码生成的url是否一致
        return url



class SparkRequestInstance():
    def __init__(self):
        XFYUN_APPID, XFYUN_API_SECRET, XFYUN_API_KEY = get_conf('XFYUN_APPID', 'XFYUN_API_SECRET', 'XFYUN_API_KEY')

        self.appid = XFYUN_APPID
        self.api_secret = XFYUN_API_SECRET
        self.api_key = XFYUN_API_KEY
        self.gpt_url = "ws://spark-api.xf-yun.com/v1.1/chat"
        self.time_to_yield_event = threading.Event()
        self.time_to_exit_event = threading.Event()

        self.result_buf = ""

    def generate(self, inputs, llm_kwargs, history, system_prompt):
        llm_kwargs = llm_kwargs
        history = history
        system_prompt = system_prompt
        import _thread as thread
        thread.start_new_thread(self.create_blocking_request, (inputs, llm_kwargs, history, system_prompt))
        while True:
            self.time_to_yield_event.wait(timeout=1)
            if self.time_to_yield_event.is_set():
                yield self.result_buf
            if self.time_to_exit_event.is_set():
                return self.result_buf


    def create_blocking_request(self, inputs, llm_kwargs, history, system_prompt):
        wsParam = Ws_Param(self.appid, self.api_key, self.api_secret, self.gpt_url)
        websocket.enableTrace(False)
        wsUrl = wsParam.create_url()

        # 收到websocket连接建立的处理
        def on_open(ws):
            import _thread as thread
            thread.start_new_thread(run, (ws,))

        def run(ws, *args):
            data = json.dumps(gen_params(ws.appid, *ws.all_args))
            ws.send(data)

        # 收到websocket消息的处理
        def on_message(ws, message):
            data = json.loads(message)
            code = data['header']['code']
            if code != 0:
                print(f'请求错误: {code}, {data}')
                ws.close()
                self.time_to_exit_event.set()
            else:
                choices = data["payload"]["choices"]
                status = choices["status"]
                content = choices["text"][0]["content"]
                ws.content += content
                self.result_buf += content
                if status == 2:
                    ws.close()
                    self.time_to_exit_event.set()
            self.time_to_yield_event.set()

        # 收到websocket错误的处理
        def on_error(ws, error):
            print("error:", error)
            self.time_to_exit_event.set()

        # 收到websocket关闭的处理
        def on_close(ws, *args):
            self.time_to_exit_event.set()

        # websocket
        ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close, on_open=on_open)
        ws.appid = self.appid
        ws.content = ""
        ws.all_args = (inputs, llm_kwargs, history, system_prompt)
        ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

def generate_message_payload(inputs, llm_kwargs, history, system_prompt):
    conversation_cnt = len(history) // 2
    messages = [{"role": "system", "content": system_prompt}]
    if conversation_cnt:
        for index in range(0, 2*conversation_cnt, 2):
            what_i_have_asked = {}
            what_i_have_asked["role"] = "user"
            what_i_have_asked["content"] = history[index]
            what_gpt_answer = {}
            what_gpt_answer["role"] = "assistant"
            what_gpt_answer["content"] = history[index+1]
            if what_i_have_asked["content"] != "":
                if what_gpt_answer["content"] == "": continue
                if what_gpt_answer["content"] == timeout_bot_msg: continue
                messages.append(what_i_have_asked)
                messages.append(what_gpt_answer)
            else:
                messages[-1]['content'] = what_gpt_answer['content']
    what_i_ask_now = {}
    what_i_ask_now["role"] = "user"
    what_i_ask_now["content"] = inputs
    messages.append(what_i_ask_now)
    return messages


def gen_params(appid, inputs, llm_kwargs, history, system_prompt):
    """
    通过appid和用户的提问来生成请求参数
    """
    data = {
        "header": {
            "app_id": appid,
            "uid": "1234"
        },
        "parameter": {
            "chat": {
                "domain": "general",
                "temperature": llm_kwargs["temperature"],
                "random_threshold": 0.5,
                "max_tokens": 4096,
                "auditing": "default"
            }
        },
        "payload": {
            "message": {
                "text": generate_message_payload(inputs, llm_kwargs, history, system_prompt)
            }
        }
    }
    return data
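For orientation, the request body this builds for a first-turn query (placeholder appid, temperature 0.5), assuming `gen_params` and `generate_message_payload` above are in scope:

```python
import json

payload = gen_params("my_appid", "你好", {"temperature": 0.5}, history=[], system_prompt="You are a helpful assistant.")
print(json.dumps(payload, ensure_ascii=False, indent=2))
# -> header.app_id = "my_appid", parameter.chat.domain = "general",
#    payload.message.text = [{"role": "system", ...}, {"role": "user", "content": "你好"}]
```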

@@ -447,6 +447,15 @@ class _ChatHub:
        """
        Ask a question to the bot
        """
        req_header = HEADERS
        if self.cookies is not None:
            ws_cookies = []
            for cookie in self.cookies:
                ws_cookies.append(f"{cookie['name']}={cookie['value']}")
            req_header.update({
                'Cookie': ';'.join(ws_cookies),
            })

        timeout = aiohttp.ClientTimeout(total=30)
        self.session = aiohttp.ClientSession(timeout=timeout)

@@ -455,7 +464,7 @@
            # Check if websocket is closed
            self.wss = await self.session.ws_connect(
                wss_link,
                headers=HEADERS,
                headers=req_header,
                ssl=ssl_context,
                proxy=self.proxy,
                autoping=False,
@@ -510,7 +519,11 @@
        resp_txt_no_link = ""
        while not final:
            msg = await self.wss.receive()
            objects = msg.data.split(DELIMITER)
            try:
                objects = msg.data.split(DELIMITER)
            except :
                continue

            for obj in objects:
                if obj is None or not obj:
                    continue
@@ -1109,4 +1122,4 @@ class ImageQuery(Query):


if __name__ == "__main__":
    main()
    main()
180  request_llm/local_llm_class.py  Normal file
@@ -0,0 +1,180 @@
from transformers import AutoModel, AutoTokenizer
import time
import threading
import importlib
from toolbox import update_ui, get_conf, Singleton
from multiprocessing import Process, Pipe

def SingletonLocalLLM(cls):
    """
    一个单实例装饰器
    """
    _instance = {}
    def _singleton(*args, **kargs):
        if cls not in _instance:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        elif _instance[cls].corrupted:
            _instance[cls] = cls(*args, **kargs)
            return _instance[cls]
        else:
            return _instance[cls]
    return _singleton
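A small self-contained check of the decorator's behaviour (the `Dummy` class is made up for illustration; it only needs the `corrupted` attribute the decorator inspects):

```python
@SingletonLocalLLM
class Dummy:
    def __init__(self):
        self.corrupted = False

a, b = Dummy(), Dummy()
assert a is b                  # the same instance is reused
a.corrupted = True
c = Dummy()                    # a corrupted instance is rebuilt on the next call
assert c is not a
```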

class LocalLLMHandle(Process):
    def __init__(self):
        # ⭐主进程执行
        super().__init__(daemon=True)
        self.corrupted = False
        self.load_model_info()
        self.parent, self.child = Pipe()
        self.running = True
        self._model = None
        self._tokenizer = None
        self.info = ""
        self.check_dependency()
        self.start()
        self.threadLock = threading.Lock()

    def load_model_info(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")
        self.model_name = ""
        self.cmd_to_install = ""

    def load_model_and_tokenizer(self):
        """
        This function should return the model and the tokenizer
        """
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def llm_stream_generator(self, **kwargs):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        raise NotImplementedError("Method not implemented yet")

    def try_to_import_special_deps(self, **kwargs):
        """
        import something that will raise error if the user does not install requirement_*.txt
        """
        # ⭐主进程执行
        raise NotImplementedError("Method not implemented yet")

    def check_dependency(self):
        # ⭐主进程执行
        try:
            self.try_to_import_special_deps()
            self.info = "依赖检测通过"
            self.running = True
        except:
            self.info = f"缺少{self.model_name}的依赖,如果要使用{self.model_name},除了基础的pip依赖以外,您还需要运行{self.cmd_to_install}安装{self.model_name}的依赖。"
            self.running = False

    def run(self):
        # 🏃♂️🏃♂️🏃♂️ 子进程执行
        # 第一次运行,加载参数
        try:
            self._model, self._tokenizer = self.load_model_and_tokenizer()
        except:
            self.running = False
            from toolbox import trimmed_format_exc
            self.child.send(f'[Local Message] 不能正常加载{self.model_name}的参数.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
            self.child.send('[FinishBad]')
            raise RuntimeError(f"不能正常加载{self.model_name}的参数!")

        while True:
            # 进入任务等待状态
            kwargs = self.child.recv()
            # 收到消息,开始请求
            try:
                for response_full in self.llm_stream_generator(**kwargs):
                    self.child.send(response_full)
                self.child.send('[Finish]')
                # 请求处理结束,开始下一个循环
            except:
                from toolbox import trimmed_format_exc
                self.child.send(f'[Local Message] 调用{self.model_name}失败.' + '\n```\n' + trimmed_format_exc() + '\n```\n')
                self.child.send('[Finish]')

    def stream_chat(self, **kwargs):
        # ⭐主进程执行
        self.threadLock.acquire()
        self.parent.send(kwargs)
        while True:
            res = self.parent.recv()
            if res == '[Finish]':
                break
            if res == '[FinishBad]':
                self.running = False
                self.corrupted = True
                break
            else:
                yield res
        self.threadLock.release()


def get_local_llm_predict_fns(LLMSingletonClass, model_name):
    load_message = f"{model_name}尚未加载,加载需要一段时间。注意,取决于`config.py`的配置,{model_name}消耗大量的内存(CPU)或显存(GPU),也许会导致低配计算机卡死 ……"

    def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
        """
        ⭐多线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        _llm_handle = LLMSingletonClass()
        if len(observe_window) >= 1: observe_window[0] = load_message + "\n\n" + _llm_handle.info
        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)

        # chatglm 没有 sys_prompt 接口,因此把prompt加入 history
        history_feedin = []
        history_feedin.append(["What can I do?", sys_prompt])
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        watch_dog_patience = 5  # 看门狗 (watchdog) 的耐心, 设置5秒即可
        response = ""
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            if len(observe_window) >= 1:
                observe_window[0] = response
            if len(observe_window) >= 2:
                if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
        return response


    def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
        """
        ⭐单线程方法
        函数的说明请见 request_llm/bridge_all.py
        """
        chatbot.append((inputs, ""))

        _llm_handle = LLMSingletonClass()
        chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.info)
        yield from update_ui(chatbot=chatbot, history=[])
        if not _llm_handle.running: raise RuntimeError(_llm_handle.info)

        if additional_fn is not None:
            from core_functional import handle_core_functionality
            inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)

        # 处理历史信息
        history_feedin = []
        history_feedin.append(["What can I do?", system_prompt] )
        for i in range(len(history)//2):
            history_feedin.append([history[2*i], history[2*i+1]] )

        # 开始接收回复
        response = f"[Local Message]: 等待{model_name}响应中 ..."
        for response in _llm_handle.stream_chat(query=inputs, history=history_feedin, max_length=llm_kwargs['max_length'], top_p=llm_kwargs['top_p'], temperature=llm_kwargs['temperature']):
            chatbot[-1] = (inputs, response)
            yield from update_ui(chatbot=chatbot, history=history)

        # 总结输出
        if response == f"[Local Message]: 等待{model_name}响应中 ...":
            response = f"[Local Message]: {model_name}响应异常 ..."
        history.extend([inputs, response])
        yield from update_ui(chatbot=chatbot, history=history)

    return predict_no_ui_long_connection, predict
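This factory is what the new bridges (`bridge_internlm.py`, `bridge_qwen.py`) build on. A hedged skeleton of the minimal subclass a new local-model bridge would need; `MyModel` and its loading code are placeholders, not part of the commit:

```python
# Illustrative skeleton only
@SingletonLocalLLM
class GetMyModelHandle(LocalLLMHandle):
    def load_model_info(self):
        self.model_name = "MyModel"
        self.cmd_to_install = "`pip install -r request_llm/requirements_mymodel.txt`"

    def try_to_import_special_deps(self, **kwargs):
        import some_required_package   # raises if the extra requirements are missing

    def load_model_and_tokenizer(self):
        model, tokenizer = ..., ...    # load weights and tokenizer here
        return model, tokenizer

    def llm_stream_generator(self, **kwargs):
        query, history = kwargs['query'], kwargs['history']
        yield "partial response"       # stream chunks back to the main process

predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetMyModelHandle, "MyModel")
```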
@@ -1,5 +1,5 @@
protobuf
transformers==4.27.1
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
11  request_llm/requirements_chatglm_onnx.txt  Normal file
@@ -0,0 +1,11 @@
protobuf
transformers>=4.27.1
cpm_kernels
torch>=1.10
mdtex2html
sentencepiece
numpy
onnxruntime
sentencepiece
streamlit
streamlit-chat
2  request_llm/requirements_qwen.txt  Normal file
@@ -0,0 +1,2 @@
modelscope
transformers_stream_generator