up
This commit is contained in:
@@ -19,6 +19,12 @@ from .bridge_chatgpt import predict as chatgpt_ui
|
||||
from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
|
||||
from .bridge_chatglm import predict as chatglm_ui
|
||||
|
||||
from .bridge_chatglm import predict_no_ui_long_connection as chatglm_noui
|
||||
from .bridge_chatglm import predict as chatglm_ui
|
||||
|
||||
from .bridge_qianfan import predict_no_ui_long_connection as qianfan_noui
|
||||
from .bridge_qianfan import predict as qianfan_ui
|
||||
|
||||
colors = ['#FF00FF', '#00FFFF', '#FF0000', '#990099', '#009999', '#990044']
|
||||
|
||||
class LazyloadTiktoken(object):
|
||||
@@ -165,7 +171,14 @@ model_info = {
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
|
||||
"qianfan": {
|
||||
"fn_with_ui": qianfan_ui,
|
||||
"fn_without_ui": qianfan_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 2000,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
},
|
||||
}
|
||||
|
||||
# -=-=-=-=-=-=- 以下部分是新加入的模型,可能附带额外依赖 -=-=-=-=-=-=-
|
||||
@@ -361,7 +374,7 @@ if "chatgpt_website" in AVAIL_LLM_MODELS: # 接入一些逆向工程https://gi
|
||||
"chatgpt_website": {
|
||||
"fn_with_ui": chatgpt_website_ui,
|
||||
"fn_without_ui": chatgpt_website_noui,
|
||||
"endpoint": None,
|
||||
"endpoint": openai_endpoint,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
@@ -385,6 +398,22 @@ if "spark" in AVAIL_LLM_MODELS: # 讯飞星火认知大模型
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
if "llama2" in AVAIL_LLM_MODELS: # llama2
|
||||
try:
|
||||
from .bridge_llama2 import predict_no_ui_long_connection as llama2_noui
|
||||
from .bridge_llama2 import predict as llama2_ui
|
||||
model_info.update({
|
||||
"llama2": {
|
||||
"fn_with_ui": llama2_ui,
|
||||
"fn_without_ui": llama2_noui,
|
||||
"endpoint": None,
|
||||
"max_token": 4096,
|
||||
"tokenizer": tokenizer_gpt35,
|
||||
"token_cnt": get_token_num_gpt35,
|
||||
}
|
||||
})
|
||||
except:
|
||||
print(trimmed_format_exc())
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -177,14 +177,13 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="非Openai官方接口返回了错误:" + chunk.decode()) # 刷新界面
|
||||
return
|
||||
|
||||
# print(chunk.decode()[6:])
|
||||
if is_head_of_the_stream and (r'"object":"error"' not in chunk.decode()):
|
||||
chunk_decoded = chunk.decode()
|
||||
if is_head_of_the_stream and (r'"object":"error"' not in chunk_decoded) and (r"choices" not in chunk_decoded):
|
||||
# 数据流的第一帧不携带content
|
||||
is_head_of_the_stream = False; continue
|
||||
|
||||
if chunk:
|
||||
try:
|
||||
chunk_decoded = chunk.decode()
|
||||
# 前者是API2D的结束条件,后者是OPENAI的结束条件
|
||||
if ('data: [DONE]' in chunk_decoded) or (len(json.loads(chunk_decoded[6:])['choices'][0]["delta"]) == 0):
|
||||
# 判定为数据流的结束,gpt_replying_buffer也写完了
|
||||
@@ -192,7 +191,7 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
break
|
||||
# 处理数据流的主体
|
||||
chunkjson = json.loads(chunk_decoded[6:])
|
||||
status_text = f"finish_reason: {chunkjson['choices'][0]['finish_reason']}"
|
||||
status_text = f"finish_reason: {chunkjson['choices'][0].get('finish_reason', 'null')}"
|
||||
# 如果这里抛出异常,一般是文本过长,详情见get_full_error的输出
|
||||
gpt_replying_buffer = gpt_replying_buffer + json.loads(chunk_decoded[6:])['choices'][0]["delta"]["content"]
|
||||
history[-1] = gpt_replying_buffer
|
||||
@@ -216,7 +215,6 @@ def handle_error(inputs, llm_kwargs, chatbot, history, chunk_decoded, error_msg)
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
# history = [] # 清除历史
|
||||
elif "does not exist" in error_msg:
|
||||
chatbot[-1] = (chatbot[-1][0], f"[Local Message] Model {llm_kwargs['llm_model']} does not exist. 模型不存在, 或者您没有获得体验资格.")
|
||||
elif "Incorrect API key" in error_msg:
|
||||
|
||||
@@ -118,16 +118,6 @@ def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_promp
|
||||
chatbot 为WebUI中显示的对话列表,修改它,然后yeild出去,可以直接修改对话界面内容
|
||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||
"""
|
||||
if is_any_api_key(inputs):
|
||||
chatbot._cookies['api_key'] = inputs
|
||||
chatbot.append(("输入已识别为openai的api_key", what_keys(inputs)))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="api_key已导入") # 刷新界面
|
||||
return
|
||||
elif not is_any_api_key(chatbot._cookies['api_key']):
|
||||
chatbot.append((inputs, "缺少api_key。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。"))
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="缺少api_key") # 刷新界面
|
||||
return
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
@@ -245,14 +235,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
|
||||
if not is_any_api_key(llm_kwargs['api_key']):
|
||||
raise AssertionError("你提供了错误的API_KEY。\n\n1. 临时解决方案:直接在输入区键入api_key,然后回车提交。\n\n2. 长效解决方案:在config.py中配置。")
|
||||
|
||||
api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
if API_ORG.startswith('org-'): headers.update({"OpenAI-Organization": API_ORG})
|
||||
if llm_kwargs['llm_model'].startswith('azure-'): headers.update({"api-key": api_key})
|
||||
|
||||
conversation_cnt = len(history) // 2
|
||||
|
||||
|
||||
91
request_llm/bridge_llama2.py
Normal file
91
request_llm/bridge_llama2.py
Normal file
@@ -0,0 +1,91 @@
|
||||
model_name = "LLaMA"
|
||||
cmd_to_install = "`pip install -r request_llm/requirements_chatglm.txt`"
|
||||
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
||||
from toolbox import update_ui, get_conf, ProxyNetworkActivate
|
||||
from multiprocessing import Process, Pipe
|
||||
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
|
||||
from threading import Thread
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 Local Model
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
@SingletonLocalLLM
|
||||
class GetONNXGLMHandle(LocalLLMHandle):
|
||||
|
||||
def load_model_info(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
self.model_name = model_name
|
||||
self.cmd_to_install = cmd_to_install
|
||||
|
||||
def load_model_and_tokenizer(self):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
import os, glob
|
||||
import os
|
||||
import platform
|
||||
huggingface_token, device = get_conf('HUGGINGFACE_ACCESS_TOKEN', 'LOCAL_MODEL_DEVICE')
|
||||
assert len(huggingface_token) != 0, "没有填写 HUGGINGFACE_ACCESS_TOKEN"
|
||||
with open(os.path.expanduser('~/.cache/huggingface/token'), 'w') as f:
|
||||
f.write(huggingface_token)
|
||||
model_id = 'meta-llama/Llama-2-7b-chat-hf'
|
||||
with ProxyNetworkActivate():
|
||||
self._tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=huggingface_token)
|
||||
# use fp16
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=huggingface_token).eval()
|
||||
if device.startswith('cuda'): model = model.half().to(device)
|
||||
self._model = model
|
||||
|
||||
return self._model, self._tokenizer
|
||||
|
||||
def llm_stream_generator(self, **kwargs):
|
||||
# 🏃♂️🏃♂️🏃♂️ 子进程执行
|
||||
def adaptor(kwargs):
|
||||
query = kwargs['query']
|
||||
max_length = kwargs['max_length']
|
||||
top_p = kwargs['top_p']
|
||||
temperature = kwargs['temperature']
|
||||
history = kwargs['history']
|
||||
console_slience = kwargs.get('console_slience', True)
|
||||
return query, max_length, top_p, temperature, history, console_slience
|
||||
|
||||
def convert_messages_to_prompt(query, history):
|
||||
prompt = ""
|
||||
for a, b in history:
|
||||
prompt += f"\n[INST]{a}[/INST]"
|
||||
prompt += "\n{b}" + b
|
||||
prompt += f"\n[INST]{query}[/INST]"
|
||||
return prompt
|
||||
|
||||
query, max_length, top_p, temperature, history, console_slience = adaptor(kwargs)
|
||||
prompt = convert_messages_to_prompt(query, history)
|
||||
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-=-
|
||||
# code from transformers.llama
|
||||
streamer = TextIteratorStreamer(self._tokenizer)
|
||||
# Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
|
||||
inputs = self._tokenizer([prompt], return_tensors="pt")
|
||||
prompt_tk_back = self._tokenizer.batch_decode(inputs['input_ids'])[0]
|
||||
|
||||
generation_kwargs = dict(inputs.to(self._model.device), streamer=streamer, max_new_tokens=max_length)
|
||||
thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
|
||||
thread.start()
|
||||
generated_text = ""
|
||||
for new_text in streamer:
|
||||
generated_text += new_text
|
||||
if not console_slience: print(new_text, end='')
|
||||
yield generated_text.lstrip(prompt_tk_back).rstrip("</s>")
|
||||
if not console_slience: print()
|
||||
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-=-
|
||||
|
||||
def try_to_import_special_deps(self, **kwargs):
|
||||
# import something that will raise error if the user does not install requirement_*.txt
|
||||
# 🏃♂️🏃♂️🏃♂️ 主进程执行
|
||||
import importlib
|
||||
importlib.import_module('transformers')
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
# 🔌💻 GPT-Academic Interface
|
||||
# ------------------------------------------------------------------------------------------------------------------------
|
||||
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetONNXGLMHandle, model_name)
|
||||
164
request_llm/bridge_qianfan.py
Normal file
164
request_llm/bridge_qianfan.py
Normal file
@@ -0,0 +1,164 @@
|
||||
|
||||
import time, requests, json
|
||||
from multiprocessing import Process, Pipe
|
||||
from functools import wraps
|
||||
from datetime import datetime, timedelta
|
||||
from toolbox import get_conf, update_ui, is_any_api_key, select_api_key, what_keys, clip_history, trimmed_format_exc, get_conf
|
||||
|
||||
model_name = '千帆大模型平台'
|
||||
timeout_bot_msg = '[Local Message] Request timeout. Network error.'
|
||||
|
||||
def cache_decorator(timeout):
|
||||
cache = {}
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
key = (func.__name__, args, frozenset(kwargs.items()))
|
||||
# Check if result is already cached and not expired
|
||||
if key in cache:
|
||||
result, timestamp = cache[key]
|
||||
if datetime.now() - timestamp < timedelta(seconds=timeout):
|
||||
return result
|
||||
|
||||
# Call the function and cache the result
|
||||
result = func(*args, **kwargs)
|
||||
cache[key] = (result, datetime.now())
|
||||
return result
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
@cache_decorator(timeout=3600)
|
||||
def get_access_token():
|
||||
"""
|
||||
使用 AK,SK 生成鉴权签名(Access Token)
|
||||
:return: access_token,或是None(如果错误)
|
||||
"""
|
||||
# if (access_token_cache is None) or (time.time() - last_access_token_obtain_time > 3600):
|
||||
BAIDU_CLOUD_API_KEY, BAIDU_CLOUD_SECRET_KEY = get_conf('BAIDU_CLOUD_API_KEY', 'BAIDU_CLOUD_SECRET_KEY')
|
||||
|
||||
if len(BAIDU_CLOUD_SECRET_KEY) == 0: raise RuntimeError("没有配置BAIDU_CLOUD_SECRET_KEY")
|
||||
if len(BAIDU_CLOUD_API_KEY) == 0: raise RuntimeError("没有配置BAIDU_CLOUD_API_KEY")
|
||||
|
||||
url = "https://aip.baidubce.com/oauth/2.0/token"
|
||||
params = {"grant_type": "client_credentials", "client_id": BAIDU_CLOUD_API_KEY, "client_secret": BAIDU_CLOUD_SECRET_KEY}
|
||||
access_token_cache = str(requests.post(url, params=params).json().get("access_token"))
|
||||
return access_token_cache
|
||||
# else:
|
||||
# return access_token_cache
|
||||
|
||||
|
||||
def generate_message_payload(inputs, llm_kwargs, history, system_prompt):
|
||||
conversation_cnt = len(history) // 2
|
||||
messages = [{"role": "user", "content": system_prompt}]
|
||||
messages.append({"role": "assistant", "content": 'Certainly!'})
|
||||
if conversation_cnt:
|
||||
for index in range(0, 2*conversation_cnt, 2):
|
||||
what_i_have_asked = {}
|
||||
what_i_have_asked["role"] = "user"
|
||||
what_i_have_asked["content"] = history[index]
|
||||
what_gpt_answer = {}
|
||||
what_gpt_answer["role"] = "assistant"
|
||||
what_gpt_answer["content"] = history[index+1]
|
||||
if what_i_have_asked["content"] != "":
|
||||
if what_gpt_answer["content"] == "": continue
|
||||
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
||||
messages.append(what_i_have_asked)
|
||||
messages.append(what_gpt_answer)
|
||||
else:
|
||||
messages[-1]['content'] = what_gpt_answer['content']
|
||||
what_i_ask_now = {}
|
||||
what_i_ask_now["role"] = "user"
|
||||
what_i_ask_now["content"] = inputs
|
||||
messages.append(what_i_ask_now)
|
||||
return messages
|
||||
|
||||
|
||||
def generate_from_baidu_qianfan(inputs, llm_kwargs, history, system_prompt):
|
||||
BAIDU_CLOUD_QIANFAN_MODEL, = get_conf('BAIDU_CLOUD_QIANFAN_MODEL')
|
||||
|
||||
url_lib = {
|
||||
"ERNIE-Bot": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions" ,
|
||||
"ERNIE-Bot-turbo": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/eb-instant" ,
|
||||
"BLOOMZ-7B": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/bloomz_7b1",
|
||||
|
||||
"Llama-2-70B-Chat": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/llama_2_70b",
|
||||
"Llama-2-13B-Chat": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/llama_2_13b",
|
||||
"Llama-2-7B-Chat": "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/llama_2_7b",
|
||||
}
|
||||
|
||||
url = url_lib[BAIDU_CLOUD_QIANFAN_MODEL]
|
||||
|
||||
url += "?access_token=" + get_access_token()
|
||||
|
||||
|
||||
payload = json.dumps({
|
||||
"messages": generate_message_payload(inputs, llm_kwargs, history, system_prompt),
|
||||
"stream": True
|
||||
})
|
||||
headers = {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
response = requests.request("POST", url, headers=headers, data=payload, stream=True)
|
||||
buffer = ""
|
||||
for line in response.iter_lines():
|
||||
if len(line) == 0: continue
|
||||
try:
|
||||
dec = line.decode().lstrip('data:')
|
||||
dec = json.loads(dec)
|
||||
incoming = dec['result']
|
||||
buffer += incoming
|
||||
yield buffer
|
||||
except:
|
||||
if ('error_code' in dec) and ("max length" in dec['error_msg']):
|
||||
raise ConnectionAbortedError(dec['error_msg']) # 上下文太长导致 token 溢出
|
||||
elif ('error_code' in dec):
|
||||
raise RuntimeError(dec['error_msg'])
|
||||
|
||||
|
||||
def predict_no_ui_long_connection(inputs, llm_kwargs, history=[], sys_prompt="", observe_window=[], console_slience=False):
|
||||
"""
|
||||
⭐多线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
watch_dog_patience = 5
|
||||
response = ""
|
||||
|
||||
for response in generate_from_baidu_qianfan(inputs, llm_kwargs, history, sys_prompt):
|
||||
if len(observe_window) >= 1:
|
||||
observe_window[0] = response
|
||||
if len(observe_window) >= 2:
|
||||
if (time.time()-observe_window[1]) > watch_dog_patience: raise RuntimeError("程序终止。")
|
||||
return response
|
||||
|
||||
def predict(inputs, llm_kwargs, plugin_kwargs, chatbot, history=[], system_prompt='', stream = True, additional_fn=None):
|
||||
"""
|
||||
⭐单线程方法
|
||||
函数的说明请见 request_llm/bridge_all.py
|
||||
"""
|
||||
chatbot.append((inputs, ""))
|
||||
|
||||
if additional_fn is not None:
|
||||
from core_functional import handle_core_functionality
|
||||
inputs, history = handle_core_functionality(additional_fn, inputs, history, chatbot)
|
||||
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
# 开始接收回复
|
||||
try:
|
||||
for response in generate_from_baidu_qianfan(inputs, llm_kwargs, history, system_prompt):
|
||||
chatbot[-1] = (inputs, response)
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
except ConnectionAbortedError as e:
|
||||
from .bridge_all import model_info
|
||||
if len(history) >= 2: history[-1] = ""; history[-2] = "" # 清除当前溢出的输入:history[-2] 是本次输入, history[-1] 是本次输出
|
||||
history = clip_history(inputs=inputs, history=history, tokenizer=model_info[llm_kwargs['llm_model']]['tokenizer'],
|
||||
max_token_limit=(model_info[llm_kwargs['llm_model']]['max_token'])) # history至少释放二分之一
|
||||
chatbot[-1] = (chatbot[-1][0], "[Local Message] Reduce the length. 本次输入过长, 或历史数据过长. 历史缓存数据已部分释放, 您可以请再次尝试. (若再次失败则更可能是因为输入过长.)")
|
||||
yield from update_ui(chatbot=chatbot, history=history, msg="异常") # 刷新界面
|
||||
return
|
||||
|
||||
# 总结输出
|
||||
response = f"[Local Message]: {model_name}响应异常 ..."
|
||||
if response == f"[Local Message]: 等待{model_name}响应中 ...":
|
||||
response = f"[Local Message]: {model_name}响应异常 ..."
|
||||
history.extend([inputs, response])
|
||||
yield from update_ui(chatbot=chatbot, history=history)
|
||||
@@ -128,7 +128,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name):
|
||||
|
||||
# chatglm 没有 sys_prompt 接口,因此把prompt加入 history
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", sys_prompt])
|
||||
history_feedin.append([sys_prompt, "Certainly!"])
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
|
||||
|
||||
@@ -161,7 +161,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name):
|
||||
|
||||
# 处理历史信息
|
||||
history_feedin = []
|
||||
history_feedin.append(["What can I do?", system_prompt] )
|
||||
history_feedin.append([system_prompt, "Certainly!"])
|
||||
for i in range(len(history)//2):
|
||||
history_feedin.append([history[2*i], history[2*i+1]] )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user