new

2023-12-26 23:59:36 +08:00
parent 15f14f51ff
commit 8dd4d48474
43 changed files with 1343 additions and 618 deletions
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -1,4 +1,4 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token
+from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
 import threading
 import os
 import logging
@@ -139,6 +139,8 @@ def can_multi_process(llm):
    if llm.startswith('gpt-'): return True
    if llm.startswith('api2d-'): return True
    if llm.startswith('azure-'): return True
+    if llm.startswith('spark'): return True
+    if llm.startswith('zhipuai'): return True
    return False

 def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
@@ -312,95 +314,6 @@ def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
    return gpt_response_collection


-def breakdown_txt_to_satisfy_token_limit(txt, get_token_fn, limit):
-    def cut(txt_tocut, must_break_at_empty_line):  # 递归
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                print(cnt)
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                raise RuntimeError("存在一行极长的文本！")
-            # print(len(post))
-            # 列表递归接龙
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line))
-            return result
-    try:
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        return cut(txt, must_break_at_empty_line=False)
-
-
-def force_breakdown(txt, limit, get_token_fn):
-    """
-    当无法用标点、空行分割时，我们用最暴力的方法切割
-    """
-    for i in reversed(range(len(txt))):
-        if get_token_fn(txt[:i]) < limit:
-            return txt[:i], txt[i:]
-    return "Tiktoken未知错误", "Tiktoken未知错误"
-
-def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
-    # 递归
-    def cut(txt_tocut, must_break_at_empty_line, break_anyway=False):  
-        if get_token_fn(txt_tocut) <= limit:
-            return [txt_tocut]
-        else:
-            lines = txt_tocut.split('\n')
-            estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
-            estimated_line_cut = int(estimated_line_cut)
-            cnt = 0
-            for cnt in reversed(range(estimated_line_cut)):
-                if must_break_at_empty_line:
-                    if lines[cnt] != "":
-                        continue
-                prev = "\n".join(lines[:cnt])
-                post = "\n".join(lines[cnt:])
-                if get_token_fn(prev) < limit:
-                    break
-            if cnt == 0:
-                if break_anyway:
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
-                else:
-                    raise RuntimeError(f"存在一行极长的文本！{txt_tocut}")
-            # print(len(post))
-            # 列表递归接龙
-            result = [prev]
-            result.extend(cut(post, must_break_at_empty_line, break_anyway=break_anyway))
-            return result
-    try:
-        # 第1次尝试，将双空行（\n\n）作为切分点
-        return cut(txt, must_break_at_empty_line=True)
-    except RuntimeError:
-        try:
-            # 第2次尝试，将单空行（\n）作为切分点
-            return cut(txt, must_break_at_empty_line=False)
-        except RuntimeError:
-            try:
-                # 第3次尝试，将英文句号（.）作为切分点
-                res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的，作为一个标识而存在
-                return [r.replace('。\n', '.') for r in res]
-            except RuntimeError as e:
-                try:
-                    # 第4次尝试，将中文句号（。）作为切分点
-                    res = cut(txt.replace('。', '。。\n'), must_break_at_empty_line=False)
-                    return [r.replace('。。\n', '。') for r in res]
-                except RuntimeError as e:
-                    # 第5次尝试，没办法了，随便切一下敷衍吧
-                    return cut(txt, must_break_at_empty_line=False, break_anyway=True)
-
-

 def read_and_clean_pdf_text(fp):
    """
@@ -631,90 +544,6 @@ def get_files_from_everything(txt, type): # type='.md'



-
-def Singleton(cls):
-    _instance = {}
- 
-    def _singleton(*args, **kargs):
-        if cls not in _instance:
-            _instance[cls] = cls(*args, **kargs)
-        return _instance[cls]
- 
-    return _singleton
-
-
-@Singleton
-class knowledge_archive_interface():
-    def __init__(self) -> None:
-        self.threadLock = threading.Lock()
-        self.current_id = ""
-        self.kai_path = None
-        self.qa_handle = None
-        self.text2vec_large_chinese = None
-
-    def get_chinese_text2vec(self):
-        if self.text2vec_large_chinese is None:
-            # < -------------------预热文本向量化模组--------------- >
-            from toolbox import ProxyNetworkActivate
-            print('Checking Text2vec ...')
-            from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-            with ProxyNetworkActivate('Download_LLM'):    # 临时地激活代理网络
-                self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
-
-        return self.text2vec_large_chinese
-
-
-    def feed_archive(self, file_manifest, id="default"):
-        self.threadLock.acquire()
-        # import uuid
-        self.current_id = id
-        from zh_langchain import construct_vector_store
-        self.qa_handle, self.kai_path = construct_vector_store(   
-            vs_id=self.current_id, 
-            files=file_manifest, 
-            sentence_size=100,
-            history=[],
-            one_conent="",
-            one_content_segmentation="",
-            text2vec = self.get_chinese_text2vec(),
-        )
-        self.threadLock.release()
-
-    def get_current_archive_id(self):
-        return self.current_id
-    
-    def get_loaded_file(self):
-        return self.qa_handle.get_loaded_file()
-
-    def answer_with_archive_by_id(self, txt, id):
-        self.threadLock.acquire()
-        if not self.current_id == id:
-            self.current_id = id
-            from zh_langchain import construct_vector_store
-            self.qa_handle, self.kai_path = construct_vector_store(   
-                vs_id=self.current_id, 
-                files=[], 
-                sentence_size=100,
-                history=[],
-                one_conent="",
-                one_content_segmentation="",
-                text2vec = self.get_chinese_text2vec(),
-            )
-        VECTOR_SEARCH_SCORE_THRESHOLD = 0
-        VECTOR_SEARCH_TOP_K = 4
-        CHUNK_SIZE = 512
-        resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
-            query = txt,
-            vs_path = self.kai_path,
-            score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
-            vector_search_top_k=VECTOR_SEARCH_TOP_K, 
-            chunk_conent=True,
-            chunk_size=CHUNK_SIZE,
-            text2vec = self.get_chinese_text2vec(),
-        )
-        self.threadLock.release()
-        return resp, prompt
-    
@Singleton
 class nougat_interface():
    def __init__(self):