update

2024-02-25 22:16:46 +08:00
parent 47289f863d
commit d0703ef32d
96 changed files with 7507 additions and 2453 deletions
--- a/crazy_functions/pdf_fns/parse_word.py
+++ b/crazy_functions/pdf_fns/parse_word.py
@@ -0,0 +1,85 @@
+from crazy_functions.crazy_utils import read_and_clean_pdf_text, get_files_from_everything
+import os
+import re
+def extract_text_from_files(txt, chatbot, history):
+    """
+    查找pdf/md/word并获取文本内容并返回状态以及文本
+
+    输入参数 Args:
+        chatbot: chatbot inputs and outputs （用户界面对话窗口句柄，用于数据流可视化）
+        history (list): List of chat history （历史，对话历史列表）
+
+    输出 Returns:
+        文件是否存在(bool)
+        final_result(list):文本内容
+        page_one(list):第一页内容/摘要
+        file_manifest(list):文件路径
+        excption(string):需要用户手动处理的信息,如没出错则保持为空
+    """
+
+    final_result = []
+    page_one = []
+    file_manifest = []
+    excption = ""
+
+    if txt == "": 
+        final_result.append(txt)
+        return False, final_result, page_one, file_manifest, excption   #如输入区内容不是文件则直接返回输入区内容
+    
+    #查找输入区内容中的文件
+    file_pdf,pdf_manifest,folder_pdf = get_files_from_everything(txt, '.pdf')
+    file_md,md_manifest,folder_md = get_files_from_everything(txt, '.md')
+    file_word,word_manifest,folder_word = get_files_from_everything(txt, '.docx')
+    file_doc,doc_manifest,folder_doc = get_files_from_everything(txt, '.doc')
+
+    if file_doc:
+        excption = "word"
+        return False, final_result, page_one, file_manifest, excption
+    
+    file_num = len(pdf_manifest) + len(md_manifest) + len(word_manifest)
+    if file_num == 0:
+        final_result.append(txt)
+        return False, final_result, page_one, file_manifest, excption   #如输入区内容不是文件则直接返回输入区内容
+    
+    if file_pdf:
+        try:    # 尝试导入依赖，如果缺少依赖，则给出安装建议
+            import fitz
+        except:
+            excption = "pdf"
+            return False, final_result, page_one, file_manifest, excption
+        for index, fp in enumerate(pdf_manifest):
+            file_content, pdf_one = read_and_clean_pdf_text(fp) # （尝试）按照章节切割PDF
+            file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
+            pdf_one = str(pdf_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
+            final_result.append(file_content)
+            page_one.append(pdf_one)
+            file_manifest.append(os.path.relpath(fp, folder_pdf))
+
+    if file_md:
+        for index, fp in enumerate(md_manifest):
+            with open(fp, 'r', encoding='utf-8', errors='replace') as f:
+                file_content = f.read()
+            file_content = file_content.encode('utf-8', 'ignore').decode()
+            headers = re.findall(r'^#\s(.*)$', file_content, re.MULTILINE)  #接下来提取md中的一级/二级标题作为摘要
+            if len(headers) > 0: 
+                page_one.append("\n".join(headers)) #合并所有的标题,以换行符分割
+            else:
+                page_one.append("")
+            final_result.append(file_content)
+            file_manifest.append(os.path.relpath(fp, folder_md))
+
+    if file_word:
+        try:    # 尝试导入依赖，如果缺少依赖，则给出安装建议
+            from docx import Document
+        except:
+            excption = "word_pip"
+            return False, final_result, page_one, file_manifest, excption
+        for index, fp in enumerate(word_manifest):
+            doc = Document(fp)
+            file_content = '\n'.join([p.text for p in doc.paragraphs])
+            file_content = file_content.encode('utf-8', 'ignore').decode()
+            page_one.append(file_content[:200])
+            final_result.append(file_content)
+            file_manifest.append(os.path.relpath(fp, folder_word))
+            
+    return True, final_result, page_one, file_manifest, excption