"""Plain-text extraction for RAG ingestion, backed by llama_index's SimpleDirectoryReader."""
import logging
import os

from llama_index.core import SimpleDirectoryReader

logger = logging.getLogger(__name__)

# Lower-case file extensions (with the leading dot) that extract_text hands
# to SimpleDirectoryReader; any other extension is rejected without loading.
supports_format = ['.csv', '.docx', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt', '.ppt',
                   '.pptm', '.pptx']


def extract_text(file_path):
    """Return the text content of ``file_path``, or ``None`` if it cannot be read.

    The extension is compared case-insensitively against ``supports_format``.
    Supported files are loaded with ``SimpleDirectoryReader`` and the text of
    every returned document is joined with newlines, so files that the loader
    splits into several documents are returned in full rather than truncated
    to the first chunk.

    Args:
        file_path: Path to the file, as a ``str`` or ``os.PathLike``.

    Returns:
        The extracted text, or ``None`` when the extension is unsupported,
        the loader returns no documents, or loading fails.
    """
    path = os.fspath(file_path)
    ext = os.path.splitext(path)[1].lower()

    # Only attempt formats that SimpleDirectoryReader is expected to handle.
    if ext not in supports_format:
        return None

    try:
        documents = SimpleDirectoryReader(input_files=[path]).load_data()
    except Exception:
        # Best-effort by design: callers treat None as "no text available".
        # Log the failure instead of hiding it so broken files or missing
        # optional parser dependencies can be diagnosed.
        logger.warning("Failed to extract text from %r", path, exc_info=True)
        return None

    if not documents:
        return None

    # A single file may be loaded as multiple documents; keep all of them.
    return "\n".join(doc.text for doc in documents)