Files
gpt_academic/crazy_functions/rag_fns/rag_file_support.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

49 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import subprocess
import os
# File extensions (lower-case, with leading dot) that extract_text hands to
# SimpleDirectoryReader; any other extension is reported as unsupported.
supports_format = [
    '.csv',
    '.docx',
    '.epub',
    '.ipynb',
    '.mbox',
    '.md',
    '.pdf',
    '.txt',
    '.ppt',
    '.pptm',
    '.pptx',
    '.bat',
]
def convert_to_markdown(file_path: str) -> str:
    """Convert a supported document to Markdown with the ``markitdown`` CLI.

    The Markdown output is written next to the input file, using the same
    base name with a ``.md`` extension. Conversion is best-effort: any
    failure is reported on stdout and the original path is returned so the
    caller can keep processing the untouched file.

    Args:
        file_path: Path of the input file.

    Returns:
        The path of the generated Markdown file on success; ``file_path``
        unchanged when the extension is not convertible or the conversion
        fails.
    """
    # Every entry needs its leading dot to match os.path.splitext output
    # (the previous bare 'pdf' entry could never match).
    convertible = ('.docx', '.doc', '.pptx', '.ppt', '.pptm',
                   '.xls', '.xlsx', '.csv', '.pdf')

    # Compare on the lower-cased path so 'REPORT.DOCX' is recognised too.
    _, ext = os.path.splitext(file_path.lower())
    if ext not in convertible:
        return file_path

    try:
        # Output path keeps the original (case-preserved) base name.
        md_path = os.path.splitext(file_path)[0] + '.md'
        # Pass an argument list and skip the shell, so paths containing
        # spaces or shell metacharacters are handed over verbatim.
        result = subprocess.run(
            ["markitdown", file_path],
            stdout=subprocess.PIPE,
            check=True,
        )
        # Write the file only after the tool exited successfully, so a
        # failed run never leaves an empty or partial .md file behind.
        with open(md_path, "wb") as md_file:
            md_file.write(result.stdout)
        print(f"已将{ext}文件转换为Markdown: {md_path}")
        return md_path
    except Exception as e:
        # Deliberate best-effort boundary: a missing tool, a non-zero exit
        # or an unwritable target all fall back to the original file.
        print(f"{ext}转Markdown失败: {str(e)},将继续处理原文件")
        return file_path
# Revised extract_text: reads files through llama_index's SimpleDirectoryReader.
def extract_text(file_path):
    """Extract the plain text of a file using ``SimpleDirectoryReader``.

    Args:
        file_path: Path of the file to read. Its extension, compared
            case-insensitively, must be listed in ``supports_format``.

    Returns:
        The text of all loaded documents joined with newlines; the literal
        string ``'格式不支持'`` when the extension is not in
        ``supports_format``; ``None`` when the reader raises while loading.
    """
    _, ext = os.path.splitext(file_path.lower())
    if ext not in supports_format:
        return '格式不支持'

    # Imported lazily, and only once the format is known to be supported,
    # so the unsupported-format answer does not require llama_index.
    from llama_index.core import SimpleDirectoryReader

    try:
        reader = SimpleDirectoryReader(input_files=[file_path])
        print(f"Extracting text from {file_path} using SimpleDirectoryReader")
        documents = reader.load_data()
        print(f"Complete: Extracting text from {file_path} using SimpleDirectoryReader")
        return '\n'.join(doc.text for doc in documents)
    except Exception as e:
        # Still best-effort (callers get None, as before), but the failure
        # is now reported instead of being silently swallowed.
        print(f"Failed to extract text from {file_path}: {e}")
        return None