修复分割函数中使用的变量错误 (#1443)

* Fix force_breakdown function parameter name

* Add handling for PDFs with lowercase starting paragraphs

* Change first lowercase word in meta_txt to uppercase
This commit is contained in:
Menghuan1918
2024-01-03 19:49:17 +08:00
committed by GitHub
parent a96f842b3a
commit aba871342f
2 changed files with 5 additions and 2 deletions

View File

@@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
# 如果没有找到合适的切分点
if break_anyway:
# 是否允许暴力切分
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
else:
# 不允许直接报错
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}")
# 追加列表
res.append(prev); fin_len+=len(prev)