solve the concatenate error.

change arxiv download attempt url order
compat: deal with arxiv url change
2024-10-15 15:42:35 +00:00 · 2024-10-15 09:09:24 +00:00 · 2024-10-15 09:07:39 +00:00 · 2024-10-15 07:32:29 +00:00 · 2024-10-15 06:41:12 +00:00 · 2024-10-15 14:36:51 +08:00
7 changed files with 217 additions and 134 deletions
--- a/.github/workflows/build-with-latex-arm.yml
+++ b/.github/workflows/build-with-latex-arm.yml
@@ -46,6 +46,6 @@ jobs:
          context: .
          push: true
          platforms: linux/arm64
-          file: docs/GithubAction+NoLocal+Latex
+          file: docs/GithubAction+NoLocal+Latex+Arm
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
--- a/crazy_functions/Latex_Function.py
+++ b/crazy_functions/Latex_Function.py
@@ -138,25 +138,43 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
    cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
    if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id

-    url_tar = url_.replace('/abs/', '/e-print/')
-    translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
    extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
-    os.makedirs(translation_dir, exist_ok=True)
-
-    # <-------------- download arxiv source file ------------->
+    translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
    dst = pj(translation_dir, arxiv_id + '.tar')
-    if os.path.exists(dst):
-        yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history)  # 刷新界面
+    os.makedirs(translation_dir, exist_ok=True)
+    # <-------------- download arxiv source file ------------->
+
+    def fix_url_and_download():
+        # for url_tar in [url_.replace('/abs/', '/e-print/'), url_.replace('/abs/', '/src/')]:
+        for url_tar in [url_.replace('/abs/', '/src/'), url_.replace('/abs/', '/e-print/')]:
+            proxies = get_conf('proxies')
+            r = requests.get(url_tar, proxies=proxies)
+            if r.status_code == 200:
+                with open(dst, 'wb+') as f:
+                    f.write(r.content)
+                return True
+        return False
+
+    if os.path.exists(dst) and allow_cache:
+        yield from update_ui_lastest_msg(f"调用缓存 {arxiv_id}", chatbot=chatbot, history=history)  # 刷新界面
+        success = True
    else:
-        yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history)  # 刷新界面
-        proxies = get_conf('proxies')
-        r = requests.get(url_tar, proxies=proxies)
-        with open(dst, 'wb+') as f:
-            f.write(r.content)
+        yield from update_ui_lastest_msg(f"开始下载 {arxiv_id}", chatbot=chatbot, history=history)  # 刷新界面
+        success = fix_url_and_download()
+        yield from update_ui_lastest_msg(f"下载完成 {arxiv_id}", chatbot=chatbot, history=history)  # 刷新界面
+
+
+    if not success:
+        yield from update_ui_lastest_msg(f"下载失败 {arxiv_id}", chatbot=chatbot, history=history)
+        raise tarfile.ReadError(f"论文下载失败 {arxiv_id}")
+
    # <-------------- extract file ------------->
-    yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history)  # 刷新界面
    from toolbox import extract_archive
-    extract_archive(file_path=dst, dest_dir=extract_dst)
+    try:
+        extract_archive(file_path=dst, dest_dir=extract_dst)
+    except tarfile.ReadError:
+        os.remove(dst)
+        raise tarfile.ReadError(f"论文下载失败")
    return extract_dst, arxiv_id


--- a/crazy_functions/latex_fns/latex_toolbox.py
+++ b/crazy_functions/latex_fns/latex_toolbox.py
@@ -697,15 +697,6 @@ def _merge_pdfs_ng(pdf1_path, pdf2_path, output_path):
                    ),
                    0,
                )
-                if "/Annots" in page1:
-                    page1_annot_id = [annot.idnum for annot in page1["/Annots"]]
-                else:
-                    page1_annot_id = []
-
-                if "/Annots" in page2:
-                    page2_annot_id = [annot.idnum for annot in page2["/Annots"]]
-                else:
-                    page2_annot_id = []
                if "/Annots" in new_page:
                    annotations = new_page["/Annots"]
                    for i, annot in enumerate(annotations):
@@ -720,114 +711,148 @@ def _merge_pdfs_ng(pdf1_path, pdf2_path, output_path):
                                if "/S" in action and action["/S"] == "/GoTo":
                                    # 内部链接：跳转到文档中的某个页面
                                    dest = action.get("/D")  # 目标页或目标位置
-                                    if dest and annot.idnum in page2_annot_id:
-                                        # 获取原始文件中跳转信息，包括跳转页面
-                                        destination = pdf2_reader.named_destinations[
-                                            dest
-                                        ]
-                                        page_number = (
-                                            pdf2_reader.get_destination_page_number(
-                                                destination
-                                            )
-                                        )
-                                        # 更新跳转信息，跳转到对应的页面和，指定坐标 (100, 150)，缩放比例为 100%
-                                        # “/D”:[10,'/XYZ',100,100,0]
-                                        annot_obj["/A"].update(
-                                            {
-                                                NameObject("/D"): ArrayObject(
-                                                    [
-                                                        NumberObject(page_number),
-                                                        destination.dest_array[1],
-                                                        FloatObject(
-                                                            destination.dest_array[2]
-                                                            + int(
-                                                                page1.mediaBox.getWidth()
-                                                            )
-                                                        ),
-                                                        destination.dest_array[3],
-                                                        destination.dest_array[4],
-                                                    ]
-                                                )  # 确保键和值是 PdfObject
-                                            }
-                                        )
-                                        rect = annot_obj.get("/Rect")
-                                        # 更新点击坐标
-                                        rect = ArrayObject(
-                                            [
-                                                FloatObject(
-                                                    rect[0]
-                                                    + int(page1.mediaBox.getWidth())
-                                                ),
-                                                rect[1],
-                                                FloatObject(
-                                                    rect[2]
-                                                    + int(page1.mediaBox.getWidth())
-                                                ),
-                                                rect[3],
+                                    # if dest and annot.idnum in page2_annot_id:
+                                    # if dest in pdf2_reader.named_destinations:
+                                    if dest and page2.annotations:
+                                        if annot in page2.annotations:
+                                            # 获取原始文件中跳转信息，包括跳转页面
+                                            destination = pdf2_reader.named_destinations[
+                                                dest
                                            ]
-                                        )
-                                        annot_obj.update(
-                                            {
-                                                NameObject(
-                                                    "/Rect"
-                                                ): rect  # 确保键和值是 PdfObject
-                                            }
-                                        )
-                                    if dest and annot.idnum in page1_annot_id:
-                                        # 获取原始文件中跳转信息，包括跳转页面
-                                        destination = pdf1_reader.named_destinations[
-                                            dest
-                                        ]
-                                        page_number = (
-                                            pdf1_reader.get_destination_page_number(
-                                                destination
+                                            page_number = (
+                                                pdf2_reader.get_destination_page_number(
+                                                    destination
+                                                )
                                            )
-                                        )
-                                        # 更新跳转信息，跳转到对应的页面和，指定坐标 (100, 150)，缩放比例为 100%
-                                        # “/D”:[10,'/XYZ',100,100,0]
-                                        annot_obj["/A"].update(
-                                            {
-                                                NameObject("/D"): ArrayObject(
-                                                    [
-                                                        NumberObject(page_number),
-                                                        destination.dest_array[1],
-                                                        FloatObject(
-                                                            destination.dest_array[2]
-                                                        ),
-                                                        destination.dest_array[3],
-                                                        destination.dest_array[4],
-                                                    ]
-                                                )  # 确保键和值是 PdfObject
-                                            }
-                                        )
-                                        rect = annot_obj.get("/Rect")
-                                        rect = ArrayObject(
-                                            [
-                                                FloatObject(rect[0]),
-                                                rect[1],
-                                                FloatObject(rect[2]),
-                                                rect[3],
+                                            # 更新跳转信息，跳转到对应的页面和，指定坐标 (100, 150)，缩放比例为 100%
+                                            # “/D”:[10,'/XYZ',100,100,0]
+                                            if destination.dest_array[1] == "/XYZ":
+                                                annot_obj["/A"].update(
+                                                    {
+                                                        NameObject("/D"): ArrayObject(
+                                                            [
+                                                                NumberObject(page_number),
+                                                                destination.dest_array[1],
+                                                                FloatObject(
+                                                                    destination.dest_array[
+                                                                        2
+                                                                    ]
+                                                                    + int(
+                                                                        page1.mediaBox.getWidth()
+                                                                    )
+                                                                ),
+                                                                destination.dest_array[3],
+                                                                destination.dest_array[4],
+                                                            ]
+                                                        )  # 确保键和值是 PdfObject
+                                                    }
+                                                )
+                                            else:
+                                                annot_obj["/A"].update(
+                                                    {
+                                                        NameObject("/D"): ArrayObject(
+                                                            [
+                                                                NumberObject(page_number),
+                                                                destination.dest_array[1],
+                                                            ]
+                                                        )  # 确保键和值是 PdfObject
+                                                    }
+                                                )
+
+                                            rect = annot_obj.get("/Rect")
+                                            # 更新点击坐标
+                                            rect = ArrayObject(
+                                                [
+                                                    FloatObject(
+                                                        rect[0]
+                                                        + int(page1.mediaBox.getWidth())
+                                                    ),
+                                                    rect[1],
+                                                    FloatObject(
+                                                        rect[2]
+                                                        + int(page1.mediaBox.getWidth())
+                                                    ),
+                                                    rect[3],
+                                                ]
+                                            )
+                                            annot_obj.update(
+                                                {
+                                                    NameObject(
+                                                        "/Rect"
+                                                    ): rect  # 确保键和值是 PdfObject
+                                                }
+                                            )
+                                    # if dest and annot.idnum in page1_annot_id:
+                                    # if dest in pdf1_reader.named_destinations:
+                                    if dest and page1.annotations:
+                                        if annot in page1.annotations:
+                                            # 获取原始文件中跳转信息，包括跳转页面
+                                            destination = pdf1_reader.named_destinations[
+                                                dest
                                            ]
-                                        )
-                                        annot_obj.update(
-                                            {
-                                                NameObject(
-                                                    "/Rect"
-                                                ): rect  # 确保键和值是 PdfObject
-                                            }
-                                        )
+                                            page_number = (
+                                                pdf1_reader.get_destination_page_number(
+                                                    destination
+                                                )
+                                            )
+                                            # 更新跳转信息，跳转到对应的页面和，指定坐标 (100, 150)，缩放比例为 100%
+                                            # “/D”:[10,'/XYZ',100,100,0]
+                                            if destination.dest_array[1] == "/XYZ":
+                                                annot_obj["/A"].update(
+                                                    {
+                                                        NameObject("/D"): ArrayObject(
+                                                            [
+                                                                NumberObject(page_number),
+                                                                destination.dest_array[1],
+                                                                FloatObject(
+                                                                    destination.dest_array[
+                                                                        2
+                                                                    ]
+                                                                ),
+                                                                destination.dest_array[3],
+                                                                destination.dest_array[4],
+                                                            ]
+                                                        )  # 确保键和值是 PdfObject
+                                                    }
+                                                )
+                                            else:
+                                                annot_obj["/A"].update(
+                                                    {
+                                                        NameObject("/D"): ArrayObject(
+                                                            [
+                                                                NumberObject(page_number),
+                                                                destination.dest_array[1],
+                                                            ]
+                                                        )  # 确保键和值是 PdfObject
+                                                    }
+                                                )
+
+                                            rect = annot_obj.get("/Rect")
+                                            rect = ArrayObject(
+                                                [
+                                                    FloatObject(rect[0]),
+                                                    rect[1],
+                                                    FloatObject(rect[2]),
+                                                    rect[3],
+                                                ]
+                                            )
+                                            annot_obj.update(
+                                                {
+                                                    NameObject(
+                                                        "/Rect"
+                                                    ): rect  # 确保键和值是 PdfObject
+                                                }
+                                            )

                                elif "/S" in action and action["/S"] == "/URI":
                                    # 外部链接：跳转到某个URI
                                    uri = action.get("/URI")
-
                output_writer.addPage(new_page)
            # Save the merged PDF file
            with open(output_path, "wb") as output_file:
                output_writer.write(output_file)


-
 def _merge_pdfs_legacy(pdf1_path, pdf2_path, output_path):
    import PyPDF2  # PyPDF2这个库有严重的内存泄露问题，把它放到子进程中运行，从而方便内存的释放

--- a/docs/GithubAction+NoLocal+Latex
+++ b/docs/GithubAction+NoLocal+Latex
@@ -3,19 +3,33 @@
 # - 2 构建 docker build -t gpt-academic-nolocal-latex -f docs/GithubAction+NoLocal+Latex .
 # - 3 运行 docker run -v /home/fuqingxu/arxiv_cache:/root/arxiv_cache --rm -it --net=host gpt-academic-nolocal-latex

-FROM menghuan1918/ubuntu_uv_ctex:latest
-ENV DEBIAN_FRONTEND=noninteractive
-SHELL ["/bin/bash", "-c"]
+FROM fuqingxu/python311_texlive_ctex:latest
+ENV PATH "$PATH:/usr/local/texlive/2022/bin/x86_64-linux"
+ENV PATH "$PATH:/usr/local/texlive/2023/bin/x86_64-linux"
+ENV PATH "$PATH:/usr/local/texlive/2024/bin/x86_64-linux"
+ENV PATH "$PATH:/usr/local/texlive/2025/bin/x86_64-linux"
+ENV PATH "$PATH:/usr/local/texlive/2026/bin/x86_64-linux"
+
+# 指定路径
 WORKDIR /gpt
+
+RUN pip3 install openai numpy arxiv rich
+RUN pip3 install colorama Markdown pygments pymupdf
+RUN pip3 install python-docx pdfminer
+RUN pip3 install nougat-ocr
+
+# 装载项目文件
 COPY . .
-RUN /root/.cargo/bin/uv venv --seed \
-    && source .venv/bin/activate \
-    && /root/.cargo/bin/uv pip install openai numpy arxiv rich colorama Markdown pygments pymupdf python-docx pdfminer \
-    && /root/.cargo/bin/uv pip install -r requirements.txt \
-    && /root/.cargo/bin/uv clean
+
+
+# 安装依赖
+RUN pip3 install -r requirements.txt
+
+# edge-tts需要的依赖
+RUN apt update && apt install ffmpeg -y

 # 可选步骤，用于预热模块
-RUN .venv/bin/python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+RUN python3  -c 'from check_proxy import warm_up_modules; warm_up_modules()'

 # 启动
-CMD [".venv/bin/python3", "-u", "main.py"]
+CMD ["python3", "-u", "main.py"]
--- a/docs/GithubAction+NoLocal+Latex+Arm
+++ b/docs/GithubAction+NoLocal+Latex+Arm
@@ -0,0 +1,25 @@
+# 此Dockerfile适用于“无本地模型”的环境构建，如果需要使用chatglm等本地模型，请参考 docs/Dockerfile+ChatGLM
+# - 1 修改 `config.py`
+# - 2 构建 docker build -t gpt-academic-nolocal-latex -f docs/GithubAction+NoLocal+Latex+Arm .
+# - 3 运行 docker run -v /home/fuqingxu/arxiv_cache:/root/arxiv_cache --rm -it --net=host gpt-academic-nolocal-latex
+
+FROM menghuan1918/ubuntu_uv_ctex:latest
+ENV DEBIAN_FRONTEND=noninteractive
+SHELL ["/bin/bash", "-c"]
+WORKDIR /gpt
+COPY . .
+RUN /root/.cargo/bin/uv venv --seed \
+    && source .venv/bin/activate \
+    && /root/.cargo/bin/uv pip install openai numpy arxiv rich colorama Markdown pygments pymupdf python-docx pdfminer \
+    && /root/.cargo/bin/uv pip install -r requirements.txt \
+    && /root/.cargo/bin/uv clean
+
+# 对齐python3
+RUN rm -f /usr/bin/python3 && ln -s /gpt/.venv/bin/python /usr/bin/python3
+RUN rm -f /usr/bin/python && ln -s /gpt/.venv/bin/python /usr/bin/python
+
+# 可选步骤，用于预热模块
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+
+# 启动
+CMD ["python3", "-u", "main.py"]
--- a/request_llms/bridge_all.py
+++ b/request_llms/bridge_all.py
@@ -1285,4 +1285,3 @@ def predict(inputs:str, llm_kwargs:dict, plugin_kwargs:dict, chatbot,

    # 更新一下llm_kwargs的参数，否则会出现参数不匹配的问题
    yield from method(inputs, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, stream, additional_fn)
-
--- a/request_llms/bridge_chatgpt.py
+++ b/request_llms/bridge_chatgpt.py
@@ -202,10 +202,13 @@ def predict_no_ui_long_connection(inputs:str, llm_kwargs:dict, history:list=[],
                    if (time.time()-observe_window[1]) > watch_dog_patience:
                        raise RuntimeError("用户取消了程序。")
        else: raise RuntimeError("意外Json结构："+delta)
-    if json_data and json_data['finish_reason'] == 'content_filter':
-        raise RuntimeError("由于提问含不合规内容被Azure过滤。")
-    if json_data and json_data['finish_reason'] == 'length':
+
+    finish_reason = json_data.get('finish_reason', None) if json_data else None
+    if finish_reason == 'content_filter':
+        raise RuntimeError("由于提问含不合规内容被过滤。")
+    if finish_reason == 'length':
        raise ConnectionAbortedError("正常结束，但显示Token不足，导致输出不完整，请削减单次输入的文本量。")
+
    return result


@@ -536,4 +539,3 @@ def generate_payload(inputs:str, llm_kwargs:dict, history:list, system_prompt:st

    return headers,payload

-
Author	SHA1	Message	Date
wsg1873	303d55c8a1	solve the concatenate error.	2024-10-15 15:42:35 +00:00
binary-husky	c83bf214d0	change arxiv download attempt url order	2024-10-15 09:09:24 +00:00
binary-husky	e34c49dce5	compat: deal with arxiv url change	2024-10-15 09:07:39 +00:00
binary-husky	3890467c84	replace `rm` with `rm -f`	2024-10-15 07:32:29 +00:00
binary-husky	074b3c9828	explicitly declare default value	2024-10-15 06:41:12 +00:00
Nextstrain	b8e8457a01	关于o1系列模型无法正常请求的修复，多模型轮询KeyError: 'finish_reason'的修复 (#1992 ) * Update bridge_all.py * Update bridge_chatgpt.py * Update bridge_chatgpt.py * Update bridge_all.py * Update bridge_all.py	2024-10-15 14:36:51 +08:00
binary-husky	2c93a24d7e	fix dockerfile: try align python	2024-10-15 06:35:35 +00:00
binary-husky	e9af6ef3a0	fix: github action glitch	2024-10-15 06:32:47 +00:00
wsg1873	5ae8981dbb	add the '/Fit' destination (#2009 )	2024-10-14 22:50:56 +08:00