From 83c48dbed977187eddf665971faedd6c8cf648b5 Mon Sep 17 00:00:00 2001 From: van Date: Sun, 26 Apr 2026 13:55:54 +0800 Subject: [PATCH] 1 --- .py | 479 ++++++++++++++++++ jd/fetch_logistics_ubuntu.py | 457 +++++++++++++++++ logistics.log | 68 +++ logistics_result.json | 48 ++ run_logistics.sh | 105 ++++ setup_ubuntu.sh | 153 ++++++ venv/bin/Activate.ps1 | 247 +++++++++ venv/bin/activate | 70 +++ venv/bin/activate.csh | 27 + venv/bin/activate.fish | 69 +++ venv/bin/dp | 7 + venv/bin/flask | 7 + venv/bin/normalizer | 7 + venv/bin/pip | 8 + venv/bin/pip3 | 8 + venv/bin/pip3.12 | 8 + venv/bin/python | 1 + venv/bin/python3 | 1 + venv/bin/python3.12 | 1 + venv/bin/tldextract | 7 + venv/bin/wsdump | 7 + .../site/python3.12/greenlet/greenlet.h | 164 ++++++ venv/lib64 | 1 + venv/pyvenv.cfg | 5 + 24 files changed, 1955 insertions(+) create mode 100644 .py create mode 100644 jd/fetch_logistics_ubuntu.py create mode 100644 logistics.log create mode 100644 logistics_result.json create mode 100644 run_logistics.sh create mode 100644 setup_ubuntu.sh create mode 100644 venv/bin/Activate.ps1 create mode 100644 venv/bin/activate create mode 100644 venv/bin/activate.csh create mode 100644 venv/bin/activate.fish create mode 100644 venv/bin/dp create mode 100644 venv/bin/flask create mode 100644 venv/bin/normalizer create mode 100644 venv/bin/pip create mode 100644 venv/bin/pip3 create mode 100644 venv/bin/pip3.12 create mode 100644 venv/bin/python create mode 100644 venv/bin/python3 create mode 100644 venv/bin/python3.12 create mode 100644 venv/bin/tldextract create mode 100644 venv/bin/wsdump create mode 100644 venv/include/site/python3.12/greenlet/greenlet.h create mode 100644 venv/lib64 create mode 100644 venv/pyvenv.cfg diff --git a/.py b/.py new file mode 100644 index 0000000..4d3ce97 --- /dev/null +++ b/.py @@ -0,0 +1,479 @@ +import time +import json +import re +import os +import platform +from DrissionPage import ChromiumPage, ChromiumOptions + +# Ubuntu 上常见的 Chrome/Chromium 路径 +UBUNTU_CHROME_PATHS = [ + '/usr/bin/google-chrome', + '/usr/bin/google-chrome-stable', + '/usr/bin/chromium-browser', + '/usr/bin/chromium', + '/snap/bin/chromium', + '/opt/google/chrome/chrome', +] + +# 是否使用无头模式(headless) +# True: 无界面模式,适合服务器环境 +# False: 有界面模式,需要 X11 或 Wayland +USE_HEADLESS = True # 可以根据需要修改 + +# 全局浏览器实例 +global_page = None + + +def find_chrome_path(): + """自动查找 Ubuntu 系统中的 Chrome/Chromium 路径""" + print("正在查找 Chrome/Chromium 浏览器...") + + # 首先尝试常见的路径 + for path in UBUNTU_CHROME_PATHS: + if os.path.exists(path): + print(f"✅ 找到浏览器: {path}") + return path + + # 尝试使用 which 命令查找 + import subprocess + try: + result = subprocess.run(['which', 'google-chrome'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0 and os.path.exists(result.stdout.strip()): + path = result.stdout.strip() + print(f"✅ 通过 which 找到浏览器: {path}") + return path + except: + pass + + try: + result = subprocess.run(['which', 'chromium-browser'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0 and os.path.exists(result.stdout.strip()): + path = result.stdout.strip() + print(f"✅ 通过 which 找到浏览器: {path}") + return path + except: + pass + + # 如果都找不到,返回最常见的路径 + default_path = '/usr/bin/google-chrome' + print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}") + print("请确保已安装 Google Chrome 或 Chromium:") + print(" sudo apt update") + print(" sudo apt install -y google-chrome-stable") + print(" 或者") + print(" sudo apt install -y chromium-browser") + return default_path + + +def get_global_browser(): + """获取全局浏览器实例(Ubuntu 版本)""" + global global_page + if global_page is None: + print("="*60) + print("Ubuntu 浏览器初始化") + print("="*60) + + # 检查操作系统 + if platform.system() != 'Linux': + print(f"⚠️ 警告: 当前系统是 {platform.system()},此脚本专为 Ubuntu 设计") + + # 查找 Chrome 路径 + chrome_path = find_chrome_path() + + options = ChromiumOptions() + options.set_browser_path(chrome_path) + + # Ubuntu 服务器环境通常使用无头模式 + if USE_HEADLESS: + print("配置为无头模式(headless)...") + try: + options.headless(True) + except: + # 如果 headless 方法不存在,使用参数 + try: + options.set_argument('--headless=new') + options.set_argument('--no-sandbox') + options.set_argument('--disable-dev-shm-usage') + except: + pass + else: + print("配置为有界面模式...") + # 检查是否有显示环境 + display = os.environ.get('DISPLAY') + if not display: + print("⚠️ 警告: 未检测到 DISPLAY 环境变量") + print("如果无法显示浏览器,请:") + print(" 1. 设置 USE_HEADLESS = True") + print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0)") + print(" 3. 或者使用 Xvfb(虚拟显示)") + + # Linux 特定参数 + try: + options.set_argument('--no-sandbox') # 在某些环境下需要 + options.set_argument('--disable-dev-shm-usage') # 避免 /dev/shm 空间不足 + options.set_argument('--disable-gpu') # 禁用 GPU(可选,在 headless 模式下有用) + except: + pass + + print(f"正在启动浏览器...") + print(f"浏览器路径: {chrome_path}") + if USE_HEADLESS: + print("模式: 无头模式(后台运行)") + else: + print("模式: 有界面模式") + + try: + global_page = ChromiumPage(options) + print("✅ 浏览器已成功启动!") + time.sleep(2) # 等待浏览器完全启动 + except Exception as e: + print(f"❌ 浏览器启动失败: {e}") + print("\n可能的解决方案:") + print("1. 确保已安装 Chrome/Chromium:") + print(" sudo apt update") + print(" sudo apt install -y google-chrome-stable") + print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False") + print("3. 确保有足够的权限") + print("4. 检查是否缺少依赖:") + print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2") + import traceback + traceback.print_exc() + raise + else: + print("使用已存在的浏览器实例") + + return global_page + + +def extract_logistics_info(tracking_url): + """ + 从京东物流追踪页面提取运单号、承运人等信息(Ubuntu 版本) + + Args: + tracking_url: 物流追踪页面 URL,例如 https://3.cn/2t-Iibig + + Returns: + dict: 包含运单号、承运人、承运人电话、物流跟踪信息等的字典 + """ + page = get_global_browser() + + try: + print(f"\n正在打开物流追踪页面: {tracking_url}") + page.get(tracking_url) + print("页面加载中,请稍候...") + time.sleep(5) # 等待页面加载 + + # 检查页面是否成功加载 + current_url = page.url + print(f"当前页面 URL: {current_url}") + + # 检查页面标题 + try: + title = page.title + print(f"页面标题: {title}") + except: + print("无法获取页面标题") + + # 检查页面是否有内容 + try: + html_length = len(page.html) + print(f"页面 HTML 长度: {html_length} 字符") + if html_length < 100: + print("⚠️ 警告: 页面内容可能未完全加载") + except Exception as e: + print(f"⚠️ 无法获取页面 HTML: {e}") + + result = { + "waybill_no": None, # 运单号 + "carrier": None, # 国内承运人 + "carrier_phone": None, # 国内承运人电话 + "tracking_info": [], # 物流跟踪信息列表 + "raw_html": None # 原始 HTML(用于调试) + } + + # 方法1: 监听网络请求,查找物流数据 API + print("\n方法1: 监听网络请求...") + page.listen.start() + + # 滚动页面触发可能的请求 + page.scroll.down(300) + time.sleep(2) + page.scroll.to_bottom() + time.sleep(3) + + # 检查监听到的请求 + responses = page.listen.get() + print(f"监听到 {len(responses)} 个请求") + + # 查找可能的物流数据接口 + possible_urls = [ + 'track', 'logistics', 'waybill', 'express', + 'delivery', '3.cn', 'jd.com/logistics', + 'api.m.jd.com', 'mapi.jd.com' + ] + + for resp in responses: + url = resp.url if hasattr(resp, 'url') else '' + url_lower = url.lower() + + # 检查是否可能是物流相关的 API + if any(keyword in url_lower for keyword in possible_urls): + print(f"发现可能的物流 API: {url[:100]}") + try: + if hasattr(resp, 'response') and hasattr(resp.response, 'body'): + body = resp.response.body + + # 处理 JSON 响应 + if isinstance(body, dict): + json_data = body + elif isinstance(body, str): + try: + json_data = json.loads(body) + except: + continue + else: + continue + + # 尝试从 JSON 中提取运单号等信息 + extracted = extract_from_json(json_data) + if extracted: + result.update(extracted) + print("成功从 API 响应中提取数据") + return result + except Exception as e: + print(f"解析 API 响应时出错: {e}") + + # 方法2: 从页面 HTML/DOM 中提取 + print("\n方法2: 从页面 DOM 提取数据...") + + html = page.html + result['raw_html'] = html[:5000] # 保存部分 HTML 用于调试 + + # 从 HTML 文本中提取运单号 + waybill_patterns = [ + r'运单号[::\s]*(\d+)', + r'waybill[_\s]*no["\']?\s*[::]\s*["\']?(\d+)', + r'tracking[_\s]*number["\']?\s*[::]\s*["\']?(\d+)', + r'"waybillNo"\s*[::]\s*["\']?(\d+)', + r'"trackingNumber"\s*[::]\s*["\']?(\d+)', + ] + + for pattern in waybill_patterns: + matches = re.findall(pattern, html, re.IGNORECASE) + if matches: + result['waybill_no'] = matches[0] + print(f"找到运单号: {result['waybill_no']}") + break + + # 提取承运人 + carrier_patterns = [ + r'国内承运人[::\s]*([^\s<,,]+)', + r'carrier[::\s]*([^\s<,,]+)', + r'"carrier"\s*[::]\s*["\']?([^"\']+)', + ] + + for pattern in carrier_patterns: + matches = re.findall(pattern, html, re.IGNORECASE) + if matches: + result['carrier'] = matches[0].strip() + print(f"找到承运人: {result['carrier']}") + break + + # 提取承运人电话 + phone_patterns = [ + r'国内承运人电话[::\s]*(\d+)', + r'carrier[_\s]*phone[::\s]*(\d+)', + r'"carrierPhone"\s*[::]\s*["\']?(\d+)', + ] + + for pattern in phone_patterns: + matches = re.findall(pattern, html, re.IGNORECASE) + if matches: + result['carrier_phone'] = matches[0] + print(f"找到承运人电话: {result['carrier_phone']}") + break + + # 方法3: 从 DOM 元素中提取 + print("\n方法3: 从 DOM 元素提取数据...") + + # 尝试查找运单号元素 + waybill_elements = page.eles('xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]') + for elem in waybill_elements: + text = elem.text + parent_text = elem.parent().text if elem.parent() else "" + full_text = text + " " + parent_text + + # 从文本中提取数字作为运单号 + numbers = re.findall(r'\d{8,}', full_text) + if numbers and not result['waybill_no']: + result['waybill_no'] = numbers[0] + print(f"从元素文本中找到运单号: {result['waybill_no']}") + + # 提取承运人 + if '承运人' in text and not result['carrier']: + carrier_match = re.search(r'承运人[::\s]*([^\s<,,]+)', full_text) + if carrier_match: + result['carrier'] = carrier_match.group(1).strip() + print(f"从元素文本中找到承运人: {result['carrier']}") + + # 提取电话 + if '电话' in text and not result['carrier_phone']: + phone_match = re.search(r'电话[::\s]*(\d+)', full_text) + if phone_match: + result['carrier_phone'] = phone_match.group(1) + print(f"从元素文本中找到电话: {result['carrier_phone']}") + + # 提取物流跟踪信息(时间线) + print("\n提取物流跟踪信息...") + tracking_elements = page.eles('xpath=//*[contains(@class, "track") or contains(@class, "logistics") or contains(@class, "timeline")]') + + if not tracking_elements: + # 尝试查找包含时间戳的元素 + tracking_elements = page.eles('xpath=//*[contains(text(), "2025") or contains(text(), "货物") or contains(text(), "到达")]') + + tracking_info = [] + for elem in tracking_elements[:20]: # 限制数量 + text = elem.text + if text and len(text) > 5: + # 尝试提取时间戳 + time_match = re.search(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text) + if time_match or any(keyword in text for keyword in ['货物', '到达', '揽收', '运输', '配送', '签收']): + tracking_info.append({ + 'text': text.strip(), + 'time': time_match.group(1) if time_match else None + }) + + result['tracking_info'] = tracking_info[:10] # 最多保存10条 + + return result + + except Exception as e: + print(f"提取物流信息时出错: {e}") + import traceback + traceback.print_exc() + return None + + +def extract_from_json(json_data): + """ + 从 JSON 数据中提取物流信息 + + Args: + json_data: JSON 字典 + + Returns: + dict: 提取到的物流信息 + """ + result = {} + + def search_dict(d, key_patterns): + """递归搜索字典中的值""" + if isinstance(d, dict): + for k, v in d.items(): + # 检查键名 + for pattern in key_patterns: + if re.search(pattern, k, re.IGNORECASE): + return v + # 递归搜索值 + if isinstance(v, (dict, list)): + found = search_dict(v, key_patterns) + if found: + return found + elif isinstance(d, list): + for item in d: + found = search_dict(item, key_patterns) + if found: + return found + return None + + # 搜索运单号 + waybill = search_dict(json_data, [r'waybill', r'tracking.*number', r'运单号', r'waybillNo']) + if waybill: + result['waybill_no'] = str(waybill) + + # 搜索承运人 + carrier = search_dict(json_data, [r'carrier', r'承运人', r'carrierName']) + if carrier: + result['carrier'] = str(carrier) + + # 搜索承运人电话 + phone = search_dict(json_data, [r'carrier.*phone', r'承运人电话', r'carrierPhone', r'phone']) + if phone: + result['carrier_phone'] = str(phone) + + # 搜索物流跟踪信息 + tracking = search_dict(json_data, [r'track', r'logistics', r'物流', r'轨迹', r'history']) + if tracking: + if isinstance(tracking, list): + result['tracking_info'] = tracking + elif isinstance(tracking, dict): + result['tracking_info'] = [tracking] + + return result if result else None + + +def print_result(result): + """打印提取结果""" + if not result: + print("未能提取到物流信息") + return + + print("\n" + "="*50) + print("物流信息提取结果:") + print("="*50) + print(f"运单号: {result.get('waybill_no', '未找到')}") + print(f"国内承运人: {result.get('carrier', '未找到')}") + print(f"国内承运人电话: {result.get('carrier_phone', '未找到')}") + + if result.get('tracking_info'): + print(f"\n物流跟踪信息 (共 {len(result['tracking_info'])} 条):") + for idx, info in enumerate(result['tracking_info'], 1): + if isinstance(info, dict): + text = info.get('text', str(info)) + time_str = info.get('time', '') + print(f" {idx}. {text}") + if time_str: + print(f" 时间: {time_str}") + else: + print(f" {idx}. {info}") + else: + print("\n物流跟踪信息: 未找到") + + print("="*50) + + +# 主程序 +if __name__ == '__main__': + # 测试 URL + tracking_url = "https://3.cn/2t-Iibig" + + print("="*60) + print("京东物流信息提取工具 (Ubuntu 版本)") + print("="*60) + print(f"目标 URL: {tracking_url}") + print(f"无头模式: {'是' if USE_HEADLESS else '否'}") + print("开始提取物流信息...\n") + + try: + result = extract_logistics_info(tracking_url) + except Exception as e: + print(f"\n❌ 执行过程中出错: {e}") + import traceback + traceback.print_exc() + result = None + + if result: + print_result(result) + + # 保存结果到文件 + output_file = "logistics_result.json" + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + print(f"\n结果已保存到: {output_file}") + else: + print("提取失败") + + print("\n脚本执行完成") + diff --git a/jd/fetch_logistics_ubuntu.py b/jd/fetch_logistics_ubuntu.py new file mode 100644 index 0000000..abbc83c --- /dev/null +++ b/jd/fetch_logistics_ubuntu.py @@ -0,0 +1,457 @@ +import time +import json +import re +import os +import platform +import threading +from flask import Flask, request, jsonify +from DrissionPage import ChromiumPage, ChromiumOptions + +# Ubuntu 上常见的 Chrome/Chromium 路径 +UBUNTU_CHROME_PATHS = [ + '/usr/bin/google-chrome', + '/usr/bin/google-chrome-stable', + '/usr/bin/chromium-browser', + '/usr/bin/chromium', + '/snap/bin/chromium', + '/opt/google/chrome/chrome', +] + +# 是否使用无头模式(headless) +# True: 无界面模式,适合服务器环境 +# False: 有界面模式,需要 X11 或 Wayland +USE_HEADLESS = True # 可以根据需要修改 + +# 监听端口:内网多实例时每台设不同端口,例如 LOGISTICS_PORT=5002 +LISTEN_PORT = int(os.environ.get('LOGISTICS_PORT', os.environ.get('PORT', '5001'))) + +# 全局浏览器实例 +global_page = None + + +def find_chrome_path(): + """自动查找 Ubuntu 系统中的 Chrome/Chromium 路径""" + print("正在查找 Chrome/Chromium 浏览器...") + + # 首先尝试常见的路径 + for path in UBUNTU_CHROME_PATHS: + if os.path.exists(path): + print(f"✅ 找到浏览器: {path}") + return path + + # 尝试使用 which 命令查找 + import subprocess + try: + result = subprocess.run(['which', 'google-chrome'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0 and os.path.exists(result.stdout.strip()): + path = result.stdout.strip() + print(f"✅ 通过 which 找到浏览器: {path}") + return path + except: + pass + + try: + result = subprocess.run(['which', 'chromium-browser'], + capture_output=True, text=True, timeout=5) + if result.returncode == 0 and os.path.exists(result.stdout.strip()): + path = result.stdout.strip() + print(f"✅ 通过 which 找到浏览器: {path}") + return path + except: + pass + + # 如果都找不到,返回最常见的路径 + default_path = '/usr/bin/google-chrome' + print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}") + print("请确保已安装 Google Chrome 或 Chromium:") + print(" sudo apt update") + print(" sudo apt install -y google-chrome-stable") + print(" 或者") + print(" sudo apt install -y chromium-browser") + return default_path + + +def get_global_browser(): + """获取全局浏览器实例(Ubuntu 版本)""" + global global_page + if global_page is None: + print("="*60) + print("Ubuntu 浏览器初始化") + print("="*60) + + # 检查操作系统 + if platform.system() != 'Linux': + print(f"⚠️ 警告: 当前系统是 {platform.system()},此脚本专为 Ubuntu 设计") + + # 查找 Chrome 路径 + chrome_path = find_chrome_path() + + options = ChromiumOptions() + options.set_browser_path(chrome_path) + + # Ubuntu 服务器环境通常使用无头模式 + if USE_HEADLESS: + print("配置为无头模式(headless)...") + try: + options.headless(True) + except: + # 如果 headless 方法不存在,使用参数 + try: + options.set_argument('--headless=new') + options.set_argument('--no-sandbox') + options.set_argument('--disable-dev-shm-usage') + except: + pass + else: + print("配置为有界面模式...") + # 检查是否有显示环境 + display = os.environ.get('DISPLAY') + if not display: + print("⚠️ 警告: 未检测到 DISPLAY 环境变量") + print("如果无法显示浏览器,请:") + print(" 1. 设置 USE_HEADLESS = True") + print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0)") + print(" 3. 或者使用 Xvfb(虚拟显示)") + + # Linux 特定参数 + try: + options.set_argument('--no-sandbox') # 在某些环境下需要 + options.set_argument('--disable-dev-shm-usage') # 避免 /dev/shm 空间不足 + options.set_argument('--disable-gpu') # 禁用 GPU(可选,在 headless 模式下有用) + except: + pass + + print(f"正在启动浏览器...") + print(f"浏览器路径: {chrome_path}") + if USE_HEADLESS: + print("模式: 无头模式(后台运行)") + else: + print("模式: 有界面模式") + + try: + global_page = ChromiumPage(options) + print("✅ 浏览器已成功启动!") + time.sleep(2) # 等待浏览器完全启动 + except Exception as e: + print(f"❌ 浏览器启动失败: {e}") + print("\n可能的解决方案:") + print("1. 确保已安装 Chrome/Chromium:") + print(" sudo apt update") + print(" sudo apt install -y google-chrome-stable") + print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False") + print("3. 确保有足够的权限") + print("4. 检查是否缺少依赖:") + print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2") + import traceback + traceback.print_exc() + raise + else: + print("使用已存在的浏览器实例") + + return global_page + + +def extract_logistics_info(tracking_url): + """ + 从京东物流追踪页面提取运单号、承运人等信息(Ubuntu 版本) + + Args: + tracking_url: 物流追踪页面 URL,例如 https://3.cn/2t-Iibig + + Returns: + dict: 包含运单号、承运人、承运人电话、物流跟踪信息等的字典 + """ + page = get_global_browser() + + try: + print(f"\n正在打开物流追踪页面: {tracking_url}") + page.get(tracking_url) + print("页面加载中,请稍候...") + time.sleep(5) # 等待页面加载 + + # 检查页面是否成功加载 + current_url = page.url + print(f"当前页面 URL: {current_url}") + + # 检查页面标题 + try: + title = page.title + print(f"页面标题: {title}") + except: + print("无法获取页面标题") + + # 检查页面是否有内容 + try: + html_length = len(page.html) + print(f"页面 HTML 长度: {html_length} 字符") + if html_length < 100: + print("⚠️ 警告: 页面内容可能未完全加载") + except Exception as e: + print(f"⚠️ 无法获取页面 HTML: {e}") + + result = { + "waybill_no": None, # 运单号 + "carrier": None, # 国内承运人 + "carrier_phone": None, # 国内承运人电话 + "tracking_info": [], # 物流跟踪信息列表 + } + + # 从 DOM 元素中提取数据 + print("\n从 DOM 元素提取数据...") + + # 尝试查找运单号元素 + waybill_elements = page.eles('xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]') + for elem in waybill_elements: + text = elem.text + parent_text = elem.parent().text if elem.parent() else "" + full_text = text + " " + parent_text + + # 从文本中提取数字作为运单号 + numbers = re.findall(r'\d{8,}', full_text) + if numbers and not result['waybill_no']: + result['waybill_no'] = numbers[0] + print(f"✅ 找到运单号: {result['waybill_no']}") + + # 提取承运人 + if '承运人' in text and not result['carrier']: + carrier_match = re.search(r'承运人[::\s]*([^\s<,,]+)', full_text) + if carrier_match: + result['carrier'] = carrier_match.group(1).strip() + print(f"✅ 找到承运人: {result['carrier']}") + + # 提取电话 + if '电话' in text and not result['carrier_phone']: + phone_match = re.search(r'电话[::\s]*(\d+)', full_text) + if phone_match: + result['carrier_phone'] = phone_match.group(1) + print(f"✅ 找到承运人电话: {result['carrier_phone']}") + + # 提取物流跟踪信息(时间线) + print("\n提取物流跟踪信息...") + tracking_elements = page.eles('xpath=//*[contains(@class, "track") or contains(@class, "logistics") or contains(@class, "timeline")]') + + if not tracking_elements: + # 尝试查找包含时间戳的元素 + tracking_elements = page.eles('xpath=//*[contains(text(), "2025") or contains(text(), "货物") or contains(text(), "到达")]') + + tracking_info = [] + for elem in tracking_elements[:20]: # 限制数量 + text = elem.text + if text and len(text) > 5: + # 尝试提取时间戳 + time_match = re.search(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text) + if time_match or any(keyword in text for keyword in ['货物', '到达', '揽收', '运输', '配送', '签收']): + tracking_info.append({ + 'text': text.strip(), + 'time': time_match.group(1) if time_match else None + }) + + result['tracking_info'] = tracking_info[:10] # 最多保存10条 + + if result['tracking_info']: + print(f"✅ 找到 {len(result['tracking_info'])} 条物流跟踪信息") + + return result + + except Exception as e: + print(f"提取物流信息时出错: {e}") + import traceback + traceback.print_exc() + return None + + +def print_result(result): + """打印提取结果""" + if not result: + print("未能提取到物流信息") + return + + print("\n" + "="*50) + print("物流信息提取结果:") + print("="*50) + print(f"运单号: {result.get('waybill_no', '未找到')}") + print(f"国内承运人: {result.get('carrier', '未找到')}") + print(f"国内承运人电话: {result.get('carrier_phone', '未找到')}") + + if result.get('tracking_info'): + print(f"\n物流跟踪信息 (共 {len(result['tracking_info'])} 条):") + for idx, info in enumerate(result['tracking_info'], 1): + if isinstance(info, dict): + text = info.get('text', str(info)) + time_str = info.get('time', '') + print(f" {idx}. {text}") + if time_str: + print(f" 时间: {time_str}") + else: + print(f" {idx}. {info}") + else: + print("\n物流跟踪信息: 未找到") + + print("="*50) + + +# =================== Flask API 接口 =================== +# 初始化 Flask 应用 +app = Flask(__name__) + +# 初始化锁,防止并发访问 +fetch_lock = threading.Lock() + + +@app.route('/fetch_logistics', methods=['GET', 'POST']) +def fetch_logistics(): + """ + 查询物流信息接口 + + 参数: + tracking_url: 物流追踪页面 URL(GET 或 POST) + 例如: https://3.cn/2t-Iibig + + 返回: + JSON 格式的物流信息,包含: + - waybill_no: 运单号 + - carrier: 国内承运人 + - carrier_phone: 国内承运人电话 + - tracking_info: 物流跟踪信息列表 + - success: 是否成功 + - message: 消息提示 + """ + # 获取参数(支持 GET 和 POST) + if request.method == 'POST': + if request.is_json: + data = request.get_json() + tracking_url = data.get('tracking_url') or data.get('url') + else: + tracking_url = request.form.get('tracking_url') or request.form.get('url') or request.args.get('tracking_url') or request.args.get('url') + else: + tracking_url = request.args.get('tracking_url') or request.args.get('url') + + if not tracking_url: + return jsonify({ + "success": False, + "error": "缺少参数 tracking_url 或 url", + "message": "请提供物流追踪页面 URL" + }), 400 + + # 验证 URL 格式 + if not (tracking_url.startswith('http://') or tracking_url.startswith('https://')): + return jsonify({ + "success": False, + "error": "URL 格式错误", + "message": "URL 必须以 http:// 或 https:// 开头" + }), 400 + + try: + with fetch_lock: # 加锁,防止并发调用 + print(f"\n收到物流查询请求: {tracking_url}") + result = extract_logistics_info(tracking_url) + + if result: + # 构建返回数据 + response_data = { + "success": True, + "message": "查询成功", + "data": { + "waybill_no": result.get('waybill_no'), + "carrier": result.get('carrier'), + "carrier_phone": result.get('carrier_phone'), + "tracking_info": result.get('tracking_info', []), + "tracking_count": len(result.get('tracking_info', [])) + }, + "url": tracking_url + } + + # 如果有些信息未找到,添加提示 + missing_fields = [] + if not result.get('waybill_no'): + missing_fields.append('waybill_no') + if not result.get('carrier'): + missing_fields.append('carrier') + + if missing_fields: + response_data["warning"] = f"以下字段未找到: {', '.join(missing_fields)}" + + return jsonify(response_data), 200 + else: + return jsonify({ + "success": False, + "error": "提取失败", + "message": "未能从页面中提取到物流信息", + "url": tracking_url + }), 500 + + except Exception as e: + print(f"查询物流信息时出错: {e}") + import traceback + traceback.print_exc() + return jsonify({ + "success": False, + "error": str(e), + "message": "服务器内部错误", + "url": tracking_url + }), 500 + + +@app.route('/health', methods=['GET']) +def health(): + """健康检查接口""" + return jsonify({ + "status": "ok", + "service": "京东物流信息查询服务", + "version": "1.0.0" + }), 200 + + +@app.route('/', methods=['GET']) +def index(): + """首页,返回 API 使用说明""" + return jsonify({ + "service": "京东物流信息查询 API", + "version": "1.0.0", + "endpoints": { + "/fetch_logistics": { + "method": ["GET", "POST"], + "description": "查询物流信息", + "parameters": { + "tracking_url": "物流追踪页面 URL(必需)", + "url": "tracking_url 的别名(可选)" + }, + "example_get": "/fetch_logistics?tracking_url=https://3.cn/2t-Iibig", + "example_post": "POST /fetch_logistics\n{\"tracking_url\": \"https://3.cn/2t-Iibig\"}" + }, + "/health": { + "method": ["GET"], + "description": "健康检查" + } + } + }), 200 + + +# =================== 启动服务 =================== +if __name__ == '__main__': + # API 服务模式(默认) + print("="*60) + print("京东物流信息查询 API 服务 (Ubuntu 版本)") + print("="*60) + print(f"无头模式: {'是' if USE_HEADLESS else '否'}") + print("\n服务接口:") + print(" GET/POST /fetch_logistics?tracking_url= - 查询物流信息") + print(" GET /health - 健康检查") + print(" GET / - API 说明") + print("\n启动服务...") + print(f"服务地址: http://0.0.0.0:{LISTEN_PORT} (环境变量 LOGISTICS_PORT / PORT 可覆盖)") + print("按 Ctrl+C 停止服务\n") + + try: + app.run(host='0.0.0.0', port=LISTEN_PORT, debug=False, threaded=True) + except KeyboardInterrupt: + print("\n\n服务已停止") + finally: + if 'global_page' in globals() and global_page: + try: + global_page.quit() + print("浏览器已关闭") + except: + pass + diff --git a/logistics.log b/logistics.log new file mode 100644 index 0000000..668bd68 --- /dev/null +++ b/logistics.log @@ -0,0 +1,68 @@ +[2025-12-29 21:46:15] [INFO] 开始执行物流提取脚本 +[2025-12-29 21:46:15] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2025-12-29 21:46:15] [INFO] Python版本: Python 3.12.3 +[2025-12-29 21:46:15] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-01-13 21:17:03] [INFO] 开始执行物流提取脚本 +[2026-01-13 21:17:03] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-01-13 21:17:03] [INFO] Python版本: Python 3.12.3 +[2026-01-13 21:17:03] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-01-17 17:39:45] [INFO] 开始执行物流提取脚本 +[2026-01-17 17:39:45] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-01-17 17:39:45] [INFO] Python版本: Python 3.12.3 +[2026-01-17 17:39:45] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-01-19 15:06:09] [INFO] 开始执行物流提取脚本 +[2026-01-19 15:06:09] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-01-19 15:06:09] [INFO] Python版本: Python 3.12.3 +[2026-01-19 15:06:09] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-01-19 15:25:02] [INFO] 开始执行物流提取脚本 +[2026-01-19 15:25:02] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-01-19 15:25:02] [INFO] Python版本: Python 3.12.3 +[2026-01-19 15:25:02] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-01-19 18:59:37] [INFO] 开始执行物流提取脚本 +[2026-01-19 18:59:37] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-01-19 18:59:37] [INFO] Python版本: Python 3.12.3 +[2026-01-19 18:59:37] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-09 18:06:44] [INFO] 开始执行物流提取脚本 +[2026-02-09 18:06:44] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-09 18:06:44] [INFO] Python版本: Python 3.12.3 +[2026-02-09 18:06:44] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-11 14:32:19] [INFO] 开始执行物流提取脚本 +[2026-02-11 14:32:19] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-11 14:32:19] [INFO] Python版本: Python 3.12.3 +[2026-02-11 14:32:19] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-11 16:45:00] [INFO] 开始执行物流提取脚本 +[2026-02-11 16:45:00] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-11 16:45:00] [INFO] Python版本: Python 3.12.3 +[2026-02-11 16:45:00] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-12 01:35:07] [INFO] 开始执行物流提取脚本 +[2026-02-12 01:35:07] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-12 01:35:07] [INFO] Python版本: Python 3.12.3 +[2026-02-12 01:35:07] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-15 10:27:16] [INFO] 开始执行物流提取脚本 +[2026-02-15 10:27:16] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-15 10:27:16] [INFO] Python版本: Python 3.12.3 +[2026-02-15 10:27:16] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-24 19:34:46] [INFO] 开始执行物流提取脚本 +[2026-02-24 19:34:46] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-24 19:34:46] [INFO] Python版本: Python 3.12.3 +[2026-02-24 19:34:46] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-27 17:42:56] [INFO] 开始执行物流提取脚本 +[2026-02-27 17:42:56] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-27 17:42:56] [INFO] Python版本: Python 3.12.3 +[2026-02-27 17:42:56] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-02-28 16:39:40] [INFO] 开始执行物流提取脚本 +[2026-02-28 16:39:40] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-02-28 16:39:40] [INFO] Python版本: Python 3.12.3 +[2026-02-28 16:39:40] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-03-30 14:57:26] [INFO] 开始执行物流提取脚本 +[2026-03-30 14:57:26] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-03-30 14:57:26] [INFO] Python版本: Python 3.12.3 +[2026-03-30 14:57:26] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-04-07 08:00:51] [INFO] 开始执行物流提取脚本 +[2026-04-07 08:00:51] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-04-07 08:00:51] [INFO] Python版本: Python 3.12.3 +[2026-04-07 08:00:51] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py +[2026-04-07 12:11:33] [INFO] 开始执行物流提取脚本 +[2026-04-07 12:11:33] [INFO] 激活虚拟环境: /home/van/project/jd_python/venv +[2026-04-07 12:11:33] [INFO] Python版本: Python 3.12.3 +[2026-04-07 12:11:33] [INFO] 执行脚本: /home/van/project/jd_python/jd/fetch_logistics_ubuntu.py diff --git a/logistics_result.json b/logistics_result.json new file mode 100644 index 0000000..3ff4f0f --- /dev/null +++ b/logistics_result.json @@ -0,0 +1,48 @@ +{ + "waybill_no": "6686039468", + "carrier": ":", + "carrier_phone": null, + "tracking_info": [ + { + "text": "已签收\n[武汉]您的订单已签收,期待下次继续为您服务\n2025-11-03 11:54:57\n派送中\n[武汉]上门送货中\n2025-11-03 09:25:12\n运输中\n[武汉]日日顺小哥:【沈从宽】【13487235409】正在为您派件,感谢您的耐心等待\n2025-11-03 09:24:12\n[武汉]货物已到达网点\n2025-11-01 14:44:39\n[武汉]订单已从【武汉】转出,预计送达用户时间:2025-11-03\n2025-10-31 16:32:09\n已揽收\n[武汉]揽收\n2025-10-31 02:40:40\n仓库处理中\n预计11月3日24:00前送达\n2025-10-31 02:40:40\n您的订单已进入第三方卖家仓库,准备出库\n2025-10-30 22:10:47\n已下单\n您提交了订单,请等待第三方卖家系统确认\n2025-10-30 22:05:44", + "time": "2025-11-03 11:54:57" + }, + { + "text": "已签收\n[武汉]您的订单已签收,期待下次继续为您服务\n2025-11-03 11:54:57", + "time": "2025-11-03 11:54:57" + }, + { + "text": "已签收\n[武汉]您的订单已签收,期待下次继续为您服务\n2025-11-03 11:54:57", + "time": "2025-11-03 11:54:57" + }, + { + "text": "[武汉]您的订单已签收,期待下次继续为您服务", + "time": null + }, + { + "text": "2025-11-03 11:54:57", + "time": "2025-11-03 11:54:57" + }, + { + "text": "派送中\n[武汉]上门送货中\n2025-11-03 09:25:12", + "time": "2025-11-03 09:25:12" + }, + { + "text": "派送中\n[武汉]上门送货中\n2025-11-03 09:25:12", + "time": "2025-11-03 09:25:12" + }, + { + "text": "2025-11-03 09:25:12", + "time": "2025-11-03 09:25:12" + }, + { + "text": "运输中\n[武汉]日日顺小哥:【沈从宽】【13487235409】正在为您派件,感谢您的耐心等待\n2025-11-03 09:24:12", + "time": "2025-11-03 09:24:12" + }, + { + "text": "运输中\n[武汉]日日顺小哥:【沈从宽】【13487235409】正在为您派件,感谢您的耐心等待\n2025-11-03 09:24:12", + "time": "2025-11-03 09:24:12" + } + ], + "raw_html": "\n \n 订单跟踪\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n