"""JD logistics tracking scraper exposed as a small Flask API (Ubuntu build).

A single shared Chromium instance (driven through DrissionPage) opens the
tracking page, and the waybill number, carrier, carrier phone and tracking
timeline are scraped from the rendered DOM with heuristics.
"""
import json
import os
import platform
import re
import shutil
import threading
import time
import traceback

from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions

# Common Chrome/Chromium install locations on Ubuntu.
UBUNTU_CHROME_PATHS = [
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]

# Whether to run the browser headless.
# True:  no UI, suitable for servers.
# False: with UI, requires X11 or Wayland.
USE_HEADLESS = True  # adjust as needed

# Process-wide browser instance, created lazily by get_global_browser().
global_page = None

# Patterns used by the DOM heuristics; compiled once instead of per element.
_WAYBILL_RE = re.compile(r'\d{8,}')
_CARRIER_RE = re.compile(r'承运人[::\s]*([^\s<,,]+)')
_PHONE_RE = re.compile(r'电话[::\s]*(\d+)')
_TIMESTAMP_RE = re.compile(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})')
_TRACKING_KEYWORDS = ('货物', '到达', '揽收', '运输', '配送', '签收')


def find_chrome_path():
    """Locate a Chrome/Chromium executable on an Ubuntu system.

    Checks the well-known install paths first, then looks up
    ``google-chrome`` and ``chromium-browser`` on ``PATH``.

    Returns:
        str: Path of the first browser found, or ``/usr/bin/google-chrome``
        as a fallback (after printing install hints) when none is found.
    """
    print("正在查找 Chrome/Chromium 浏览器...")

    # Try the well-known install locations first.
    for path in UBUNTU_CHROME_PATHS:
        if os.path.exists(path):
            print(f"✅ 找到浏览器: {path}")
            return path

    # Fall back to a PATH lookup; shutil.which avoids spawning `which`.
    for executable in ('google-chrome', 'chromium-browser'):
        path = shutil.which(executable)
        if path and os.path.exists(path):
            print(f"✅ 通过 which 找到浏览器: {path}")
            return path

    # Nothing found: return the most common path and tell the user how to
    # install a browser.
    default_path = '/usr/bin/google-chrome'
    print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}")
    print("请确保已安装 Google Chrome 或 Chromium:")
    print(" sudo apt update")
    print(" sudo apt install -y google-chrome-stable")
    print(" 或者")
    print(" sudo apt install -y chromium-browser")
    return default_path


def _build_browser_options(chrome_path):
    """Build the ChromiumOptions for ``chrome_path`` honouring USE_HEADLESS."""
    options = ChromiumOptions()
    options.set_browser_path(chrome_path)

    # Ubuntu servers normally run headless.
    if USE_HEADLESS:
        print("配置为无头模式(headless)...")
        try:
            options.headless(True)
        except Exception:
            # headless() is unavailable or failed: fall back to raw flags.
            try:
                options.set_argument('--headless=new')
                options.set_argument('--no-sandbox')
                options.set_argument('--disable-dev-shm-usage')
            except Exception as e:
                # Best effort: keep going, but do not hide the failure.
                print(f"⚠️ 设置无头模式参数失败: {e}")
    else:
        print("配置为有界面模式...")
        # A headed browser needs a display server.
        display = os.environ.get('DISPLAY')
        if not display:
            print("⚠️ 警告: 未检测到 DISPLAY 环境变量")
            print("如果无法显示浏览器,请:")
            print(" 1. 设置 USE_HEADLESS = True")
            print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0)")
            print(" 3. 或者使用 Xvfb(虚拟显示)")

    # Linux-specific launch flags.
    try:
        options.set_argument('--no-sandbox')  # required in some environments
        options.set_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion
        options.set_argument('--disable-gpu')  # optional; useful when headless
    except Exception as e:
        # Best effort: keep going, but do not hide the failure.
        print(f"⚠️ 设置浏览器启动参数失败: {e}")

    return options


def get_global_browser():
    """Return the shared browser page, launching the browser on first use.

    Returns:
        ChromiumPage: The process-wide page object stored in ``global_page``.

    Raises:
        Exception: Whatever ``ChromiumPage(options)`` raised when the browser
            could not be started (re-raised after troubleshooting hints and
            the traceback are printed).
    """
    global global_page

    if global_page is not None:
        print("使用已存在的浏览器实例")
        return global_page

    print("=" * 60)
    print("Ubuntu 浏览器初始化")
    print("=" * 60)

    # Warn when not running on Linux; the script targets Ubuntu.
    if platform.system() != 'Linux':
        print(f"⚠️ 警告: 当前系统是 {platform.system()},此脚本专为 Ubuntu 设计")

    chrome_path = find_chrome_path()
    options = _build_browser_options(chrome_path)

    print("正在启动浏览器...")
    print(f"浏览器路径: {chrome_path}")
    if USE_HEADLESS:
        print("模式: 无头模式(后台运行)")
    else:
        print("模式: 有界面模式")

    try:
        global_page = ChromiumPage(options)
        print("✅ 浏览器已成功启动!")
        time.sleep(2)  # give the browser time to finish starting
    except Exception as e:
        print(f"❌ 浏览器启动失败: {e}")
        print("\n可能的解决方案:")
        print("1. 确保已安装 Chrome/Chromium:")
        print(" sudo apt update")
        print(" sudo apt install -y google-chrome-stable")
        print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False")
        print("3. 确保有足够的权限")
        print("4. 检查是否缺少依赖:")
        print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2")
        traceback.print_exc()
        raise

    return global_page


def _extract_waybill_fields(page, result):
    """Fill waybill_no / carrier / carrier_phone in ``result`` from the DOM."""
    waybill_elements = page.eles(
        'xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]'
    )
    for elem in waybill_elements:
        text = elem.text
        parent = elem.parent()  # looked up once per element
        parent_text = parent.text if parent else ""
        full_text = text + " " + parent_text

        # The first run of 8+ digits is taken as the waybill number.
        numbers = _WAYBILL_RE.findall(full_text)
        if numbers and not result['waybill_no']:
            result['waybill_no'] = numbers[0]
            print(f"✅ 找到运单号: {result['waybill_no']}")

        # Carrier name.
        if '承运人' in text and not result['carrier']:
            carrier_match = _CARRIER_RE.search(full_text)
            if carrier_match:
                result['carrier'] = carrier_match.group(1).strip()
                print(f"✅ 找到承运人: {result['carrier']}")

        # Carrier phone.
        if '电话' in text and not result['carrier_phone']:
            phone_match = _PHONE_RE.search(full_text)
            if phone_match:
                result['carrier_phone'] = phone_match.group(1)
                print(f"✅ 找到承运人电话: {result['carrier_phone']}")


def _extract_tracking_info(page):
    """Return up to 10 ``{'text', 'time'}`` timeline entries from the DOM."""
    tracking_elements = page.eles(
        'xpath=//*[contains(@class, "track") or contains(@class, "logistics") '
        'or contains(@class, "timeline")]'
    )
    if not tracking_elements:
        # Fall back to elements that look like timeline rows. The year is
        # derived from the clock (current and previous year) rather than
        # hard-coded, so the fallback keeps working after the year changes.
        this_year = time.localtime().tm_year
        year_terms = ' or '.join(
            f'contains(text(), "{year}")' for year in (this_year, this_year - 1)
        )
        tracking_elements = page.eles(
            f'xpath=//*[{year_terms} or contains(text(), "货物") '
            f'or contains(text(), "到达")]'
        )

    tracking_info = []
    seen_texts = set()
    for elem in tracking_elements[:20]:  # cap the number of inspected elements
        text = elem.text
        if not text or len(text) <= 5:
            continue
        time_match = _TIMESTAMP_RE.search(text)
        if not (time_match or any(keyword in text for keyword in _TRACKING_KEYWORDS)):
            continue
        cleaned = text.strip()
        # A wrapper and its only child render the same text; keep it once.
        if cleaned in seen_texts:
            continue
        seen_texts.add(cleaned)
        tracking_info.append({
            'text': cleaned,
            'time': time_match.group(1) if time_match else None,
        })

    return tracking_info[:10]  # keep at most 10 entries


def extract_logistics_info(tracking_url):
    """Scrape waybill, carrier and timeline data from a JD tracking page.

    Args:
        tracking_url: Tracking page URL, e.g. https://3.cn/2t-Iibig

    Returns:
        dict | None: A dict with keys ``waybill_no``, ``carrier``,
        ``carrier_phone`` (each ``None`` when not found) and
        ``tracking_info`` (list of ``{'text', 'time'}`` dicts), or ``None``
        when loading or scraping the page raised an exception.

    Raises:
        Exception: Propagated from get_global_browser() when the browser
            cannot be started.
    """
    page = get_global_browser()

    try:
        print(f"\n正在打开物流追踪页面: {tracking_url}")
        page.get(tracking_url)

        print("页面加载中,请稍候...")
        time.sleep(5)  # wait for the page to load

        # Diagnostics: final URL, title and HTML size.
        current_url = page.url
        print(f"当前页面 URL: {current_url}")

        try:
            title = page.title
            print(f"页面标题: {title}")
        except Exception:
            print("无法获取页面标题")

        try:
            html_length = len(page.html)
            print(f"页面 HTML 长度: {html_length} 字符")
            if html_length < 100:
                print("⚠️ 警告: 页面内容可能未完全加载")
        except Exception as e:
            print(f"⚠️ 无法获取页面 HTML: {e}")

        result = {
            "waybill_no": None,  # waybill number
            "carrier": None,  # domestic carrier
            "carrier_phone": None,  # domestic carrier phone
            "tracking_info": [],  # tracking timeline entries
        }

        print("\n从 DOM 元素提取数据...")
        _extract_waybill_fields(page, result)

        print("\n提取物流跟踪信息...")
        result['tracking_info'] = _extract_tracking_info(page)
        if result['tracking_info']:
            print(f"✅ 找到 {len(result['tracking_info'])} 条物流跟踪信息")

        return result

    except Exception as e:
        print(f"提取物流信息时出错: {e}")
        traceback.print_exc()
        return None


def print_result(result):
    """Print an extraction result in a human-readable form.

    Args:
        result: Dict returned by extract_logistics_info(), or a falsy value
            when extraction failed.
    """
    if not result:
        print("未能提取到物流信息")
        return

    print("\n" + "=" * 50)
    print("物流信息提取结果:")
    print("=" * 50)

    # The keys always exist with value None when nothing was found, so a
    # .get() default would never apply; `or` supplies the placeholder.
    print(f"运单号: {result.get('waybill_no') or '未找到'}")
    print(f"国内承运人: {result.get('carrier') or '未找到'}")
    print(f"国内承运人电话: {result.get('carrier_phone') or '未找到'}")

    if result.get('tracking_info'):
        print(f"\n物流跟踪信息 (共 {len(result['tracking_info'])} 条):")
        for idx, info in enumerate(result['tracking_info'], 1):
            if isinstance(info, dict):
                text = info.get('text', str(info))
                time_str = info.get('time', '')
                print(f" {idx}. {text}")
                if time_str:
                    print(f" 时间: {time_str}")
            else:
                print(f" {idx}. {info}")
    else:
        print("\n物流跟踪信息: 未找到")

    print("=" * 50)


# =================== Flask API ===================

# Flask application.
app = Flask(__name__)

# Serialises access to the single shared browser instance.
fetch_lock = threading.Lock()


def _read_tracking_url():
    """Return the first non-empty tracking_url/url request parameter.

    POST requests are read from the JSON object or the form first; the query
    string is always consulted last. A malformed or non-object JSON body is
    treated as empty instead of raising.
    """
    candidates = []
    if request.method == 'POST':
        if request.is_json:
            payload = request.get_json(silent=True)
            if isinstance(payload, dict):
                candidates += [payload.get('tracking_url'), payload.get('url')]
        else:
            candidates += [request.form.get('tracking_url'), request.form.get('url')]
    candidates += [request.args.get('tracking_url'), request.args.get('url')]

    for candidate in candidates:
        if candidate:
            return candidate
    return None


@app.route('/fetch_logistics', methods=['GET', 'POST'])
def fetch_logistics():
    """Look up logistics information for a tracking page.

    Parameters (query string, form, or JSON body):
        tracking_url: Tracking page URL, e.g. https://3.cn/2t-Iibig
        url: Alias of tracking_url.

    Returns:
        A JSON response. On success (200): ``success``, ``message``, ``url``
        and ``data`` holding ``waybill_no``, ``carrier``, ``carrier_phone``,
        ``tracking_info`` and ``tracking_count``, plus an optional
        ``warning`` listing fields that were not found. 400 when the URL is
        missing or malformed; 500 when extraction fails.
    """
    tracking_url = _read_tracking_url()

    if not tracking_url:
        return jsonify({
            "success": False,
            "error": "缺少参数 tracking_url 或 url",
            "message": "请提供物流追踪页面 URL"
        }), 400

    # A JSON body may carry a non-string value; reject it as a format error
    # rather than crashing on .startswith().
    if isinstance(tracking_url, str):
        tracking_url = tracking_url.strip()
    if not (isinstance(tracking_url, str)
            and tracking_url.startswith(('http://', 'https://'))):
        return jsonify({
            "success": False,
            "error": "URL 格式错误",
            "message": "URL 必须以 http:// 或 https:// 开头"
        }), 400

    # NOTE(security): the server-side browser opens any http(s) URL the
    # caller supplies, so this endpoint can be used to reach internal hosts
    # (SSRF). Restrict the allowed hosts or keep the service off untrusted
    # networks.
    try:
        with fetch_lock:  # only one request may drive the browser at a time
            print(f"\n收到物流查询请求: {tracking_url}")
            result = extract_logistics_info(tracking_url)

        if not result:
            return jsonify({
                "success": False,
                "error": "提取失败",
                "message": "未能从页面中提取到物流信息",
                "url": tracking_url
            }), 500

        response_data = {
            "success": True,
            "message": "查询成功",
            "data": {
                "waybill_no": result.get('waybill_no'),
                "carrier": result.get('carrier'),
                "carrier_phone": result.get('carrier_phone'),
                "tracking_info": result.get('tracking_info', []),
                "tracking_count": len(result.get('tracking_info', []))
            },
            "url": tracking_url
        }

        # Tell the caller which key fields could not be found.
        missing_fields = [
            field for field in ('waybill_no', 'carrier') if not result.get(field)
        ]
        if missing_fields:
            response_data["warning"] = f"以下字段未找到: {', '.join(missing_fields)}"

        return jsonify(response_data), 200

    except Exception as e:
        print(f"查询物流信息时出错: {e}")
        traceback.print_exc()
        return jsonify({
            "success": False,
            "error": str(e),
            "message": "服务器内部错误",
            "url": tracking_url
        }), 500


@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint."""
    return jsonify({
        "status": "ok",
        "service": "京东物流信息查询服务",
        "version": "1.0.0"
    }), 200


@app.route('/', methods=['GET'])
def index():
    """Index endpoint returning API usage information."""
    return jsonify({
        "service": "京东物流信息查询 API",
        "version": "1.0.0",
        "endpoints": {
            "/fetch_logistics": {
                "method": ["GET", "POST"],
                "description": "查询物流信息",
                "parameters": {
                    "tracking_url": "物流追踪页面 URL(必需)",
                    "url": "tracking_url 的别名(可选)"
                },
                "example_get": "/fetch_logistics?tracking_url=https://3.cn/2t-Iibig",
                "example_post": "POST /fetch_logistics\n{\"tracking_url\": \"https://3.cn/2t-Iibig\"}"
            },
            "/health": {
                "method": ["GET"],
                "description": "健康检查"
            }
        }
    }), 200


# =================== Service entry point ===================

if __name__ == '__main__':
    # API service mode (default).
    print("=" * 60)
    print("京东物流信息查询 API 服务 (Ubuntu 版本)")
    print("=" * 60)
    print(f"无头模式: {'是' if USE_HEADLESS else '否'}")
    print("\n服务接口:")
    print(" GET/POST /fetch_logistics?tracking_url= - 查询物流信息")
    print(" GET /health - 健康检查")
    print(" GET / - API 说明")
    print("\n启动服务...")
    print("服务地址: http://0.0.0.0:5001")
    print("按 Ctrl+C 停止服务\n")

    try:
        app.run(host='0.0.0.0', port=5001, debug=False, threaded=True)
    except KeyboardInterrupt:
        print("\n\n服务已停止")
    finally:
        # Shut the shared browser down if it was ever started.
        if global_page is not None:
            try:
                global_page.quit()
                print("浏览器已关闭")
            except Exception as e:
                print(f"⚠️ 关闭浏览器失败: {e}")