"""Extract waybill, carrier and tracking details from a JD logistics page.

The script drives a Chrome/Chromium browser through DrissionPage and targets
Ubuntu hosts. Run it directly; the tracking URL may be passed as the first
command-line argument, otherwise DEFAULT_TRACKING_URL is used.
"""
import time
import json
import re
import os
import platform
import shutil
import sys
import traceback

from DrissionPage import ChromiumPage, ChromiumOptions

# Common Chrome/Chromium install locations on Ubuntu.
UBUNTU_CHROME_PATHS = [
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]

# Executable names looked up on PATH when none of the fixed paths exist.
CHROME_BINARY_NAMES = (
    'google-chrome',
    'google-chrome-stable',
    'chromium-browser',
    'chromium',
)

# Whether to run the browser headless.
# True: no UI, suitable for servers.
# False: with UI, needs X11 or Wayland.
USE_HEADLESS = True  # change as needed

# URL used when none is given on the command line.
DEFAULT_TRACKING_URL = "https://3.cn/2t-Iibig"

# Substrings that mark a captured request URL as a possible logistics API.
API_URL_KEYWORDS = (
    'track', 'logistics', 'waybill', 'express', 'delivery',
    '3.cn', 'jd.com/logistics', 'api.m.jd.com', 'mapi.jd.com',
)

# Upper bound on captured packets read per page, so a page that keeps polling
# cannot keep the reader busy forever.
MAX_LISTEN_PACKETS = 200

# Seconds without a new packet after which packet collection stops.
LISTEN_IDLE_TIMEOUT = 1

# Label/value regexes applied to the page HTML, tried in order. The separator
# classes accept both ':' and the full-width colon (U+FF1A), and value classes
# stop at both ',' and the full-width comma (U+FF0C).
WAYBILL_PATTERNS = [
    r'运单号[:\uff1a\s]*(\d+)',
    r'waybill[_\s]*no["\']?\s*[:\uff1a]\s*["\']?(\d+)',
    r'tracking[_\s]*number["\']?\s*[:\uff1a]\s*["\']?(\d+)',
    r'"waybillNo"\s*[:\uff1a]\s*["\']?(\d+)',
    r'"trackingNumber"\s*[:\uff1a]\s*["\']?(\d+)',
]

# The negative lookaheads keep the carrier patterns from matching the carrier
# *phone* label; the quoted JSON form is tried before the loose English form,
# which requires a separator so it cannot capture the tail of a longer key.
CARRIER_PATTERNS = [
    r'国内承运人(?!电话)[:\uff1a\s]*([^\s<,\uff0c]+)',
    r'"carrier"\s*[:\uff1a]\s*["\']?([^"\']+)',
    r'carrier(?![_\s]*phone)[:\uff1a\s]+([^\s<,\uff0c]+)',
]

PHONE_PATTERNS = [
    r'国内承运人电话[:\uff1a\s]*(\d+)',
    r'carrier[_\s]*phone[:\uff1a\s]*(\d+)',
    r'"carrierPhone"\s*[:\uff1a]\s*["\']?(\d+)',
]

# Keys that hold a phone number; skipped when searching JSON for the carrier.
_PHONE_KEY = re.compile(r'phone|电话', re.IGNORECASE)

# Shared browser instance, created lazily by get_global_browser().
global_page = None


def find_chrome_path():
    """Locate a Chrome/Chromium executable on an Ubuntu system.

    Checks the fixed paths in UBUNTU_CHROME_PATHS first, then looks up the
    names in CHROME_BINARY_NAMES on PATH.

    Returns:
        str: Path of the first browser found, or '/usr/bin/google-chrome'
        (after printing install hints) when nothing is found.
    """
    print("正在查找 Chrome/Chromium 浏览器...")

    for path in UBUNTU_CHROME_PATHS:
        if os.path.exists(path):
            print(f"✅ 找到浏览器: {path}")
            return path

    # shutil.which performs the PATH lookup without spawning a process.
    for name in CHROME_BINARY_NAMES:
        path = shutil.which(name)
        if path:
            print(f"✅ 通过 which 找到浏览器: {path}")
            return path

    # Nothing found: fall back to the most common path.
    default_path = '/usr/bin/google-chrome'
    print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}")
    print("请确保已安装 Google Chrome 或 Chromium:")
    print(" sudo apt update")
    print(" sudo apt install -y google-chrome-stable")
    print(" 或者")
    print(" sudo apt install -y chromium-browser")
    return default_path


def _set_arguments(options, arguments):
    """Apply each Chrome switch to *options*, warning instead of failing."""
    for argument in arguments:
        try:
            options.set_argument(argument)
        except Exception as e:
            print(f"⚠️ 无法设置浏览器参数 {argument}: {e}")


def _build_browser_options(chrome_path):
    """Build the ChromiumOptions for *chrome_path* according to USE_HEADLESS."""
    options = ChromiumOptions()
    options.set_browser_path(chrome_path)

    if USE_HEADLESS:
        print("配置为无头模式(headless)...")
        try:
            options.headless(True)
        except Exception:
            # If the headless() helper is unavailable, use the raw switch.
            _set_arguments(options, ['--headless=new'])
    else:
        print("配置为有界面模式...")
        # A visible browser needs a display.
        if not os.environ.get('DISPLAY'):
            print("⚠️ 警告: 未检测到 DISPLAY 环境变量")
            print("如果无法显示浏览器,请:")
            print(" 1. 设置 USE_HEADLESS = True")
            print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0)")
            print(" 3. 或者使用 Xvfb(虚拟显示)")

    # Linux-specific switches.
    _set_arguments(options, [
        '--no-sandbox',             # required in some environments
        '--disable-dev-shm-usage',  # avoid running out of /dev/shm space
        '--disable-gpu',            # optional; useful in headless mode
    ])
    return options


def get_global_browser():
    """Return the shared browser page, launching the browser on first use.

    Returns:
        ChromiumPage: The module-wide page stored in ``global_page``.

    Raises:
        Exception: Whatever ChromiumPage raises when the browser cannot be
            started; troubleshooting hints are printed before re-raising.
    """
    global global_page

    if global_page is not None:
        print("使用已存在的浏览器实例")
        return global_page

    print("=" * 60)
    print("Ubuntu 浏览器初始化")
    print("=" * 60)

    if platform.system() != 'Linux':
        print(f"⚠️ 警告: 当前系统是 {platform.system()},此脚本专为 Ubuntu 设计")

    chrome_path = find_chrome_path()
    options = _build_browser_options(chrome_path)

    print("正在启动浏览器...")
    print(f"浏览器路径: {chrome_path}")
    if USE_HEADLESS:
        print("模式: 无头模式(后台运行)")
    else:
        print("模式: 有界面模式")

    try:
        global_page = ChromiumPage(options)
        print("✅ 浏览器已成功启动!")
        time.sleep(2)  # give the browser time to finish starting
    except Exception as e:
        print(f"❌ 浏览器启动失败: {e}")
        print("\n可能的解决方案:")
        print("1. 确保已安装 Chrome/Chromium:")
        print(" sudo apt update")
        print(" sudo apt install -y google-chrome-stable")
        print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False")
        print("3. 确保有足够的权限")
        print("4. 检查是否缺少依赖:")
        print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2")
        traceback.print_exc()
        raise

    return global_page


def close_global_browser():
    """Quit the shared browser, if one was started, and forget the instance."""
    global global_page

    if global_page is None:
        return
    try:
        global_page.quit()
    except Exception as e:
        print(f"⚠️ 关闭浏览器时出错: {e}")
    finally:
        global_page = None


def _start_listening(page):
    """Start network capture on *page*; return False if it cannot be started."""
    try:
        page.listen.start()
        return True
    except Exception as e:
        print(f"⚠️ 无法启动网络监听: {e}")
        return False


def _report_page_state(page):
    """Print the URL, title and HTML size of the loaded page for diagnostics."""
    print(f"当前页面 URL: {page.url}")

    try:
        print(f"页面标题: {page.title}")
    except Exception:
        print("无法获取页面标题")

    try:
        html_length = len(page.html)
        print(f"页面 HTML 长度: {html_length} 字符")
        if html_length < 100:
            print("⚠️ 警告: 页面内容可能未完全加载")
    except Exception as e:
        print(f"⚠️ 无法获取页面 HTML: {e}")


def _collect_packets(page):
    """Scroll the page to trigger lazy requests, then drain captured packets."""
    page.scroll.down(300)
    time.sleep(2)
    page.scroll.to_bottom()
    time.sleep(3)
    try:
        return list(page.listen.steps(count=MAX_LISTEN_PACKETS,
                                      timeout=LISTEN_IDLE_TIMEOUT))
    finally:
        page.listen.stop()


def _packet_json(packet):
    """Return the packet's response body as parsed JSON, or None."""
    response = getattr(packet, 'response', None)
    body = getattr(response, 'body', None)
    if isinstance(body, (dict, list)):
        return body
    if isinstance(body, str):
        try:
            return json.loads(body)
        except ValueError:
            return None
    return None


def _extract_from_network(page):
    """Merge logistics fields found in captured API responses.

    Returns:
        dict: Fields accepted from API responses; earlier packets win. Empty
        when nothing usable was captured.
    """
    merged = {}
    packets = _collect_packets(page)
    print(f"监听到 {len(packets)} 个请求")

    for packet in packets:
        url = getattr(packet, 'url', '') or ''
        url_lower = url.lower()
        if not any(keyword in url_lower for keyword in API_URL_KEYWORDS):
            continue

        print(f"发现可能的物流 API: {url[:100]}")
        try:
            json_data = _packet_json(packet)
            if json_data is None:
                continue
            extracted = extract_from_json(json_data)
        except Exception as e:
            print(f"解析 API 响应时出错: {e}")
            continue

        if extracted:
            for key, value in extracted.items():
                merged.setdefault(key, value)
            if merged.get('waybill_no'):
                break

    return merged


def _first_match(patterns, text):
    """Return the first capture of the first pattern that matches *text*."""
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)
    return None


def _extract_from_html(html, result):
    """Fill fields still missing in *result* by running regexes on *html*."""
    if not result['waybill_no']:
        waybill = _first_match(WAYBILL_PATTERNS, html)
        if waybill:
            result['waybill_no'] = waybill
            print(f"找到运单号: {result['waybill_no']}")

    if not result['carrier']:
        carrier = _first_match(CARRIER_PATTERNS, html)
        if carrier:
            result['carrier'] = carrier.strip()
            print(f"找到承运人: {result['carrier']}")

    if not result['carrier_phone']:
        phone = _first_match(PHONE_PATTERNS, html)
        if phone:
            result['carrier_phone'] = phone
            print(f"找到承运人电话: {result['carrier_phone']}")


def _extract_from_dom(page, result):
    """Fill fields still missing in *result* from waybill-labelled elements."""
    waybill_elements = page.eles('xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]')
    for elem in waybill_elements:
        text = elem.text
        parent = elem.parent()
        parent_text = parent.text if parent else ""
        full_text = text + " " + parent_text

        # Treat a run of 8+ digits near the label as the waybill number.
        if not result['waybill_no']:
            numbers = re.findall(r'\d{8,}', full_text)
            if numbers:
                result['waybill_no'] = numbers[0]
                print(f"从元素文本中找到运单号: {result['waybill_no']}")

        if '承运人' in text and not result['carrier']:
            carrier_match = re.search(
                r'承运人(?!电话)[:\uff1a\s]*([^\s<,\uff0c]+)', full_text)
            if carrier_match:
                result['carrier'] = carrier_match.group(1).strip()
                print(f"从元素文本中找到承运人: {result['carrier']}")

        if '电话' in text and not result['carrier_phone']:
            phone_match = re.search(r'电话[:\uff1a\s]*(\d+)', full_text)
            if phone_match:
                result['carrier_phone'] = phone_match.group(1)
                print(f"从元素文本中找到电话: {result['carrier_phone']}")


def _extract_tracking_from_dom(page):
    """Collect up to 10 tracking timeline entries from the page DOM."""
    tracking_elements = page.eles('xpath=//*[contains(@class, "track") or contains(@class, "logistics") or contains(@class, "timeline")]')
    if not tracking_elements:
        # Fall back to elements that mention the current year or typical
        # tracking wording.
        year = time.strftime('%Y')
        tracking_elements = page.eles(f'xpath=//*[contains(text(), "{year}") or contains(text(), "货物") or contains(text(), "到达")]')

    keywords = ('货物', '到达', '揽收', '运输', '配送', '签收')
    tracking_info = []
    for elem in tracking_elements[:20]:  # cap the number of elements inspected
        text = elem.text
        if not text or len(text) <= 5:
            continue
        time_match = re.search(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text)
        if time_match or any(keyword in text for keyword in keywords):
            tracking_info.append({
                'text': text.strip(),
                'time': time_match.group(1) if time_match else None,
            })
    return tracking_info[:10]  # keep at most 10 entries


def extract_logistics_info(tracking_url):
    """Extract the waybill number, carrier and tracking data from a JD page.

    Three sources are tried in order: JSON API responses captured while the
    page loads and scrolls, regexes over the page HTML, and the text of DOM
    elements. Later sources only fill fields the earlier ones left empty.

    Args:
        tracking_url: Logistics tracking page URL, e.g. https://3.cn/2t-Iibig

    Returns:
        dict | None: Dict with keys ``waybill_no``, ``carrier``,
        ``carrier_phone``, ``tracking_info`` and ``raw_html``, or None when
        extraction raised an error.
    """
    page = get_global_browser()

    try:
        # Capture must be running before navigation, otherwise the requests
        # made while the page loads are missed.
        listening = _start_listening(page)

        print(f"\n正在打开物流追踪页面: {tracking_url}")
        page.get(tracking_url)
        print("页面加载中,请稍候...")
        time.sleep(5)  # wait for the page to load

        _report_page_state(page)

        result = {
            "waybill_no": None,     # waybill number
            "carrier": None,        # domestic carrier
            "carrier_phone": None,  # domestic carrier phone
            "tracking_info": [],    # list of tracking entries
            "raw_html": None,       # leading part of the HTML, for debugging
        }

        # Method 1: look for logistics data in captured network responses.
        print("\n方法1: 监听网络请求...")
        api_data = {}
        if listening:
            try:
                api_data = _extract_from_network(page)
            except Exception as e:
                # A listener problem must not abort the HTML/DOM fallbacks.
                print(f"⚠️ 网络监听失败,改用页面解析: {e}")
        if api_data:
            result.update(api_data)
            print("成功从 API 响应中提取数据")
            if result['waybill_no']:
                return result

        # Method 2: regexes over the page HTML.
        print("\n方法2: 从页面 DOM 提取数据...")
        html = page.html
        result['raw_html'] = html[:5000]
        _extract_from_html(html, result)

        # Method 3: text of DOM elements.
        print("\n方法3: 从 DOM 元素提取数据...")
        _extract_from_dom(page, result)

        # Tracking timeline, unless the API already supplied one.
        if not result['tracking_info']:
            print("\n提取物流跟踪信息...")
            result['tracking_info'] = _extract_tracking_from_dom(page)

        return result

    except Exception as e:
        print(f"提取物流信息时出错: {e}")
        traceback.print_exc()
        return None


def _is_scalar(value):
    """Return True for a non-blank string or a non-zero, non-bool number."""
    if isinstance(value, str):
        return bool(value.strip())
    if isinstance(value, bool):
        return False
    return isinstance(value, (int, float)) and bool(value)


def _is_container(value):
    """Return True for a non-empty list or dict."""
    return isinstance(value, (list, dict)) and bool(value)


def _search_json(node, key_regex, accept, skip_key=None):
    """Depth-first search for the first accepted value under a matching key."""
    if isinstance(node, dict):
        for key, value in node.items():
            if (isinstance(key, str)
                    and key_regex.search(key)
                    and not (skip_key and skip_key.search(key))
                    and accept(value)):
                return value
            found = _search_json(value, key_regex, accept, skip_key)
            if found is not None:
                return found
    elif isinstance(node, list):
        for item in node:
            found = _search_json(item, key_regex, accept, skip_key)
            if found is not None:
                return found
    return None


def _find_json_value(data, key_patterns, accept, skip_key=None):
    """Search *data* with each pattern in priority order; first hit wins."""
    for pattern in key_patterns:
        found = _search_json(data, re.compile(pattern, re.IGNORECASE),
                             accept, skip_key)
        if found is not None:
            return found
    return None


def extract_from_json(json_data):
    """Extract logistics fields from parsed JSON.

    Keys are matched case-insensitively, most specific pattern first. Scalar
    fields only accept non-empty scalar values and the tracking field only
    accepts a non-empty list or dict, so a key such as ``carrierPhone`` is
    not taken for the carrier and ``trackingNumber`` is not taken for the
    tracking list.

    Args:
        json_data: Parsed JSON (dict or list, possibly nested).

    Returns:
        dict | None: Any of ``waybill_no``, ``carrier``, ``carrier_phone``
        (strings) and ``tracking_info`` (list) that were found, or None when
        nothing was found.
    """
    result = {}

    waybill = _find_json_value(
        json_data,
        [r'waybillNo', r'waybill', r'tracking.*number', r'运单号'],
        _is_scalar)
    if waybill is not None:
        result['waybill_no'] = str(waybill)

    carrier = _find_json_value(
        json_data,
        [r'carrierName', r'carrier', r'承运人'],
        _is_scalar, skip_key=_PHONE_KEY)
    if carrier is not None:
        result['carrier'] = str(carrier)

    phone = _find_json_value(
        json_data,
        [r'carrierPhone', r'carrier.*phone', r'承运人电话', r'phone'],
        _is_scalar)
    if phone is not None:
        result['carrier_phone'] = str(phone)

    tracking = _find_json_value(
        json_data,
        [r'track', r'logistics', r'物流', r'轨迹', r'history'],
        _is_container)
    if isinstance(tracking, list):
        result['tracking_info'] = tracking
    elif isinstance(tracking, dict):
        result['tracking_info'] = [tracking]

    return result if result else None


def print_result(result):
    """Print an extraction result in human-readable form.

    Args:
        result: Dict returned by extract_logistics_info, or a falsy value
            when extraction failed.
    """
    if not result:
        print("未能提取到物流信息")
        return

    print("\n" + "=" * 50)
    print("物流信息提取结果:")
    print("=" * 50)
    # `or` rather than a .get() default: the keys exist with value None.
    print(f"运单号: {result.get('waybill_no') or '未找到'}")
    print(f"国内承运人: {result.get('carrier') or '未找到'}")
    print(f"国内承运人电话: {result.get('carrier_phone') or '未找到'}")

    tracking_info = result.get('tracking_info')
    if tracking_info:
        print(f"\n物流跟踪信息 (共 {len(tracking_info)} 条):")
        for idx, info in enumerate(tracking_info, 1):
            if isinstance(info, dict):
                text = info.get('text', str(info))
                time_str = info.get('time', '')
                print(f" {idx}. {text}")
                if time_str:
                    print(f" 时间: {time_str}")
            else:
                print(f" {idx}. {info}")
    else:
        print("\n物流跟踪信息: 未找到")
    print("=" * 50)


def main(tracking_url=None):
    """Run one extraction, print the result and save it as JSON.

    Args:
        tracking_url: URL to process. Defaults to the first command-line
            argument, or DEFAULT_TRACKING_URL when none is given.
    """
    if tracking_url is None:
        tracking_url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_TRACKING_URL

    print("=" * 60)
    print("京东物流信息提取工具 (Ubuntu 版本)")
    print("=" * 60)
    print(f"目标 URL: {tracking_url}")
    print(f"无头模式: {'是' if USE_HEADLESS else '否'}")
    print("开始提取物流信息...\n")

    try:
        result = extract_logistics_info(tracking_url)
    except Exception as e:
        print(f"\n❌ 执行过程中出错: {e}")
        traceback.print_exc()
        result = None
    finally:
        # Do not leave a browser process running after the script exits.
        close_global_browser()

    if result:
        print_result(result)
        # Save the result to a file.
        output_file = "logistics_result.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"\n结果已保存到: {output_file}")
    else:
        print("提取失败")

    print("\n脚本执行完成")


if __name__ == '__main__':
    main()