# Source listing: 480 lines, 17 KiB, Python
import time
|
||
import json
|
||
import re
|
||
import os
|
||
import platform
|
||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||
|
||
# Well-known Chrome/Chromium install locations on Ubuntu, probed in order.
UBUNTU_CHROME_PATHS = [
    "/usr/bin/google-chrome",
    "/usr/bin/google-chrome-stable",
    "/usr/bin/chromium-browser",
    "/usr/bin/chromium",
    "/snap/bin/chromium",
    "/opt/google/chrome/chrome",
]

# Whether to run the browser headless.
#   True:  no UI; suited to server environments
#   False: with UI; requires X11 or Wayland
USE_HEADLESS = True  # adjust as needed

# Module-wide browser instance; stays None until a browser has been launched.
global_page = None
|
||
|
||
|
||
def find_chrome_path():
    """Locate a Chrome/Chromium executable on an Ubuntu system.

    The well-known locations listed in ``UBUNTU_CHROME_PATHS`` are probed
    first; after that ``google-chrome`` and ``chromium-browser`` are looked
    up on ``PATH``.

    Returns:
        str: Path of the first browser found. When nothing is found,
        ``/usr/bin/google-chrome`` is returned after printing installation
        hints, so the caller can still attempt a launch.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    print("正在查找 Chrome/Chromium 浏览器...")

    # Try the common install locations first.
    for candidate in UBUNTU_CHROME_PATHS:
        if os.path.exists(candidate):
            print(f"✅ 找到浏览器: {candidate}")
            return candidate

    # Fall back to a PATH lookup. shutil.which does this in-process, so no
    # external `which` binary is spawned and no blanket exception handler is
    # needed around it.
    for command in ('google-chrome', 'chromium-browser'):
        located = shutil.which(command)
        if located:
            print(f"✅ 通过 which 找到浏览器: {located}")
            return located

    # Nothing found: hand back the most common path and explain how to install.
    default_path = '/usr/bin/google-chrome'
    print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}")
    print("请确保已安装 Google Chrome 或 Chromium:")
    print(" sudo apt update")
    print(" sudo apt install -y google-chrome-stable")
    print(" 或者")
    print(" sudo apt install -y chromium-browser")
    return default_path
|
||
|
||
|
||
def get_global_browser():
    """Return the shared browser page, launching the browser on first use.

    On the first call a ``ChromiumOptions`` object is configured (browser
    path from ``find_chrome_path()``, headless mode according to
    ``USE_HEADLESS``, plus Linux-specific launch flags) and a
    ``ChromiumPage`` is created and stored in the module-level
    ``global_page``. Later calls return that same object.

    Returns:
        The module-wide ``ChromiumPage`` instance.

    Raises:
        Exception: Whatever ``ChromiumPage`` raised while launching; it is
        re-raised after troubleshooting hints and a traceback are printed.
    """
    global global_page

    # Reuse the existing instance when there is one.
    if global_page is not None:
        print("使用已存在的浏览器实例")
        return global_page

    print("="*60)
    print("Ubuntu 浏览器初始化")
    print("="*60)

    # Warn (but carry on) when not running on Linux.
    system_name = platform.system()
    if system_name != 'Linux':
        print(f"⚠️ 警告: 当前系统是 {system_name},此脚本专为 Ubuntu 设计")

    chrome_path = find_chrome_path()

    options = ChromiumOptions()
    options.set_browser_path(chrome_path)

    if USE_HEADLESS:
        print("配置为无头模式(headless)...")
        try:
            options.headless(True)
        except Exception as e:
            # Best-effort fallback (e.g. the headless() helper is missing):
            # request headless mode through the raw command-line flag.
            print(f"⚠️ options.headless() 不可用,改用启动参数: {e}")
            try:
                options.set_argument('--headless=new')
            except Exception as arg_error:
                print(f"⚠️ 设置启动参数 --headless=new 失败: {arg_error}")
    else:
        print("配置为有界面模式...")
        # A visible browser needs a display; warn when none is configured.
        if not os.environ.get('DISPLAY'):
            print("⚠️ 警告: 未检测到 DISPLAY 环境变量")
            print("如果无法显示浏览器,请:")
            print(" 1. 设置 USE_HEADLESS = True")
            print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0)")
            print(" 3. 或者使用 Xvfb(虚拟显示)")

    # Linux-specific launch flags. Still best-effort, but failures are now
    # reported instead of silently discarded.
    linux_flags = (
        '--no-sandbox',             # required in some environments
        '--disable-dev-shm-usage',  # avoid running out of /dev/shm space
        '--disable-gpu',            # optional; useful in headless mode
    )
    for flag in linux_flags:
        try:
            options.set_argument(flag)
        except Exception as e:
            print(f"⚠️ 设置启动参数 {flag} 失败: {e}")

    print("正在启动浏览器...")
    print(f"浏览器路径: {chrome_path}")
    print("模式: 无头模式(后台运行)" if USE_HEADLESS else "模式: 有界面模式")

    try:
        global_page = ChromiumPage(options)
        print("✅ 浏览器已成功启动!")
        time.sleep(2)  # give the browser time to finish starting
    except Exception as e:
        print(f"❌ 浏览器启动失败: {e}")
        print("\n可能的解决方案:")
        print("1. 确保已安装 Chrome/Chromium:")
        print(" sudo apt update")
        print(" sudo apt install -y google-chrome-stable")
        print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False")
        print("3. 确保有足够的权限")
        print("4. 检查是否缺少依赖:")
        print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2")
        import traceback
        traceback.print_exc()
        raise

    return global_page
|
||
|
||
|
||
# URL fragments that mark a captured request as a possible logistics API.
_API_URL_KEYWORDS = [
    'track', 'logistics', 'waybill', 'express',
    'delivery', '3.cn', 'jd.com/logistics',
    'api.m.jd.com', 'mapi.jd.com'
]

# Regexes tried in order against the page HTML; group 1 is the value.
_WAYBILL_PATTERNS = [
    r'运单号[::\s]*(\d+)',
    r'waybill[_\s]*no["\']?\s*[::]\s*["\']?(\d+)',
    r'tracking[_\s]*number["\']?\s*[::]\s*["\']?(\d+)',
    r'"waybillNo"\s*[::]\s*["\']?(\d+)',
    r'"trackingNumber"\s*[::]\s*["\']?(\d+)',
]
_CARRIER_PATTERNS = [
    r'国内承运人[::\s]*([^\s<,,]+)',
    r'carrier[::\s]*([^\s<,,]+)',
    r'"carrier"\s*[::]\s*["\']?([^"\']+)',
]
_PHONE_PATTERNS = [
    r'国内承运人电话[::\s]*(\d+)',
    r'carrier[_\s]*phone[::\s]*(\d+)',
    r'"carrierPhone"\s*[::]\s*["\']?(\d+)',
]

# Words that mark an element's text as a tracking event.
_STATUS_KEYWORDS = ['货物', '到达', '揽收', '运输', '配送', '签收']


def _first_regex_match(patterns, text):
    """Return group 1 of the first pattern that matches *text*, else None."""
    for pattern in patterns:
        hits = re.findall(pattern, text, re.IGNORECASE)
        if hits:
            return hits[0]
    return None


def _extract_from_packets(packets):
    """Return data parsed from the first logistics-looking JSON response, else None."""
    for packet in packets:
        url = getattr(packet, 'url', '')
        url_lower = url.lower()
        if not any(keyword in url_lower for keyword in _API_URL_KEYWORDS):
            continue

        print(f"发现可能的物流 API: {url[:100]}")
        try:
            response = getattr(packet, 'response', None)
            if not hasattr(response, 'body'):
                continue

            body = response.body
            if isinstance(body, str):
                try:
                    body = json.loads(body)
                except ValueError:
                    # Not JSON (HTML, JS, ...); json.JSONDecodeError is a
                    # ValueError subclass.
                    continue
            if not isinstance(body, (dict, list)):
                continue

            extracted = extract_from_json(body)
            if extracted:
                return extracted
        except Exception as e:
            print(f"解析 API 响应时出错: {e}")
    return None


def extract_logistics_info(tracking_url):
    """Extract waybill number, carrier and tracking events from a JD tracking page.

    Three strategies are tried in turn: (1) JSON responses captured from the
    page's network traffic, (2) regexes over the page HTML, (3) text of DOM
    elements. Strategy 1 returns immediately when it yields data.

    Args:
        tracking_url: URL of the logistics tracking page,
            e.g. https://3.cn/2t-Iibig

    Returns:
        dict | None: Dict with keys ``waybill_no``, ``carrier``,
        ``carrier_phone``, ``tracking_info`` (list) and ``raw_html`` (first
        5000 characters of the page, for debugging). ``None`` when an
        exception occurred during extraction; the error is printed.
    """
    page = get_global_browser()

    try:
        # Start capturing before navigation so requests issued while the
        # page loads are recorded, not only those triggered by scrolling.
        page.listen.start()

        print(f"\n正在打开物流追踪页面: {tracking_url}")
        page.get(tracking_url)
        print("页面加载中,请稍候...")
        time.sleep(5)  # wait for the page to load

        # Report where we ended up (short links redirect).
        current_url = page.url
        print(f"当前页面 URL: {current_url}")

        try:
            title = page.title
            print(f"页面标题: {title}")
        except Exception:
            print("无法获取页面标题")

        # Sanity-check that the page has content.
        try:
            html_length = len(page.html)
            print(f"页面 HTML 长度: {html_length} 字符")
            if html_length < 100:
                print("⚠️ 警告: 页面内容可能未完全加载")
        except Exception as e:
            print(f"⚠️ 无法获取页面 HTML: {e}")

        result = {
            "waybill_no": None,      # waybill number
            "carrier": None,         # domestic carrier
            "carrier_phone": None,   # domestic carrier phone
            "tracking_info": [],     # list of tracking events
            "raw_html": None         # raw HTML (for debugging)
        }

        # Strategy 1: look for a logistics data API in the network traffic.
        print("\n方法1: 监听网络请求...")

        # Scroll to trigger any lazily issued requests.
        page.scroll.down(300)
        time.sleep(2)
        page.scroll.to_bottom()
        time.sleep(3)

        # Drain the captured packets. steps() yields queued packets and
        # stops once none arrives within `timeout` seconds.
        packets = list(page.listen.steps(timeout=1))
        page.listen.stop()
        print(f"监听到 {len(packets)} 个请求")

        extracted = _extract_from_packets(packets)
        if extracted:
            result.update(extracted)
            print("成功从 API 响应中提取数据")
            return result

        # Strategy 2: regexes over the page HTML.
        print("\n方法2: 从页面 DOM 提取数据...")

        html = page.html
        result['raw_html'] = html[:5000]  # keep part of the HTML for debugging

        waybill = _first_regex_match(_WAYBILL_PATTERNS, html)
        if waybill is not None:
            result['waybill_no'] = waybill
            print(f"找到运单号: {result['waybill_no']}")

        carrier = _first_regex_match(_CARRIER_PATTERNS, html)
        if carrier is not None:
            result['carrier'] = carrier.strip()
            print(f"找到承运人: {result['carrier']}")

        phone = _first_regex_match(_PHONE_PATTERNS, html)
        if phone is not None:
            result['carrier_phone'] = phone
            print(f"找到承运人电话: {result['carrier_phone']}")

        # Strategy 3: text of DOM elements; only fills fields still empty.
        print("\n方法3: 从 DOM 元素提取数据...")

        waybill_elements = page.eles('xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]')
        for elem in waybill_elements:
            text = elem.text
            parent = elem.parent()
            parent_text = parent.text if parent else ""
            full_text = text + " " + parent_text

            # A run of 8+ digits is taken as the waybill number.
            numbers = re.findall(r'\d{8,}', full_text)
            if numbers and not result['waybill_no']:
                result['waybill_no'] = numbers[0]
                print(f"从元素文本中找到运单号: {result['waybill_no']}")

            if '承运人' in text and not result['carrier']:
                carrier_match = re.search(r'承运人[::\s]*([^\s<,,]+)', full_text)
                if carrier_match:
                    result['carrier'] = carrier_match.group(1).strip()
                    print(f"从元素文本中找到承运人: {result['carrier']}")

            if '电话' in text and not result['carrier_phone']:
                phone_match = re.search(r'电话[::\s]*(\d+)', full_text)
                if phone_match:
                    result['carrier_phone'] = phone_match.group(1)
                    print(f"从元素文本中找到电话: {result['carrier_phone']}")

        # Tracking events (timeline).
        print("\n提取物流跟踪信息...")
        tracking_elements = page.eles('xpath=//*[contains(@class, "track") or contains(@class, "logistics") or contains(@class, "timeline")]')

        if not tracking_elements:
            # Fall back to elements whose text looks like a dated event.
            # The year is computed at run time (current and previous year)
            # rather than hard-coded, so the fallback keeps working later.
            this_year = time.localtime().tm_year
            tracking_elements = page.eles(
                f'xpath=//*[contains(text(), "{this_year}") or contains(text(), "{this_year - 1}")'
                ' or contains(text(), "货物") or contains(text(), "到达")]'
            )

        tracking_info = []
        for elem in tracking_elements[:20]:  # cap the number inspected
            text = elem.text
            if not text or len(text) <= 5:
                continue
            time_match = re.search(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text)
            if time_match or any(keyword in text for keyword in _STATUS_KEYWORDS):
                tracking_info.append({
                    'text': text.strip(),
                    'time': time_match.group(1) if time_match else None
                })

        result['tracking_info'] = tracking_info[:10]  # keep at most 10 entries

        return result

    except Exception as e:
        print(f"提取物流信息时出错: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||
|
||
|
||
def extract_from_json(json_data):
    """Extract logistics fields from a decoded JSON payload.

    Nested dicts and lists are walked depth-first, in document order. A key
    is accepted only when its name matches one of the field's patterns
    (case-insensitive) AND its value has a usable type: a non-empty scalar
    for waybill/carrier/phone, a non-empty list or dict for tracking data.
    A matching key with an unusable value does not end the search.

    Args:
        json_data: Decoded JSON value (dict or list). Any other value
            yields ``None``.

    Returns:
        dict | None: Dict containing whichever of ``waybill_no``,
        ``carrier``, ``carrier_phone`` (all ``str``) and ``tracking_info``
        (``list``) were found, or ``None`` when nothing was found.
    """
    def is_scalar(value):
        """True for a non-blank string or a non-zero number (bools excluded)."""
        if isinstance(value, bool):
            return False
        if isinstance(value, str):
            return bool(value.strip())
        return isinstance(value, (int, float)) and bool(value)

    def is_container(value):
        """True for a non-empty list or dict."""
        return isinstance(value, (dict, list)) and bool(value)

    def search(node, key_patterns, accept):
        """Return the first acceptable value under a matching key, else None."""
        if isinstance(node, dict):
            for key, value in node.items():
                if accept(value) and any(
                    re.search(pattern, str(key), re.IGNORECASE)
                    for pattern in key_patterns
                ):
                    return value
                # Keep looking below this key, whether or not it matched.
                if isinstance(value, (dict, list)):
                    found = search(value, key_patterns, accept)
                    if found is not None:
                        return found
        elif isinstance(node, list):
            for item in node:
                found = search(item, key_patterns, accept)
                if found is not None:
                    return found
        return None

    result = {}

    # Waybill number.
    waybill = search(
        json_data,
        [r'waybill', r'tracking.*number', r'运单号', r'waybillNo'],
        is_scalar,
    )
    if waybill is not None:
        result['waybill_no'] = str(waybill)

    # Carrier name. The lookaheads keep keys such as "carrierPhone" from
    # being reported as the carrier.
    carrier = search(
        json_data,
        [r'carrier(?!.*(?:phone|tel|mobile))', r'承运人(?!.*电话)'],
        is_scalar,
    )
    if carrier is not None:
        result['carrier'] = str(carrier)

    # Carrier phone: carrier-specific keys first, then any "phone" key.
    phone = search(
        json_data,
        [r'carrier.*phone', r'承运人电话', r'carrierPhone'],
        is_scalar,
    )
    if phone is None:
        phone = search(json_data, [r'phone'], is_scalar)
    if phone is not None:
        result['carrier_phone'] = str(phone)

    # Tracking events: only list/dict values qualify, so a scalar such as
    # "trackingNumber" cannot end the search early.
    tracking = search(
        json_data,
        [r'track', r'logistics', r'物流', r'轨迹', r'history'],
        is_container,
    )
    if tracking is not None:
        result['tracking_info'] = tracking if isinstance(tracking, list) else [tracking]

    return result if result else None
|
||
|
||
|
||
def print_result(result):
    """Print the extracted logistics information in a readable layout.

    Args:
        result: Dict with optional keys ``waybill_no``, ``carrier``,
            ``carrier_phone`` and ``tracking_info`` (a list whose items are
            dicts with ``text``/``time`` keys, or arbitrary values). A falsy
            *result* prints a "nothing extracted" message and returns.
    """
    if not result:
        print("未能提取到物流信息")
        return

    not_found = '未找到'

    print("\n" + "="*50)
    print("物流信息提取结果:")
    print("="*50)
    # `or` rather than a .get() default: the keys are normally present with
    # a None value, which a default would not replace.
    print(f"运单号: {result.get('waybill_no') or not_found}")
    print(f"国内承运人: {result.get('carrier') or not_found}")
    print(f"国内承运人电话: {result.get('carrier_phone') or not_found}")

    entries = result.get('tracking_info')
    if entries:
        print(f"\n物流跟踪信息 (共 {len(entries)} 条):")
        for idx, info in enumerate(entries, 1):
            if isinstance(info, dict):
                # Dicts without a 'text' key are shown in full.
                text = info.get('text', str(info))
                time_str = info.get('time', '')
                print(f" {idx}. {text}")
                if time_str:
                    print(f" 时间: {time_str}")
            else:
                print(f" {idx}. {info}")
    else:
        print("\n物流跟踪信息: 未找到")

    print("="*50)
|
||
|
||
|
||
# 主程序
|
||
if __name__ == '__main__':
    # Script entry point: extract the logistics data for one test URL, print
    # it, save it to logistics_result.json, then shut the browser down.
    tracking_url = "https://3.cn/2t-Iibig"  # test URL

    print("="*60)
    print("京东物流信息提取工具 (Ubuntu 版本)")
    print("="*60)
    print(f"目标 URL: {tracking_url}")
    print(f"无头模式: {'是' if USE_HEADLESS else '否'}")
    print("开始提取物流信息...\n")

    try:
        try:
            result = extract_logistics_info(tracking_url)
        except Exception as e:
            print(f"\n❌ 执行过程中出错: {e}")
            import traceback
            traceback.print_exc()
            result = None

        if result:
            print_result(result)

            # Save the result to a file.
            output_file = "logistics_result.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"\n结果已保存到: {output_file}")
        else:
            print("提取失败")
    finally:
        # Close the browser this script launched so that no Chrome process
        # is left running after the script exits.
        if global_page is not None:
            try:
                global_page.quit()
            except Exception as e:
                print(f"⚠️ 关闭浏览器失败: {e}")

    print("\n脚本执行完成")
|
||
|