This commit is contained in:
van
2026-04-26 13:39:19 +08:00
parent fa25bfd784
commit a89703ea72
12 changed files with 2154 additions and 25 deletions

382
jd/fetch_logistics.py Normal file
View File

@@ -0,0 +1,382 @@
import time
import json
import re
from DrissionPage import ChromiumPage, ChromiumOptions
# Path of the Chrome executable handed to DrissionPage (a Windows-style path).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# Module-wide browser page; stays None until get_global_browser() creates it.
global_page = None
def get_global_browser():
    """Return the shared browser page, launching Chrome on first use.

    The page is cached in the module-level ``global_page`` so later calls
    reuse the same browser instead of starting a new one.

    Returns:
        The ``ChromiumPage`` instance stored in ``global_page``.

    Raises:
        FileNotFoundError: ``CHROME_PATH`` does not exist on disk.
        Exception: whatever ``ChromiumPage`` raised while starting; it is
            printed with a traceback and then re-raised unchanged.
    """
    global global_page

    if global_page is not None:
        print("使用已存在的浏览器实例")
        return global_page

    # Function-scope imports: only needed on the (one-off) launch path.
    import os
    import traceback

    print("正在初始化浏览器...")
    print(f"浏览器路径: {CHROME_PATH}")
    if not os.path.exists(CHROME_PATH):
        raise FileNotFoundError(f"找不到 Chrome 浏览器,路径: {CHROME_PATH}")

    options = ChromiumOptions()
    options.set_browser_path(CHROME_PATH)

    # Maximizing the window is optional; a failure here must not block the
    # launch, but it is reported instead of being silently discarded.
    try:
        options.set_argument('--start-maximized')
    except Exception as e:
        print(f"⚠️ 无法设置窗口最大化参数,已忽略: {e}")

    print("正在启动浏览器,请稍候...")
    print("如果浏览器没有自动打开,请检查 Chrome 是否正确安装")
    try:
        global_page = ChromiumPage(options)
        print("✅ 浏览器已成功启动!")
        print(f"当前页面 URL: {global_page.url}")
        # Give the browser a moment to finish starting up.
        time.sleep(2)
    except Exception as e:
        print(f"❌ 浏览器启动失败: {e}")
        traceback.print_exc()
        raise

    return global_page
def _report_page_state(page):
    """Print the URL, title and HTML size of the loaded page for diagnostics."""
    print(f"当前页面 URL: {page.url}")
    try:
        print(f"页面标题: {page.title}")
    except Exception:
        print("无法获取页面标题")
    try:
        html_length = len(page.html)
        print(f"页面 HTML 长度: {html_length} 字符")
        if html_length < 100:
            print("⚠️ 警告: 页面内容可能未完全加载")
    except Exception as e:
        print(f"⚠️ 无法获取页面 HTML: {e}")


def _capture_from_network(page):
    """Strategy 1: scroll the page and parse captured API responses.

    Returns the dict produced by ``extract_from_json`` for the first
    matching response, or None when nothing usable was captured.
    """
    print("方法1: 监听网络请求...")
    page.listen.start()
    # Scrolling may trigger lazy requests that carry the logistics data.
    page.scroll.down(300)
    time.sleep(2)
    page.scroll.to_bottom()
    time.sleep(3)

    # NOTE(review): confirm that the installed DrissionPage version exposes
    # ``listen.get()``; if it does not, the caller falls back to DOM parsing.
    responses = page.listen.get()
    print(f"监听到 {len(responses)} 个请求")

    api_keywords = (
        'track', 'logistics', 'waybill', 'express',
        'delivery', '3.cn', 'jd.com/logistics',
        'api.m.jd.com', 'mapi.jd.com',
    )
    for resp in responses:
        url = getattr(resp, 'url', '') or ''
        url_lower = url.lower()
        if not any(keyword in url_lower for keyword in api_keywords):
            continue
        print(f"发现可能的物流 API: {url[:100]}")
        try:
            response = getattr(resp, 'response', None)
            if response is None or not hasattr(response, 'body'):
                continue
            body = response.body
            if isinstance(body, (dict, list)):
                json_data = body
            elif isinstance(body, str):
                try:
                    json_data = json.loads(body)
                except ValueError:
                    # Not JSON (json.JSONDecodeError is a ValueError); skip it.
                    continue
            else:
                continue
            extracted = extract_from_json(json_data)
            if extracted:
                print("成功从 API 响应中提取数据")
                return extracted
        except Exception as e:
            print(f"解析 API 响应时出错: {e}")
    return None


def _extract_from_html(html, result):
    """Strategy 2: fill ``result`` from regex matches on the raw HTML text.

    For each field the patterns are tried in order and the first match wins.
    """
    field_specs = (
        ('waybill_no', "找到运单号", [
            r'运单号[:\s]*(\d+)',
            r'waybill[_\s]*no["\']?\s*[:]\s*["\']?(\d+)',
            r'tracking[_\s]*number["\']?\s*[:]\s*["\']?(\d+)',
            r'"waybillNo"\s*[:]\s*["\']?(\d+)',
            r'"trackingNumber"\s*[:]\s*["\']?(\d+)',
        ]),
        ('carrier', "找到承运人", [
            r'国内承运人[:\s]*([^\s<,]+)',
            r'carrier[:\s]*([^\s<,]+)',
            r'"carrier"\s*[:]\s*["\']?([^"\']+)',
        ]),
        ('carrier_phone', "找到承运人电话", [
            r'国内承运人电话[:\s]*(\d+)',
            r'carrier[_\s]*phone[:\s]*(\d+)',
            r'"carrierPhone"\s*[:]\s*["\']?(\d+)',
        ]),
    )
    for field, label, patterns in field_specs:
        for pattern in patterns:
            match = re.search(pattern, html, re.IGNORECASE)
            if match:
                result[field] = match.group(1).strip()
                print(f"{label}: {result[field]}")
                break


def _extract_from_elements(page, result):
    """Strategy 3: fill still-empty fields of ``result`` from DOM element text."""
    waybill_elements = page.eles(
        'xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]'
    )
    for elem in waybill_elements:
        text = elem.text
        parent = elem.parent()
        parent_text = parent.text if parent else ""
        full_text = text + " " + parent_text

        # A run of 8+ digits near the label is taken as the waybill number.
        numbers = re.findall(r'\d{8,}', full_text)
        if numbers and not result['waybill_no']:
            result['waybill_no'] = numbers[0]
            print(f"从元素文本中找到运单号: {result['waybill_no']}")

        if '承运人' in text and not result['carrier']:
            carrier_match = re.search(r'承运人[:\s]*([^\s<,]+)', full_text)
            if carrier_match:
                result['carrier'] = carrier_match.group(1).strip()
                print(f"从元素文本中找到承运人: {result['carrier']}")

        if '电话' in text and not result['carrier_phone']:
            phone_match = re.search(r'电话[:\s]*(\d+)', full_text)
            if phone_match:
                result['carrier_phone'] = phone_match.group(1)
                print(f"从元素文本中找到电话: {result['carrier_phone']}")


def _extract_tracking_timeline(page):
    """Collect up to 10 timeline entries as ``{'text': ..., 'time': ...}`` dicts."""
    tracking_elements = page.eles(
        'xpath=//*[contains(@class, "track") or contains(@class, "logistics") '
        'or contains(@class, "timeline")]'
    )
    if not tracking_elements:
        # Fall back to elements whose text looks like a timeline entry. The
        # year is taken from the clock (current and previous year) rather
        # than a fixed literal, so the fallback keeps working over time.
        this_year = time.localtime().tm_year
        year_tests = " or ".join(
            f'contains(text(), "{year}")' for year in (this_year, this_year - 1)
        )
        tracking_elements = page.eles(
            f'xpath=//*[{year_tests} or contains(text(), "货物") '
            f'or contains(text(), "到达")]'
        )

    status_keywords = ('货物', '到达', '揽收', '运输', '配送', '签收')
    tracking_info = []
    for elem in tracking_elements[:20]:  # cap the number of elements inspected
        text = elem.text
        if not text or len(text) <= 5:
            continue
        time_match = re.search(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})', text)
        if time_match or any(keyword in text for keyword in status_keywords):
            tracking_info.append({
                'text': text.strip(),
                'time': time_match.group(1) if time_match else None,
            })
    return tracking_info[:10]  # keep at most 10 entries


def extract_logistics_info(tracking_url):
    """Extract waybill number, carrier and tracking data from a JD tracking page.

    Three strategies are tried in order: (1) captured network responses,
    (2) regexes over the page HTML, (3) text of DOM elements. A failure of
    the network strategy is reported and the DOM strategies still run.

    Args:
        tracking_url: URL of the logistics tracking page,
            e.g. https://3.cn/2t-Iibig

    Returns:
        dict with keys ``waybill_no``, ``carrier``, ``carrier_phone``,
        ``tracking_info`` (list) and ``raw_html`` (first 5000 characters of
        the page, for debugging; only set when the DOM strategies ran), or
        None when extraction raised an unexpected error.
    """
    page = get_global_browser()
    try:
        print(f"\n正在打开物流追踪页面: {tracking_url}")
        page.get(tracking_url)
        print("页面加载中,请稍候...")
        time.sleep(5)  # wait for the page to load
        _report_page_state(page)

        result = {
            "waybill_no": None,      # waybill number
            "carrier": None,         # domestic carrier
            "carrier_phone": None,   # domestic carrier phone
            "tracking_info": [],     # list of tracking entries
            "raw_html": None,        # raw HTML excerpt for debugging
        }

        # Strategy 1 is best-effort: if listening fails, continue with the DOM.
        try:
            captured = _capture_from_network(page)
        except Exception as e:
            print(f"方法1 监听网络请求失败,继续使用页面解析: {e}")
            captured = None
        if captured:
            result.update(captured)
            return result

        print("\n方法2: 从页面 DOM 提取数据...")
        html = page.html
        result['raw_html'] = html[:5000]
        _extract_from_html(html, result)

        print("\n方法3: 从 DOM 元素提取数据...")
        _extract_from_elements(page, result)

        print("\n提取物流跟踪信息...")
        result['tracking_info'] = _extract_tracking_timeline(page)
        return result
    except Exception as e:
        print(f"提取物流信息时出错: {e}")
        import traceback
        traceback.print_exc()
        return None
def extract_from_json(json_data):
    """Extract logistics fields from decoded JSON data.

    The structure is searched depth-first for keys whose names match a set
    of case-insensitive regex patterns. Scalar fields (waybill number,
    carrier, carrier phone) only accept non-empty str/int/float values, and
    the carrier search skips phone-like keys, so ``carrierPhone`` is never
    reported as the carrier. The tracking search only accepts non-empty
    dict/list values, so a scalar such as ``trackingNumber`` does not hide
    the real track list.

    Args:
        json_data: decoded JSON (normally a dict; lists are searched too).

    Returns:
        dict with any of the keys ``waybill_no``, ``carrier``,
        ``carrier_phone`` (all str) and ``tracking_info`` (list), or None
        when nothing was found.
    """
    def is_scalar(value):
        # bool is an int subclass but is never a meaningful field value here.
        return (isinstance(value, (str, int, float))
                and not isinstance(value, bool)
                and bool(value))

    def is_container(value):
        return isinstance(value, (dict, list)) and bool(value)

    def key_matches(key, patterns):
        return any(re.search(p, key, re.IGNORECASE) for p in patterns)

    def search(node, key_patterns, accept, exclude_patterns=()):
        """Return the first accepted value under a matching key, else None."""
        if isinstance(node, dict):
            for key, value in node.items():
                key = str(key)
                if (key_matches(key, key_patterns)
                        and not key_matches(key, exclude_patterns)
                        and accept(value)):
                    return value
                # Keep descending, including into matched-but-rejected
                # containers such as {"waybillInfo": {"waybillNo": ...}}.
                if isinstance(value, (dict, list)):
                    found = search(value, key_patterns, accept, exclude_patterns)
                    if found is not None:
                        return found
        elif isinstance(node, list):
            for item in node:
                found = search(item, key_patterns, accept, exclude_patterns)
                if found is not None:
                    return found
        return None

    scalar_fields = (
        ('waybill_no',
         [r'waybill', r'tracking.*number', r'运单号', r'waybillNo'],
         ()),
        ('carrier',
         [r'carrier', r'承运人', r'carrierName'],
         (r'phone', r'电话')),
        ('carrier_phone',
         [r'carrier.*phone', r'承运人电话', r'carrierPhone', r'phone'],
         ()),
    )

    result = {}
    for field, patterns, excluded in scalar_fields:
        value = search(json_data, patterns, is_scalar, excluded)
        if value is not None:
            result[field] = str(value)

    tracking = search(
        json_data,
        [r'track', r'logistics', r'物流', r'轨迹', r'history'],
        is_container,
    )
    if tracking is not None:
        result['tracking_info'] = tracking if isinstance(tracking, list) else [tracking]

    return result or None
def print_result(result):
    """Print the extracted logistics information in a readable layout.

    Fields that are missing or empty (including keys present with a None
    value) are shown as '未找到' rather than the literal text "None".

    Args:
        result: dict as returned by extract_logistics_info, or a falsy
            value when extraction failed.
    """
    if not result:
        print("未能提取到物流信息")
        return

    divider = "=" * 50
    print("\n" + divider)
    print("物流信息提取结果:")
    print(divider)

    labelled_fields = (
        ("运单号", 'waybill_no'),
        ("国内承运人", 'carrier'),
        ("国内承运人电话", 'carrier_phone'),
    )
    for label, key in labelled_fields:
        # `or` (not a .get default) so that stored None values fall back too.
        print(f"{label}: {result.get(key) or '未找到'}")

    tracking = result.get('tracking_info')
    if tracking:
        print(f"\n物流跟踪信息 (共 {len(tracking)} 条):")
        for idx, info in enumerate(tracking, 1):
            if isinstance(info, dict):
                # Entries without a 'text' key are shown in full.
                text = info.get('text', str(info))
                time_str = info.get('time', '')
                print(f" {idx}. {text}")
                if time_str:
                    print(f" 时间: {time_str}")
            else:
                print(f" {idx}. {info}")
    else:
        print("\n物流跟踪信息: 未找到")
    print(divider)
# Script entry point
if __name__ == '__main__':
    # Sample tracking link used for a manual run
    tracking_url = "https://3.cn/2t-Iibig"

    banner = "=" * 60
    print(banner)
    print("京东物流信息提取工具")
    print(banner)
    print(f"目标 URL: {tracking_url}")
    print("开始提取物流信息...\n")

    # Stays None if extraction raises, so the failure branch below is taken.
    result = None
    try:
        result = extract_logistics_info(tracking_url)
    except Exception as e:
        print(f"\n❌ 执行过程中出错: {e}")
        import traceback
        traceback.print_exc()

    if not result:
        print("提取失败")
    else:
        print_result(result)
        # Persist the result as JSON next to the script's working directory.
        output_file = "logistics_result.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(result, ensure_ascii=False, indent=2))
        print(f"\n结果已保存到: {output_file}")

    print("\n脚本执行完成,浏览器保持打开状态用于调试")