Files
tb_pl/jd/logistics.py
2026-04-26 13:39:19 +08:00

155 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import time
import json
import re
from DrissionPage import ChromiumPage, ChromiumOptions
# Absolute path of the Chrome executable that DrissionPage should drive.
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Short link of the logistics tracking page to inspect.
TRACKING_URL = "https://3.cn/2t-Iibig"

# Build the browser configuration; set_browser_path() hands back the options
# object it was called on, so the path can be applied in the same expression.
options = ChromiumOptions().set_browser_path(CHROME_PATH)

# Launch (or attach to) the browser described by `options`. This happens at
# import time, as a module-level side effect.
page = ChromiumPage(options)
# Exploratory scrape of the tracking page. Four independent strategies are
# tried in order and everything found is printed for manual inspection:
#   1. look for label elements in the rendered DOM,
#   2. dump the network traffic captured while the page loaded and scrolled,
#   3. regex-scan the HTML for embedded JSON blobs,
#   4. regex-scan the HTML for labelled values (waybill no., carrier, phone).
# Relies on the module-level `page` and `TRACKING_URL`.
try:
    # The listener only records requests issued after start(), so it must be
    # running before navigation; otherwise the load-time API calls (the most
    # likely carriers of the tracking data) are never seen.
    print("开始监听网络请求...")
    page.listen.start()

    print("正在打开物流追踪页面...")
    page.get(TRACKING_URL)
    # Fixed pause to give client-side rendering time to finish.
    time.sleep(5)

    print("\n=== 方法1: 尝试从页面元素提取信息 ===")
    # Elements whose own text mentions the waybill-number label.
    waybill_elements = page.eles('xpath=//*[contains(text(), "运单号")]')
    if waybill_elements:
        print(f"找到运单号相关元素: {len(waybill_elements)}")
        for elem in waybill_elements:
            print(f" 文本: {elem.text}")
            # The value often sits next to the label, so show the parent too.
            parent = elem.parent()
            if parent:
                print(f" 父元素文本: {parent.text[:100]}")

    # Elements whose own text mentions the carrier label.
    carrier_elements = page.eles('xpath=//*[contains(text(), "承运人")]')
    if carrier_elements:
        print(f"\n找到承运人相关元素: {len(carrier_elements)}")
        for elem in carrier_elements:
            print(f" 文本: {elem.text}")

    print("\n=== 方法2: 监听网络请求,查找数据接口 ===")
    # Scroll to trigger any lazily-loaded requests, pausing so they complete.
    page.scroll.down(500)
    time.sleep(3)
    page.scroll.to_bottom()
    time.sleep(5)

    # Drain the captured packets. steps() yields packets as they become
    # available and ends after `timeout` seconds without a new one; `count`
    # caps the total so a page that polls forever cannot stall the script.
    all_responses = list(page.listen.steps(count=500, timeout=2))
    page.listen.stop()
    print(f"\n共监听到 {len(all_responses)} 个请求")

    # URL substrings that hint at a logistics/data endpoint.
    keywords = ['track', 'logistics', 'waybill', 'express', 'delivery', '3.cn', 'jd.com', 'json', 'api']
    for idx, resp in enumerate(all_responses, start=1):
        url = getattr(resp, 'url', '')
        print(f"\n请求 {idx}:")
        print(f" URL: {url[:150]}")

        url_lower = url.lower()
        if not any(keyword in url_lower for keyword in keywords):
            continue
        print(" ⭐ 可能相关的请求!")

        try:
            # A packet may lack a response (or a body); treat that as "nothing
            # to show" rather than an error.
            body = getattr(getattr(resp, 'response', None), 'body', None)
            if isinstance(body, (dict, list)):
                # Body was already decoded as JSON by the listener.
                print(f" 响应数据 (前500字符): {str(body)[:500]}")
                print(" 完整的 JSON 数据:")
                print(json.dumps(body, indent=2, ensure_ascii=False)[:1000])
            elif isinstance(body, str):
                print(f" 响应数据 (前500字符): {body[:500]}")
                # The text may still be JSON served with a non-JSON
                # content type; pretty-print it when it parses.
                try:
                    json_data = json.loads(body)
                except ValueError:
                    # Not JSON - the raw preview above is all we can show.
                    pass
                else:
                    print(" 解析后的 JSON (前1000字符):")
                    print(json.dumps(json_data, indent=2, ensure_ascii=False)[:1000])
        except Exception as e:
            # Best effort: one unreadable packet must not abort the dump.
            print(f" 解析响应时出错: {e}")

    print("\n=== 方法3: 提取页面 HTML 中的 JSON 数据 ===")
    html = page.html
    # Common ways pages embed their state in inline scripts, plus two direct
    # key lookups for the waybill / tracking number.
    json_patterns = [
        r'window\.__INITIAL_STATE__\s*=\s*({.+?});',
        r'var\s+trackData\s*=\s*({.+?});',
        r'const\s+trackingInfo\s*=\s*({.+?});',
        r'data\s*:\s*({.+?})',
        r'"waybillNo"[:\s]+"([^"]+)"',
        r'"trackingNumber"[:\s]+"([^"]+)"',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, html, re.DOTALL)
        if matches:
            print(f"\n找到匹配模式 {pattern}:")
            for match in matches[:3]:  # show only the first three hits
                print(f" 匹配: {str(match)[:200]}")

    print("\n=== 尝试提取页面中的所有文本内容 ===")
    # (message prefix, pattern) pairs for values that follow a visible label.
    # The HTML fetched above is reused instead of being requested again.
    labelled_patterns = [
        ("找到运单号", r'运单号[:\s]*(\d+)'),
        ("找到承运人", r'国内承运人[:\s]*([^\s<]+)'),
        ("找到电话", r'国内承运人电话[:\s]*(\d+)'),
    ]
    for prefix, pattern in labelled_patterns:
        found = re.findall(pattern, html)
        if found:
            print(f"{prefix}: {found}")

    print("\n=== 等待用户查看页面 ===")
    print("页面已打开请手动检查浏览器中的网络请求F12 -> Network查找包含物流数据的 API")
    print("按 Enter 键继续或等待 60 秒后自动关闭...")
    try:
        input()
    except EOFError:
        # No interactive stdin (e.g. piped or scheduled run): keep the page
        # up for a minute instead. Ctrl+C is deliberately not caught here so
        # it reaches the KeyboardInterrupt handler below.
        time.sleep(60)
except KeyboardInterrupt:
    print("\n用户中断脚本执行")
except Exception as e:
    # Top-level boundary: report the failure with a full traceback.
    print(f"\n发生错误: {e}")
    import traceback
    traceback.print_exc()
finally:
    print("\n脚本执行完成,浏览器保持打开状态用于调试")
    # The browser is intentionally left open; uncomment to close it instead.
    # page.quit()