This commit is contained in:
van
2026-04-26 13:39:19 +08:00
parent fa25bfd784
commit a89703ea72
12 changed files with 2154 additions and 25 deletions

154
jd/logistics.py Normal file
View File

@@ -0,0 +1,154 @@
"""Exploratory scraper: probe a JD logistics tracking page for shipment data.

Opens the short-link tracking page in Chrome (via DrissionPage) and tries
three extraction strategies in turn, printing whatever each one finds:
  1. locate "waybill number" / "carrier" labels directly in the DOM,
  2. sniff network traffic for a JSON data API,
  3. regex-scan the raw HTML for embedded JSON blobs and key fields.
The browser is intentionally left open afterwards for manual debugging.
"""
import time
import json
import re
import traceback

from DrissionPage import ChromiumPage, ChromiumOptions

# Path to the local Chrome executable that DrissionPage should drive.
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# JD short-link to the logistics tracking page under investigation.
TRACKING_URL = "https://3.cn/2t-Iibig"

# Configure and launch the browser.
options = ChromiumOptions()
options.set_browser_path(CHROME_PATH)
page = ChromiumPage(options)

try:
    print("正在打开物流追踪页面...")
    page.get(TRACKING_URL)
    # Give the page (and any client-side rendering) time to settle.
    time.sleep(5)

    print("\n=== 方法1: 尝试从页面元素提取信息 ===")
    # Waybill number: any element whose text contains the label.
    waybill_elements = page.eles('xpath=//*[contains(text(), "运单号")]')
    if waybill_elements:
        print(f"找到运单号相关元素: {len(waybill_elements)}")
        for elem in waybill_elements:
            print(f" 文本: {elem.text}")
            # The actual value often sits in the parent node, next to the label.
            parent = elem.parent()
            if parent:
                print(f" 父元素文本: {parent.text[:100]}")

    # Carrier: same label-based lookup.
    carrier_elements = page.eles('xpath=//*[contains(text(), "承运人")]')
    if carrier_elements:
        print(f"\n找到承运人相关元素: {len(carrier_elements)}")
        for elem in carrier_elements:
            print(f" 文本: {elem.text}")

    print("\n=== 方法2: 监听网络请求,查找数据接口 ===")
    print("开始监听网络请求...")
    page.listen.start()
    # Scroll to trigger any lazy-loaded XHR requests.
    page.scroll.down(500)
    time.sleep(3)
    page.scroll.to_bottom()
    time.sleep(5)

    # Collect everything captured so far.
    # NOTE(review): verify against the installed DrissionPage version —
    # the listener API (listen.get vs listen.steps/listen.wait) has changed
    # across releases.
    all_responses = page.listen.get()
    print(f"\n共监听到 {len(all_responses)} 个请求")

    # URLs containing any of these fragments are likely the data endpoint.
    keywords = ['track', 'logistics', 'waybill', 'express', 'delivery',
                '3.cn', 'jd.com', 'json', 'api']
    for idx, resp in enumerate(all_responses):
        url = resp.url if hasattr(resp, 'url') else ''
        print(f"\n请求 {idx + 1}:")
        print(f" URL: {url[:150]}")
        url_lower = url.lower()
        if any(keyword in url_lower for keyword in keywords):
            print(f" ⭐ 可能相关的请求!")
            try:
                if hasattr(resp, 'response') and hasattr(resp.response, 'body'):
                    body = resp.response.body
                    if isinstance(body, dict):
                        # Already decoded by DrissionPage — dump it directly.
                        print(f" 响应数据 (前500字符): {str(body)[:500]}")
                        print(f" 完整的 JSON 数据:")
                        print(json.dumps(body, indent=2, ensure_ascii=False)[:1000])
                    elif isinstance(body, str):
                        print(f" 响应数据 (前500字符): {body[:500]}")
                        # A string body may still be JSON — try to decode it.
                        try:
                            json_data = json.loads(body)
                            print(f" 解析后的 JSON (前1000字符):")
                            print(json.dumps(json_data, indent=2, ensure_ascii=False)[:1000])
                        except json.JSONDecodeError:
                            # Not JSON (HTML, JS, ...) — the raw prefix printed
                            # above is all we can show. Was a bare `except:`.
                            pass
            except Exception as e:
                # Best-effort probing: report the failure and move on to the
                # next captured response.
                print(f" 解析响应时出错: {e}")

    print("\n=== 方法3: 提取页面 HTML 中的 JSON 数据 ===")
    html = page.html
    # Common patterns for state blobs / tracking fields embedded in <script> tags.
    json_patterns = [
        r'window\.__INITIAL_STATE__\s*=\s*({.+?});',
        r'var\s+trackData\s*=\s*({.+?});',
        r'const\s+trackingInfo\s*=\s*({.+?});',
        r'data\s*:\s*({.+?})',
        r'"waybillNo"[:\s]+"([^"]+)"',
        r'"trackingNumber"[:\s]+"([^"]+)"',
    ]
    for pattern in json_patterns:
        matches = re.findall(pattern, html, re.DOTALL)
        if matches:
            print(f"\n找到匹配模式 {pattern}:")
            for match in matches[:3]:  # show at most three hits per pattern
                print(f" 匹配: {str(match)[:200]}")

    print("\n=== 尝试提取页面中的所有文本内容 ===")
    page_text = page.html
    # Waybill number: digits following the label.
    waybill_pattern = r'运单号[:\s]*(\d+)'
    waybill_matches = re.findall(waybill_pattern, page_text)
    if waybill_matches:
        print(f"找到运单号: {waybill_matches}")
    # Domestic carrier name: everything up to whitespace or a tag.
    carrier_pattern = r'国内承运人[:\s]*([^\s<]+)'
    carrier_matches = re.findall(carrier_pattern, page_text)
    if carrier_matches:
        print(f"找到承运人: {carrier_matches}")
    # Domestic carrier phone number.
    phone_pattern = r'国内承运人电话[:\s]*(\d+)'
    phone_matches = re.findall(phone_pattern, page_text)
    if phone_matches:
        print(f"找到电话: {phone_matches}")

    print("\n=== 等待用户查看页面 ===")
    print("页面已打开请手动检查浏览器中的网络请求F12 -> Network查找包含物流数据的 API")
    print("按 Enter 键继续或等待 60 秒后自动关闭...")
    try:
        input()
    except EOFError:
        # No interactive stdin (IDE runner, cron, piped input) — fall back to
        # a fixed wait. This was a bare `except:`, which also swallowed
        # KeyboardInterrupt and forced a 60-second hang on Ctrl-C; the
        # interrupt now reaches the handler below instead.
        time.sleep(60)
except KeyboardInterrupt:
    print("\n用户中断脚本执行")
except Exception as e:
    print(f"\n发生错误: {e}")
    traceback.print_exc()
finally:
    print("\n脚本执行完成,浏览器保持打开状态用于调试")
    # Deliberately NOT quitting so the page stays open for manual inspection.
    # page.quit()