# Code-viewer metadata for this script: 155 lines, 5.4 KiB, Python
import json
import re
import time
import traceback

from DrissionPage import ChromiumPage, ChromiumOptions

# Exploratory script: opens a logistics tracking page in Chrome through
# DrissionPage and tries three ways of locating the tracking data
# (DOM elements, captured network traffic, JSON/regex matches in the HTML).
# Everything found is printed to the console for manual inspection.

# Path of the Chrome executable handed to DrissionPage.
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# URL of the logistics tracking page.
TRACKING_URL = "https://3.cn/2t-Iibig"

# Upper bound on the number of captured packets drained from the listener,
# so a page that keeps polling cannot make the drain loop run forever.
MAX_PACKETS = 500

# Seconds without a new packet after which draining the listener stops.
PACKET_IDLE_TIMEOUT = 1


def _print_response_body(body):
    """Print a truncated preview of a captured response body.

    Only ``dict`` and ``str`` bodies are printed; any other type is ignored.
    A ``str`` body is additionally parsed as JSON and, when that succeeds,
    pretty-printed (first 1000 characters).
    """
    if isinstance(body, dict):
        print(f" 响应数据 (前500字符): {str(body)[:500]}")
        print(" 完整的 JSON 数据:")
        print(json.dumps(body, indent=2, ensure_ascii=False)[:1000])
    elif isinstance(body, str):
        print(f" 响应数据 (前500字符): {body[:500]}")
        try:
            json_data = json.loads(body)
        except ValueError:
            # Not JSON (json.JSONDecodeError is a ValueError); the raw
            # preview printed above is all there is to show.
            return
        print(" 解析后的 JSON (前1000字符):")
        print(json.dumps(json_data, indent=2, ensure_ascii=False)[:1000])


# Configure the browser and create the page object.
options = ChromiumOptions()
options.set_browser_path(CHROME_PATH)
page = ChromiumPage(options)

try:
    print("正在打开物流追踪页面...")
    page.get(TRACKING_URL)

    # Fixed wait to give the page time to load.
    time.sleep(5)

    print("\n=== 方法1: 尝试从页面元素提取信息 ===")

    # Elements whose own text mentions the waybill-number label.
    waybill_elements = page.eles('xpath=//*[contains(text(), "运单号")]')
    if waybill_elements:
        print(f"找到运单号相关元素: {len(waybill_elements)} 个")
        for elem in waybill_elements:
            print(f" 文本: {elem.text}")
            # The value may sit next to the label, so show the parent's text too.
            parent = elem.parent()
            if parent:
                print(f" 父元素文本: {parent.text[:100]}")

    # Elements whose own text mentions the carrier label.
    carrier_elements = page.eles('xpath=//*[contains(text(), "承运人")]')
    if carrier_elements:
        print(f"\n找到承运人相关元素: {len(carrier_elements)} 个")
        for elem in carrier_elements:
            print(f" 文本: {elem.text}")

    print("\n=== 方法2: 监听网络请求,查找数据接口 ===")

    # NOTE(review): the listener is started after the page has loaded, so
    # requests made during the initial load are not captured; only traffic
    # triggered by the scrolling below is seen.
    print("开始监听网络请求...")
    page.listen.start()

    # Scroll to trigger any lazily-issued requests.
    page.scroll.down(500)
    time.sleep(3)
    page.scroll.to_bottom()
    time.sleep(5)

    # Drain every packet captured so far. steps() yields queued packets and
    # stops once no new packet arrives within the timeout or the count is hit.
    all_responses = list(
        page.listen.steps(count=MAX_PACKETS, timeout=PACKET_IDLE_TIMEOUT)
    )
    page.listen.stop()
    print(f"\n共监听到 {len(all_responses)} 个请求")

    # URL substrings that flag a request as possibly carrying tracking data.
    keywords = ['track', 'logistics', 'waybill', 'express', 'delivery', '3.cn', 'jd.com', 'json', 'api']

    for idx, resp in enumerate(all_responses, start=1):
        url = getattr(resp, 'url', '') or ''
        print(f"\n请求 {idx}:")
        print(f" URL: {url[:150]}")

        url_lower = url.lower()
        if not any(keyword in url_lower for keyword in keywords):
            continue

        print(" ⭐ 可能相关的请求!")
        try:
            response = getattr(resp, 'response', None)
            if response is not None and hasattr(response, 'body'):
                _print_response_body(response.body)
        except Exception as e:
            # Best effort: one unreadable packet must not abort the whole scan.
            print(f" 解析响应时出错: {e}")

    print("\n=== 方法3: 提取页面 HTML 中的 JSON 数据 ===")

    html = page.html

    # Candidate patterns for JSON blobs / fields embedded in the page source.
    json_patterns = [
        r'window\.__INITIAL_STATE__\s*=\s*({.+?});',
        r'var\s+trackData\s*=\s*({.+?});',
        r'const\s+trackingInfo\s*=\s*({.+?});',
        r'data\s*:\s*({.+?})',
        r'"waybillNo"[:\s]+"([^"]+)"',
        r'"trackingNumber"[:\s]+"([^"]+)"',
    ]

    for pattern in json_patterns:
        matches = re.findall(pattern, html, re.DOTALL)
        if matches:
            print(f"\n找到匹配模式 {pattern}:")
            for match in matches[:3]:  # show at most the first three hits
                print(f" 匹配: {str(match)[:200]}")

    print("\n=== 尝试提取页面中的所有文本内容 ===")
    # Reuse the HTML fetched above instead of reading page.html a second time.
    page_text = html

    # The label patterns accept both the ASCII colon and the full-width
    # colon (:) as the separator between label and value.

    # Waybill number: digits following the label.
    waybill_pattern = r'运单号[::\s]*(\d+)'
    waybill_matches = re.findall(waybill_pattern, page_text)
    if waybill_matches:
        print(f"找到运单号: {waybill_matches}")

    # Carrier: the run of non-space, non-'<' characters following the label.
    carrier_pattern = r'国内承运人[::\s]*([^\s<]+)'
    carrier_matches = re.findall(carrier_pattern, page_text)
    if carrier_matches:
        print(f"找到承运人: {carrier_matches}")

    # Carrier phone number: digits following the label.
    phone_pattern = r'国内承运人电话[::\s]*(\d+)'
    phone_matches = re.findall(phone_pattern, page_text)
    if phone_matches:
        print(f"找到电话: {phone_matches}")

    print("\n=== 等待用户查看页面 ===")
    print("页面已打开,请手动检查浏览器中的网络请求(F12 -> Network),查找包含物流数据的 API")
    print("按 Enter 键继续或等待 60 秒后自动关闭...")

    try:
        input()
    except EOFError:
        # No interactive stdin: fall back to a fixed wait. KeyboardInterrupt
        # is deliberately not caught here so Ctrl+C reaches the handler below.
        time.sleep(60)

except KeyboardInterrupt:
    print("\n用户中断脚本执行")
except Exception as e:
    print(f"\n发生错误: {e}")
    traceback.print_exc()
finally:
    print("\n脚本执行完成,浏览器保持打开状态用于调试")
    # The browser is intentionally left open; uncomment to close it on exit.
    # page.quit()