This commit is contained in:
van
2026-04-26 13:55:54 +08:00
commit 83c48dbed9
24 changed files with 1955 additions and 0 deletions

View File

@@ -0,0 +1,457 @@
import time
import json
import re
import os
import platform
import threading
from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions
# Common Chrome/Chromium install locations on Ubuntu, probed in this order
# by find_chrome_path().
UBUNTU_CHROME_PATHS = [
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium-browser',
'/usr/bin/chromium',
'/snap/bin/chromium',
'/opt/google/chrome/chrome',
]
# Whether to run the browser in headless mode.
# True: no UI, suitable for server environments
# False: with UI, requires X11 or Wayland
USE_HEADLESS = True # change as needed
# Listening port: when running several instances on an internal network, give
# each one its own port, e.g. LOGISTICS_PORT=5002. LOGISTICS_PORT takes
# precedence over PORT; the fallback is 5001.
LISTEN_PORT = int(os.environ.get('LOGISTICS_PORT', os.environ.get('PORT', '5001')))
# Module-wide browser instance; stays None until get_global_browser() launches it.
global_page = None
def find_chrome_path():
    """Locate a Chrome/Chromium executable on an Ubuntu system.

    The well-known install locations in ``UBUNTU_CHROME_PATHS`` are probed
    first; if none exists, ``PATH`` is searched for ``google-chrome`` and then
    ``chromium-browser``. When nothing is found, installation hints are
    printed and ``/usr/bin/google-chrome`` is returned anyway, so the caller
    always receives a path string.

    Returns:
        str: Path of the browser executable, or the default path when no
        browser could be located.
    """
    # Local import, mirroring the original's function-scope import.
    import shutil

    print("正在查找 Chrome/Chromium 浏览器...")

    # First try the common install paths.
    for path in UBUNTU_CHROME_PATHS:
        if os.path.exists(path):
            print(f"✅ 找到浏览器: {path}")
            return path

    # Then search PATH. shutil.which does what the external `which` command
    # did, without spawning a process or needing a catch-all except.
    for command in ('google-chrome', 'chromium-browser'):
        path = shutil.which(command)
        if path and os.path.exists(path):
            print(f"✅ 通过 which 找到浏览器: {path}")
            return path

    # Nothing found: fall back to the most common path and tell the user
    # how to install a browser.
    default_path = '/usr/bin/google-chrome'
    print(f"⚠️ 未找到浏览器,将使用默认路径: {default_path}")
    print("请确保已安装 Google Chrome 或 Chromium:")
    print(" sudo apt update")
    print(" sudo apt install -y google-chrome-stable")
    print(" 或者")
    print(" sudo apt install -y chromium-browser")
    return default_path
def get_global_browser():
    """Return the shared browser page, launching the browser on first use.

    On the first call this locates a Chrome/Chromium binary via
    ``find_chrome_path()``, builds ``ChromiumOptions`` (headless or not,
    depending on ``USE_HEADLESS``), starts a ``ChromiumPage`` and stores it in
    the module-level ``global_page``. Later calls return that same instance.

    Returns:
        The module-wide ``ChromiumPage`` instance.

    Raises:
        Exception: Whatever ``ChromiumPage(options)`` raised when the browser
        could not be started; troubleshooting hints and the traceback are
        printed before the exception is re-raised.
    """
    global global_page

    if global_page is not None:
        print("使用已存在的浏览器实例")
        return global_page

    print("=" * 60)
    print("Ubuntu 浏览器初始化")
    print("=" * 60)

    # Warn (but carry on) when not running on Linux.
    if platform.system() != 'Linux':
        print(f"⚠️ 警告: 当前系统是 {platform.system()},此脚本专为 Ubuntu 设计")

    chrome_path = find_chrome_path()
    options = ChromiumOptions()
    options.set_browser_path(chrome_path)

    if USE_HEADLESS:
        print("配置为无头模式headless...")
        try:
            options.headless(True)
        except Exception:
            # headless() unavailable or failing: fall back to raw
            # command-line switches. `except Exception` (not a bare except)
            # lets Ctrl+C / SystemExit propagate.
            try:
                options.set_argument('--headless=new')
                options.set_argument('--no-sandbox')
                options.set_argument('--disable-dev-shm-usage')
            except Exception as e:
                # Still best-effort, but no longer silent.
                print(f"⚠️ 无法设置无头模式参数: {e}")
    else:
        print("配置为有界面模式...")
        # A UI browser needs a display; warn when none is configured.
        if not os.environ.get('DISPLAY'):
            print("⚠️ 警告: 未检测到 DISPLAY 环境变量")
            print("如果无法显示浏览器,请:")
            print(" 1. 设置 USE_HEADLESS = True")
            print(" 2. 或者设置 DISPLAY 环境变量(如 DISPLAY=:0")
            print(" 3. 或者使用 Xvfb虚拟显示")

    # Linux-specific switches, applied in both modes (best-effort).
    try:
        options.set_argument('--no-sandbox')  # required in some environments
        options.set_argument('--disable-dev-shm-usage')  # avoid running out of /dev/shm
        options.set_argument('--disable-gpu')  # optional; useful when headless
    except Exception as e:
        print(f"⚠️ 无法设置浏览器启动参数: {e}")

    print("正在启动浏览器...")
    print(f"浏览器路径: {chrome_path}")
    if USE_HEADLESS:
        print("模式: 无头模式(后台运行)")
    else:
        print("模式: 有界面模式")

    try:
        global_page = ChromiumPage(options)
        print("✅ 浏览器已成功启动!")
        time.sleep(2)  # give the browser time to finish starting
    except Exception as e:
        print(f"❌ 浏览器启动失败: {e}")
        print("\n可能的解决方案:")
        print("1. 确保已安装 Chrome/Chromium:")
        print(" sudo apt update")
        print(" sudo apt install -y google-chrome-stable")
        print("2. 如果使用无头模式失败,尝试设置 USE_HEADLESS = False")
        print("3. 确保有足够的权限")
        print("4. 检查是否缺少依赖:")
        print(" sudo apt install -y libnss3 libatk-bridge2.0-0 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2")
        import traceback
        traceback.print_exc()
        raise

    return global_page
def extract_logistics_info(tracking_url):
    """Scrape waybill number, carrier and tracking events from a JD tracking page.

    The page is opened in the shared browser (``get_global_browser()``), given
    a fixed five seconds to load, and then mined through XPath queries and
    regular expressions.

    Args:
        tracking_url: URL of the logistics tracking page,
            e.g. https://3.cn/2t-Iibig

    Returns:
        dict | None: On success a dict with the keys ``waybill_no``,
        ``carrier``, ``carrier_phone`` (each ``None`` when not found) and
        ``tracking_info`` (a list of at most 10 ``{'text', 'time'}`` dicts).
        ``None`` when any exception occurs while loading or parsing the page;
        the exception is printed, not raised.
    """
    page = get_global_browser()

    # Patterns and keywords hoisted out of the loops below.
    waybill_re = re.compile(r'\d{8,}')
    carrier_re = re.compile(r'承运人[:\s]*([^\s<,]+)')
    phone_re = re.compile(r'电话[:\s]*(\d+)')
    timestamp_re = re.compile(r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})')
    event_keywords = ('货物', '到达', '揽收', '运输', '配送', '签收')

    try:
        print(f"\n正在打开物流追踪页面: {tracking_url}")
        page.get(tracking_url)
        print("页面加载中,请稍候...")
        time.sleep(5)  # wait for the page to load

        # Diagnostics: final URL (after redirects), title and HTML size.
        current_url = page.url
        print(f"当前页面 URL: {current_url}")
        try:
            title = page.title
            print(f"页面标题: {title}")
        except Exception:
            print("无法获取页面标题")
        try:
            html_length = len(page.html)
            print(f"页面 HTML 长度: {html_length} 字符")
            if html_length < 100:
                print("⚠️ 警告: 页面内容可能未完全加载")
        except Exception as e:
            print(f"⚠️ 无法获取页面 HTML: {e}")

        result = {
            "waybill_no": None,      # waybill number
            "carrier": None,         # domestic carrier
            "carrier_phone": None,   # domestic carrier phone
            "tracking_info": [],     # list of tracking events
        }

        # ---- Summary fields from DOM elements ----
        print("\n从 DOM 元素提取数据...")
        # NOTE(review): only elements whose text mentions the waybill are
        # selected, so carrier/phone are found only when such an element (or
        # its parent) also carries them — confirm against the real page.
        waybill_elements = page.eles('xpath=//*[contains(text(), "运单号") or contains(text(), "运单")]')
        for elem in waybill_elements:
            text = elem.text
            parent = elem.parent()
            parent_text = parent.text if parent else ""
            full_text = text + " " + parent_text

            # The first run of 8+ digits is taken as the waybill number.
            numbers = waybill_re.findall(full_text)
            if numbers and not result['waybill_no']:
                result['waybill_no'] = numbers[0]
                print(f"✅ 找到运单号: {result['waybill_no']}")

            # Carrier name
            if '承运人' in text and not result['carrier']:
                carrier_match = carrier_re.search(full_text)
                if carrier_match:
                    result['carrier'] = carrier_match.group(1).strip()
                    print(f"✅ 找到承运人: {result['carrier']}")

            # Carrier phone
            if '电话' in text and not result['carrier_phone']:
                phone_match = phone_re.search(full_text)
                if phone_match:
                    result['carrier_phone'] = phone_match.group(1)
                    print(f"✅ 找到承运人电话: {result['carrier_phone']}")

        # ---- Tracking timeline ----
        print("\n提取物流跟踪信息...")
        tracking_elements = page.eles('xpath=//*[contains(@class, "track") or contains(@class, "logistics") or contains(@class, "timeline")]')
        if not tracking_elements:
            # Fallback: elements that look like they contain a timestamp.
            # The year used to be hard-coded as "2025", which stopped
            # matching once the calendar moved on; use the current and the
            # previous year instead.
            this_year = time.localtime().tm_year
            year_tests = ' or '.join(
                f'contains(text(), "{year}")' for year in (this_year, this_year - 1)
            )
            tracking_elements = page.eles(
                f'xpath=//*[{year_tests} or contains(text(), "货物") or contains(text(), "到达")]'
            )

        tracking_info = []
        for elem in tracking_elements[:20]:  # cap the number of elements inspected
            text = elem.text
            if not text or len(text) <= 5:
                continue
            time_match = timestamp_re.search(text)
            if time_match or any(keyword in text for keyword in event_keywords):
                tracking_info.append({
                    'text': text.strip(),
                    'time': time_match.group(1) if time_match else None,
                })

        result['tracking_info'] = tracking_info[:10]  # keep at most 10 entries
        if result['tracking_info']:
            print(f"✅ 找到 {len(result['tracking_info'])} 条物流跟踪信息")
        return result
    except Exception as e:
        print(f"提取物流信息时出错: {e}")
        import traceback
        traceback.print_exc()
        return None
def print_result(result):
    """Print an extraction result in a human-readable layout.

    Args:
        result: Dict with the keys ``waybill_no``, ``carrier``,
            ``carrier_phone`` and ``tracking_info`` (as built by
            ``extract_logistics_info``), or a falsy value when extraction
            failed.

    Returns:
        None. Output goes to stdout.
    """
    if not result:
        print("未能提取到物流信息")
        return

    separator = "=" * 50
    print("\n" + separator)
    print("物流信息提取结果:")
    print(separator)

    # The extractor always sets these keys, using None for "not found", so
    # dict.get(key, default) would never apply the default and "None" would be
    # printed. `or` covers both a missing key and a None/empty value.
    print(f"运单号: {result.get('waybill_no') or '未找到'}")
    print(f"国内承运人: {result.get('carrier') or '未找到'}")
    print(f"国内承运人电话: {result.get('carrier_phone') or '未找到'}")

    tracking = result.get('tracking_info')
    if tracking:
        print(f"\n物流跟踪信息 (共 {len(tracking)} 条):")
        for idx, info in enumerate(tracking, 1):
            if isinstance(info, dict):
                text = info.get('text', str(info))
                time_str = info.get('time', '')
                print(f" {idx}. {text}")
                if time_str:
                    print(f" 时间: {time_str}")
            else:
                # Entries that are not dicts are printed as-is.
                print(f" {idx}. {info}")
    else:
        print("\n物流跟踪信息: 未找到")
    print(separator)
# =================== Flask API endpoints ===================
# Create the Flask application that the route decorators below attach to.
app = Flask(__name__)
# Lock that prevents concurrent access: fetch_logistics() holds it while a page
# is being scraped.
fetch_lock = threading.Lock()
@app.route('/fetch_logistics', methods=['GET', 'POST'])
def fetch_logistics():
    """Look up logistics information for a tracking page.

    Request parameters (GET query string, POST form, or POST JSON object):
        tracking_url: URL of the logistics tracking page,
            e.g. https://3.cn/2t-Iibig
        url: Alias for ``tracking_url``.

    Returns:
        A ``(json_response, status)`` pair.

        * 200 — ``success`` is True and ``data`` holds ``waybill_no``,
          ``carrier``, ``carrier_phone``, ``tracking_info`` and
          ``tracking_count``; a ``warning`` lists waybill/carrier fields that
          were not found.
        * 400 — the parameter is missing, or is not an http(s) URL string.
        * 500 — extraction returned nothing, or an unexpected error occurred.
    """
    # Read the parameter (GET and POST are both supported).
    if request.method == 'POST':
        if request.is_json:
            # silent=True turns a malformed body into None instead of an
            # exception, and the isinstance check covers valid JSON that is
            # not an object (list, string, null). Both previously crashed
            # here, outside the try block below.
            data = request.get_json(silent=True)
            if not isinstance(data, dict):
                data = {}
            tracking_url = data.get('tracking_url') or data.get('url')
        else:
            tracking_url = (
                request.form.get('tracking_url')
                or request.form.get('url')
                or request.args.get('tracking_url')
                or request.args.get('url')
            )
    else:
        tracking_url = request.args.get('tracking_url') or request.args.get('url')

    # Tolerate surrounding whitespace from copy-pasted URLs.
    if isinstance(tracking_url, str):
        tracking_url = tracking_url.strip()

    if not tracking_url:
        return jsonify({
            "success": False,
            "error": "缺少参数 tracking_url 或 url",
            "message": "请提供物流追踪页面 URL"
        }), 400

    # Validate the URL: a JSON body may carry a non-string value, which must
    # be rejected before calling str methods on it.
    if not isinstance(tracking_url, str) or not tracking_url.startswith(('http://', 'https://')):
        return jsonify({
            "success": False,
            "error": "URL 格式错误",
            "message": "URL 必须以 http:// 或 https:// 开头"
        }), 400

    # NOTE(review): the URL is opened by the server-side browser and no host
    # allow-list is applied here — confirm this endpoint is reachable only by
    # trusted callers.
    try:
        with fetch_lock:  # serialize use of the single shared browser
            print(f"\n收到物流查询请求: {tracking_url}")
            result = extract_logistics_info(tracking_url)

        if not result:
            return jsonify({
                "success": False,
                "error": "提取失败",
                "message": "未能从页面中提取到物流信息",
                "url": tracking_url
            }), 500

        # Build the response payload.
        tracking = result.get('tracking_info', [])
        response_data = {
            "success": True,
            "message": "查询成功",
            "data": {
                "waybill_no": result.get('waybill_no'),
                "carrier": result.get('carrier'),
                "carrier_phone": result.get('carrier_phone'),
                "tracking_info": tracking,
                "tracking_count": len(tracking)
            },
            "url": tracking_url
        }

        # Tell the caller which key fields could not be found.
        missing_fields = [
            field for field in ('waybill_no', 'carrier') if not result.get(field)
        ]
        if missing_fields:
            response_data["warning"] = f"以下字段未找到: {', '.join(missing_fields)}"
        return jsonify(response_data), 200
    except Exception as e:
        print(f"查询物流信息时出错: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({
            "success": False,
            "error": str(e),
            "message": "服务器内部错误",
            "url": tracking_url
        }), 500
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint: return a fixed service descriptor with HTTP 200."""
    descriptor = {
        "status": "ok",
        "service": "京东物流信息查询服务",
        "version": "1.0.0",
    }
    return jsonify(descriptor), 200
@app.route('/', methods=['GET'])
def index():
    """Landing page: return the API usage notes as JSON with HTTP 200."""
    # Description of the lookup endpoint.
    fetch_doc = {
        "method": ["GET", "POST"],
        "description": "查询物流信息",
        "parameters": {
            "tracking_url": "物流追踪页面 URL必需",
            "url": "tracking_url 的别名(可选)",
        },
        "example_get": "/fetch_logistics?tracking_url=https://3.cn/2t-Iibig",
        "example_post": "POST /fetch_logistics\n{\"tracking_url\": \"https://3.cn/2t-Iibig\"}",
    }
    # Description of the health-check endpoint.
    health_doc = {
        "method": ["GET"],
        "description": "健康检查",
    }
    usage = {
        "service": "京东物流信息查询 API",
        "version": "1.0.0",
        "endpoints": {
            "/fetch_logistics": fetch_doc,
            "/health": health_doc,
        },
    }
    return jsonify(usage), 200
# =================== Service start-up ===================
if __name__ == '__main__':
    # API service mode (default)
    banner = "=" * 60
    print(banner)
    print("京东物流信息查询 API 服务 (Ubuntu 版本)")
    print(banner)
    # Both branches of this conditional used to be empty strings, so the line
    # never showed the mode; print an explicit yes/no.
    print(f"无头模式: {'是' if USE_HEADLESS else '否'}")
    print("\n服务接口:")
    print(" GET/POST /fetch_logistics?tracking_url=<URL> - 查询物流信息")
    print(" GET /health - 健康检查")
    print(" GET / - API 说明")
    print("\n启动服务...")
    print(f"服务地址: http://0.0.0.0:{LISTEN_PORT} (环境变量 LOGISTICS_PORT / PORT 可覆盖)")
    print("按 Ctrl+C 停止服务\n")
    try:
        app.run(host='0.0.0.0', port=LISTEN_PORT, debug=False, threaded=True)
    except KeyboardInterrupt:
        print("\n\n服务已停止")
    finally:
        # Close the browser if one was launched during the run.
        browser = globals().get('global_page')
        if browser:
            try:
                browser.quit()
                print("浏览器已关闭")
            except Exception as e:
                # Shutdown stays best-effort, but the failure is reported
                # rather than silently swallowed by a bare except.
                print(f"⚠️ 关闭浏览器时出错: {e}")