commit a89703ea72 (parent fa25bfd784)
Author: van
Date:   2026-04-26 13:39:19 +08:00

12 changed files with 2154 additions and 25 deletions

jd/jd.py · 206 changed lines

@@ -6,8 +6,7 @@ import threading
from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import declarative_base, sessionmaker
# =================== Configuration ===================
# Browser path (adjust to your local installation)
@@ -27,6 +26,14 @@ app = Flask(__name__)
# Initialize the lock
fetch_lock = threading.Lock()
# Global crawler control flags
crawler_running = False
crawler_thread = None
current_product_id = None
# product_id of the fetch task currently "allowed to run"; a new request overwrites it, and old threads exit once they detect a mismatch
active_fetch_product_id = None
# Initialize the database connection
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
@@ -65,16 +72,27 @@ def get_global_browser():
return global_page
def _is_fetch_cancelled(product_id):
"""当前任务是否已被新请求取消(只保留最新请求的 product_id"""
global active_fetch_product_id
return active_fetch_product_id is not None and active_fetch_product_id != product_id
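# --- Editor's sketch (not part of this commit): the two definitions above implement
# --- cooperative cancellation via a shared token. Writers set active_fetch_product_id under
# --- fetch_lock; workers poll it between slow steps and exit when it no longer matches their
# --- own id. The same pattern in miniature, with illustrative names:
_active_task = None
_task_lock = threading.Lock()

def _run_cancellable(task_id, steps):
    """Run steps, bailing out early if a newer task has claimed the token."""
    global _active_task
    with _task_lock:
        _active_task = task_id        # newest task wins; older workers observe the change
    for step in steps:
        if _active_task != task_id:   # polled between slow steps, like _is_fetch_cancelled
            return False              # superseded by a newer task: exit quietly
        step()
    return True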
def fetch_jd_comments(product_id):
global active_fetch_product_id
page = get_global_browser() # reuse the global browser
try:
# Open the product page
page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
time.sleep(random.uniform(5, 8))
if _is_fetch_cancelled(product_id):
return 0
# Scroll the main page down
page.scroll.down(150)
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Click "买家赞不绝口" (buyers' praise section)
element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
@@ -86,16 +104,20 @@ def fetch_jd_comments(product_id):
if element1:
element1.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Click "当前商品" (current product tab)
element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
if element2:
element2.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Locate the popup area
popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
if not popup:
return []
return 0
# Click "视频" (video tab)
element3 = page.ele('xpath=//div[contains(text(), "视频")]')
@@ -103,20 +125,28 @@ def fetch_jd_comments(product_id):
element3.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Listen for network requests
page.listen.start('https://api.m.jd.com/client.action')
max_retries = 10 # max consecutive rounds with no new data
retry_count = 0
new_comments = [] # accumulated new comments
seen_ids = set() # comment_ids already processed
total_comments_saved = 0 # running total of comments saved
while retry_count < max_retries and len(new_comments) < 10:
# Keep fetching comments until cancelled by a newer request or stopped manually
while True:
if _is_fetch_cancelled(product_id):
print(f"[fetch_jd_comments] 商品 {product_id} 已被新请求取消,退出")
break
scroll_amount = random.randint(10000, 100000)
popup.scroll.down(scroll_amount)
print(f"弹窗向下滚动了 {scroll_amount} 像素")
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
break
resp = page.listen.wait(timeout=5)
if resp and 'getCommentListPage' in resp.request.postData:
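# --- Editor's note: the code elided between these hunks parses the captured packet and
# --- de-duplicates against seen_ids before the block below runs. A hedged sketch, assuming
# --- DrissionPage's resp.response.body yields the parsed JSON; the 'comments' and 'id'
# --- field names are guesses, not confirmed payload keys.
# body = resp.response.body  # dict for JSON responses
# fresh_comments = []
# for c in (body or {}).get('comments', []):
#     cid = c.get('id')
#     if cid and cid not in seen_ids:
#         seen_ids.add(cid)
#         fresh_comments.append(c)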
@@ -161,6 +191,12 @@ def fetch_jd_comments(product_id):
print(f"本次获取到 {len(fresh_comments)} 条新评论")
new_comments.extend(fresh_comments)
retry_count = 0 # 有新数据,重置重试计数器
# 立即保存这批评论到数据库
save_comments_to_db(product_id, fresh_comments)
total_comments_saved += len(fresh_comments)
print(f"已保存 {len(fresh_comments)} 条评论到数据库,总计保存 {total_comments_saved} 条评论")
else:
print("本次无新评论,继续滚动...")
retry_count += 1
@@ -173,16 +209,35 @@ def fetch_jd_comments(product_id):
else:
print("未捕获到新的评论数据,继续滚动...")
retry_count += 1
if _is_fetch_cancelled(product_id):
break
print(f"共抓取到 {len(new_comments)}评论最多需要10条")
return new_comments[:10] # 只保留前10条
print(f"爬虫已停止,共抓取到 {total_comments_saved} 条评论")
return total_comments_saved
except Exception as e:
print("发生错误:", e)
return []
return 0
# =================== Continuous crawler background function ===================
def continuous_crawler(product_id):
"""持续爬取评论的后台函数"""
global crawler_running
try:
print(f"开始持续爬取商品 {product_id} 的评论...")
while crawler_running:
result = fetch_jd_comments(product_id)
if not crawler_running:
break
# Wait a while before the next round
time.sleep(10)
print(f"商品 {product_id} 的持续爬取已停止")
except Exception as e:
print(f"持续爬虫发生错误: {e}")
crawler_running = False
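# --- Editor's sketch (not part of this commit): time.sleep(10) above means a stop request
# --- can take up to ~10s to be noticed, which is why stop_crawler below joins with
# --- timeout=10. An Event-based variant wakes immediately on stop; names are illustrative.
_stop_event = threading.Event()

def _continuous_crawler_evt(product_id):
    """Same loop as above, but _stop_event.wait() returns the moment stop is requested."""
    while not _stop_event.is_set():
        fetch_jd_comments(product_id)
        _stop_event.wait(10)  # sleeps up to 10s, or returns early once _stop_event.set() runs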
# =================== Extract comments and save them to the database ===================
def save_comments_to_db(product_id, comments):
session = Session()
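# --- Editor's note: the body of save_comments_to_db is largely elided from this diff. A
# --- minimal insert-with-dedup sketch under the assumptions used in the model sketch above
# --- (a Comment row keyed by a unique comment_id; field names are illustrative):
# try:
#     for c in comments:
#         if not session.query(Comment).filter_by(comment_id=c['id']).first():
#             session.add(Comment(product_id=product_id,
#                                 comment_id=c['id'],
#                                 content=c.get('content', '')))
#     session.commit()
# except Exception:
#     session.rollback()
#     raise
# finally:
#     session.close()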
@@ -229,33 +284,144 @@ def save_comments_to_db(product_id, comments):
# =================== Flask API endpoints ===================
@app.route('/fetch_comments', methods=['POST'])
def fetch_comments():
@app.route('/start_crawler', methods=['POST'])
def start_crawler():
"""启动持续爬虫"""
global crawler_running, crawler_thread, current_product_id
product_id = request.args.get('product_id')
if not product_id:
return jsonify({"error": "缺少 product_id"}), -200
return jsonify({"error": "缺少 product_id"}), 400
if crawler_running:
return jsonify({
"message": f"爬虫已在运行中当前商品ID: {current_product_id}",
"status": "already_running"
}), 200
try:
with fetch_lock: # lock to prevent concurrent calls
comments = fetch_jd_comments(product_id)
if not comments:
return jsonify({"message": "未获取到评论数据"}), -200
save_comments_to_db(product_id, comments)
with fetch_lock:
crawler_running = True
current_product_id = product_id
crawler_thread = threading.Thread(target=continuous_crawler, args=(product_id,))
crawler_thread.daemon = True
crawler_thread.start()
return jsonify({
"message": f"成功保存 {len(comments)} 条评论",
"message": f"已启动持续爬虫商品ID: {product_id}",
"status": "started",
"product_id": product_id
}), 200
except Exception as e:
return jsonify({"error": str(e)}), -200
crawler_running = False
return jsonify({"error": str(e)}), 500
@app.route('/stop_crawler', methods=['POST'])
def stop_crawler():
"""停止持续爬虫"""
global crawler_running, crawler_thread, current_product_id
if not crawler_running:
return jsonify({
"message": "爬虫未在运行",
"status": "not_running"
}), 200
try:
with fetch_lock:
crawler_running = False
stopped_product_id = current_product_id
current_product_id = None
# Wait for the thread to finish
if crawler_thread and crawler_thread.is_alive():
crawler_thread.join(timeout=10)
return jsonify({
"message": f"已停止持续爬虫商品ID: {stopped_product_id}",
"status": "stopped",
"product_id": stopped_product_id
}), 200
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route('/crawler_status', methods=['GET'])
def crawler_status():
"""获取爬虫状态"""
global crawler_running, current_product_id
return jsonify({
"running": crawler_running,
"product_id": current_product_id,
"status": "running" if crawler_running else "stopped"
}), 200
@app.route('/test', methods=['GET'])
def test():
"""测试端点,验证服务器是否正常工作"""
print("测试端点被访问")
return jsonify({"message": "服务器运行正常", "status": "ok"}), 200
@app.route('/fetch_comments', methods=['GET', 'POST'])
def fetch_comments():
"""单次获取评论(在后台运行,立即返回)。新请求会中断所有历史请求线程,只执行本次请求。"""
global crawler_running, active_fetch_product_id
print(f"[fetch_comments] 收到请求,方法: {request.method}, 参数: {request.args}")
product_id = request.args.get('product_id')
if not product_id:
print("[fetch_comments] 错误: 缺少 product_id")
return jsonify({"error": "缺少 product_id"}), 400
print(f"[fetch_comments] 开始处理商品ID: {product_id},将中断所有历史请求后执行")
try:
# Interrupt all earlier work right away: stop the continuous crawler and mark the "current task" as the new product_id; old threads notice the mismatch inside their loops and exit on their own
with fetch_lock:
crawler_running = False
active_fetch_product_id = product_id
def run_fetch():
try:
print(f"[后台线程] 开始获取商品 {product_id} 的评论...")
result = fetch_jd_comments(product_id)
print(f"[后台线程] 获取完成,结果: {result}")
except Exception as e:
import traceback
error_msg = f"后台获取评论时发生错误: {e}\n{traceback.format_exc()}"
print(f"[后台线程] {error_msg}")
fetch_thread = threading.Thread(target=run_fetch)
fetch_thread.daemon = True
fetch_thread.start()
print(f"[fetch_comments] 后台线程已启动(历史请求已标记为取消)")
response_data = {
"message": f"已开始获取商品 {product_id} 的评论,正在后台运行中...(已中断之前的请求)",
"status": "started",
"product_id": product_id,
"note": "评论获取在后台进行,请稍后查看数据库或使用 /crawler_status 查看状态"
}
print(f"[fetch_comments] 返回响应: {response_data}")
return jsonify(response_data), 200
except Exception as e:
import traceback
error_msg = f"处理请求时发生错误: {e}\n{traceback.format_exc()}"
print(f"[fetch_comments] {error_msg}")
return jsonify({"error": str(e)}), 500
# =================== Start the service ===================
if __name__ == '__main__':
try:
app.run(host='0.0.0.0', port=5000, debug=True)
app.run(host='0.0.0.0', port=5008, debug=True)
finally:
if 'global_page' in globals() and global_page:
global_page.quit()