commit a89703ea72 (parent fa25bfd784)
Author: van
Date:   2026-04-26 13:39:19 +08:00

12 changed files with 2154 additions and 25 deletions

jd/jd.py · 206 changed lines

@@ -6,8 +6,7 @@ import threading
from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import declarative_base, sessionmaker
# =================== Configuration ===================
# Browser path (adjust to your local installation)
@@ -27,6 +26,14 @@ app = Flask(__name__)
# Initialize the lock
fetch_lock = threading.Lock()
# Global crawler control flags
crawler_running = False
crawler_thread = None
current_product_id = None
# product_id of the fetch task currently "allowed to run"; a new request overwrites it, and old threads exit once they detect a mismatch
active_fetch_product_id = None
# Initialize the database connection
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
@@ -65,16 +72,27 @@ def get_global_browser():
return global_page
def _is_fetch_cancelled(product_id):
"""当前任务是否已被新请求取消(只保留最新请求的 product_id"""
global active_fetch_product_id
return active_fetch_product_id is not None and active_fetch_product_id != product_id
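# --- Editor's sketch (not part of this commit): the two definitions above implement
# --- cooperative cancellation via a shared token. Writers set active_fetch_product_id under
# --- fetch_lock; workers poll it between slow steps and exit when it no longer matches their
# --- own id. The same pattern in miniature, with illustrative names:
_active_task = None
_task_lock = threading.Lock()

def _run_cancellable(task_id, steps):
    """Run steps, bailing out early if a newer task has claimed the token."""
    global _active_task
    with _task_lock:
        _active_task = task_id        # newest task wins; older workers observe the change
    for step in steps:
        if _active_task != task_id:   # polled between slow steps, like _is_fetch_cancelled
            return False              # superseded by a newer task: exit quietly
        step()
    return True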
def fetch_jd_comments(product_id):
global active_fetch_product_id
page = get_global_browser() # reuse the global browser
try:
# Open the product page
page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
time.sleep(random.uniform(5, 8))
if _is_fetch_cancelled(product_id):
return 0
# Scroll the main page down
page.scroll.down(150)
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Click "买家赞不绝口" (buyers' praise section)
element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
@@ -86,16 +104,20 @@ def fetch_jd_comments(product_id):
if element1:
element1.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Click "当前商品" (current product tab)
element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
if element2:
element2.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Locate the popup area
popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
if not popup:
return []
return 0
# Click "视频" (video tab)
element3 = page.ele('xpath=//div[contains(text(), "视频")]')
@@ -103,20 +125,28 @@ def fetch_jd_comments(product_id):
element3.click()
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
return 0
# Listen for network requests
page.listen.start('https://api.m.jd.com/client.action')
max_retries = 10 # max consecutive rounds with no new data
retry_count = 0
new_comments = [] # accumulated new comments
seen_ids = set() # comment_ids already processed
total_comments_saved = 0 # running total of comments saved
while retry_count < max_retries and len(new_comments) < 10:
# Keep fetching comments until cancelled by a newer request or stopped manually
while True:
if _is_fetch_cancelled(product_id):
print(f"[fetch_jd_comments] 商品 {product_id} 已被新请求取消,退出")
break
scroll_amount = random.randint(10000, 100000)
popup.scroll.down(scroll_amount)
print(f"弹窗向下滚动了 {scroll_amount} 像素")
time.sleep(random.uniform(3, 5))
if _is_fetch_cancelled(product_id):
break
resp = page.listen.wait(timeout=5)
if resp and 'getCommentListPage' in resp.request.postData:
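# --- Editor's note: the code elided between these hunks parses the captured packet and
# --- de-duplicates against seen_ids before the block below runs. A hedged sketch, assuming
# --- DrissionPage's resp.response.body yields the parsed JSON; the 'comments' and 'id'
# --- field names are guesses, not confirmed payload keys.
# body = resp.response.body  # dict for JSON responses
# fresh_comments = []
# for c in (body or {}).get('comments', []):
#     cid = c.get('id')
#     if cid and cid not in seen_ids:
#         seen_ids.add(cid)
#         fresh_comments.append(c)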
@@ -161,6 +191,12 @@ def fetch_jd_comments(product_id):
print(f"本次获取到 {len(fresh_comments)} 条新评论")
new_comments.extend(fresh_comments)
retry_count = 0 # 有新数据,重置重试计数器
# 立即保存这批评论到数据库
save_comments_to_db(product_id, fresh_comments)
total_comments_saved += len(fresh_comments)
print(f"已保存 {len(fresh_comments)} 条评论到数据库,总计保存 {total_comments_saved} 条评论")
else:
print("本次无新评论,继续滚动...")
retry_count += 1
@@ -173,16 +209,35 @@ def fetch_jd_comments(product_id):
else:
print("未捕获到新的评论数据,继续滚动...")
retry_count += 1
if _is_fetch_cancelled(product_id):
break
print(f"共抓取到 {len(new_comments)}评论最多需要10条")
return new_comments[:10] # 只保留前10条
print(f"爬虫已停止,共抓取到 {total_comments_saved} 条评论")
return total_comments_saved
except Exception as e:
print("发生错误:", e)
return []
return 0
# =================== Continuous crawler background function ===================
def continuous_crawler(product_id):
"""持续爬取评论的后台函数"""
global crawler_running
try:
print(f"开始持续爬取商品 {product_id} 的评论...")
while crawler_running:
result = fetch_jd_comments(product_id)
if not crawler_running:
break
# Wait a while before the next round
time.sleep(10)
print(f"商品 {product_id} 的持续爬取已停止")
except Exception as e:
print(f"持续爬虫发生错误: {e}")
crawler_running = False
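# --- Editor's sketch (not part of this commit): time.sleep(10) above means a stop request
# --- can take up to ~10s to be noticed, which is why stop_crawler below joins with
# --- timeout=10. An Event-based variant wakes immediately on stop; names are illustrative.
_stop_event = threading.Event()

def _continuous_crawler_evt(product_id):
    """Same loop as above, but _stop_event.wait() returns the moment stop is requested."""
    while not _stop_event.is_set():
        fetch_jd_comments(product_id)
        _stop_event.wait(10)  # sleeps up to 10s, or returns early once _stop_event.set() runs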
# =================== Extract comments and save them to the database ===================
def save_comments_to_db(product_id, comments):
session = Session()
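# --- Editor's note: the body of save_comments_to_db is largely elided from this diff. A
# --- minimal insert-with-dedup sketch under the assumptions used in the model sketch above
# --- (a Comment row keyed by a unique comment_id; field names are illustrative):
# try:
#     for c in comments:
#         if not session.query(Comment).filter_by(comment_id=c['id']).first():
#             session.add(Comment(product_id=product_id,
#                                 comment_id=c['id'],
#                                 content=c.get('content', '')))
#     session.commit()
# except Exception:
#     session.rollback()
#     raise
# finally:
#     session.close()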
@@ -229,33 +284,144 @@ def save_comments_to_db(product_id, comments):
# =================== Flask API endpoints ===================
@app.route('/fetch_comments', methods=['POST'])
def fetch_comments():
@app.route('/start_crawler', methods=['POST'])
def start_crawler():
"""启动持续爬虫"""
global crawler_running, crawler_thread, current_product_id
product_id = request.args.get('product_id')
if not product_id:
return jsonify({"error": "缺少 product_id"}), -200
return jsonify({"error": "缺少 product_id"}), 400
if crawler_running:
return jsonify({
"message": f"爬虫已在运行中当前商品ID: {current_product_id}",
"status": "already_running"
}), 200
try:
with fetch_lock: # lock to prevent concurrent calls
comments = fetch_jd_comments(product_id)
if not comments:
return jsonify({"message": "未获取到评论数据"}), -200
save_comments_to_db(product_id, comments)
with fetch_lock:
crawler_running = True
current_product_id = product_id
crawler_thread = threading.Thread(target=continuous_crawler, args=(product_id,))
crawler_thread.daemon = True
crawler_thread.start()
return jsonify({
"message": f"成功保存 {len(comments)} 条评论",
"message": f"已启动持续爬虫商品ID: {product_id}",
"status": "started",
"product_id": product_id
}), 200
except Exception as e:
return jsonify({"error": str(e)}), -200
crawler_running = False
return jsonify({"error": str(e)}), 500
@app.route('/stop_crawler', methods=['POST'])
def stop_crawler():
"""停止持续爬虫"""
global crawler_running, crawler_thread, current_product_id
if not crawler_running:
return jsonify({
"message": "爬虫未在运行",
"status": "not_running"
}), 200
try:
with fetch_lock:
crawler_running = False
stopped_product_id = current_product_id
current_product_id = None
# Wait for the thread to finish
if crawler_thread and crawler_thread.is_alive():
crawler_thread.join(timeout=10)
return jsonify({
"message": f"已停止持续爬虫商品ID: {stopped_product_id}",
"status": "stopped",
"product_id": stopped_product_id
}), 200
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route('/crawler_status', methods=['GET'])
def crawler_status():
"""获取爬虫状态"""
global crawler_running, current_product_id
return jsonify({
"running": crawler_running,
"product_id": current_product_id,
"status": "running" if crawler_running else "stopped"
}), 200
@app.route('/test', methods=['GET'])
def test():
"""测试端点,验证服务器是否正常工作"""
print("测试端点被访问")
return jsonify({"message": "服务器运行正常", "status": "ok"}), 200
@app.route('/fetch_comments', methods=['GET', 'POST'])
def fetch_comments():
"""单次获取评论(在后台运行,立即返回)。新请求会中断所有历史请求线程,只执行本次请求。"""
global crawler_running, active_fetch_product_id
print(f"[fetch_comments] 收到请求,方法: {request.method}, 参数: {request.args}")
product_id = request.args.get('product_id')
if not product_id:
print("[fetch_comments] 错误: 缺少 product_id")
return jsonify({"error": "缺少 product_id"}), 400
print(f"[fetch_comments] 开始处理商品ID: {product_id},将中断所有历史请求后执行")
try:
# Interrupt all earlier work right away: stop the continuous crawler and mark the "current task" as the new product_id; old threads notice the mismatch inside their loops and exit on their own
with fetch_lock:
crawler_running = False
active_fetch_product_id = product_id
def run_fetch():
try:
print(f"[后台线程] 开始获取商品 {product_id} 的评论...")
result = fetch_jd_comments(product_id)
print(f"[后台线程] 获取完成,结果: {result}")
except Exception as e:
import traceback
error_msg = f"后台获取评论时发生错误: {e}\n{traceback.format_exc()}"
print(f"[后台线程] {error_msg}")
fetch_thread = threading.Thread(target=run_fetch)
fetch_thread.daemon = True
fetch_thread.start()
print(f"[fetch_comments] 后台线程已启动(历史请求已标记为取消)")
response_data = {
"message": f"已开始获取商品 {product_id} 的评论,正在后台运行中...(已中断之前的请求)",
"status": "started",
"product_id": product_id,
"note": "评论获取在后台进行,请稍后查看数据库或使用 /crawler_status 查看状态"
}
print(f"[fetch_comments] 返回响应: {response_data}")
return jsonify(response_data), 200
except Exception as e:
import traceback
error_msg = f"处理请求时发生错误: {e}\n{traceback.format_exc()}"
print(f"[fetch_comments] {error_msg}")
return jsonify({"error": str(e)}), 500
# =================== Start the service ===================
if __name__ == '__main__':
try:
app.run(host='0.0.0.0', port=5000, debug=True)
app.run(host='0.0.0.0', port=5008, debug=True)
finally:
if 'global_page' in globals() and global_page:
global_page.quit()