tb_pl/jd/jd.py

import time
import random
import json
import threading

from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base, sessionmaker

# =================== 配置部分 ===================
# 浏览器路径（请根据本地实际路径修改）
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# MySQL 配置
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # 修改为你的密码
    "database": "jd"
}

# 初始化 Flask 应用
app = Flask(__name__)
# 初始化锁
fetch_lock = threading.Lock()

# 全局爬虫控制标志
crawler_running = False
crawler_thread = None
current_product_id = None

# 当前“允许运行”的抓取任务 product_id（新请求会覆盖，旧线程检测到不匹配则退出）
active_fetch_product_id = None


# 初始化数据库连接
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)

Base = declarative_base()


# 定义评论模型
class Comment(Base):
    __tablename__ = 'comments'
    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # 存储 JSON 字符串
    created_at = Column(DateTime, default=time.strftime('%Y-%m-%d %H:%M:%S'))
    comment_date = Column(DateTime, default=time.strftime('%Y-%m-%d %H:%M:%S'))


# 创建表（如果不存在）
Base.metadata.create_all(engine)
# =================== 核心爬虫函数 ===================
# 全局浏览器实例（只初始化一次）
global_page = None


def get_global_browser():
    global global_page
    if global_page is None:
        options = ChromiumOptions()
        options.set_browser_path(CHROME_PATH)
        global_page = ChromiumPage(options)
    return global_page


def _is_fetch_cancelled(product_id):
    """当前任务是否已被新请求取消（只保留最新请求的 product_id）"""
    global active_fetch_product_id
    return active_fetch_product_id is not None and active_fetch_product_id != product_id


def fetch_jd_comments(product_id):
    global active_fetch_product_id
    page = get_global_browser()  # 使用全局浏览器
    try:
        # 打开商品页面
        page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
        time.sleep(random.uniform(5, 8))
        if _is_fetch_cancelled(product_id):
            return 0

        # 向下滚动主页面
        page.scroll.down(150)
        time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0

        # 点击“买家赞不绝口”
        element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
        if element1:
            element1.click()
            time.sleep(random.uniform(3, 5))
        else:
            element1 = page.ele('xpath=//div[contains(text(), "好评率")]')
            if element1:
                element1.click()
                time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0
        # 点击“当前商品”
        element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
        if element2:
            element2.click()
            time.sleep(random.uniform(3, 5))

        if _is_fetch_cancelled(product_id):
            return 0
        # 定位弹窗区域
        popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
        if not popup:
            return 0

        # 点击“视频”
        element3 = page.ele('xpath=//div[contains(text(), "视频")]')
        if element3:
            element3.click()
            time.sleep(random.uniform(3, 5))

        if _is_fetch_cancelled(product_id):
            return 0
        # 监听请求
        page.listen.start('https://api.m.jd.com/client.action')

        retry_count = 0
        new_comments = []  # 存储最终的新评论
        seen_ids = set()  # 已处理过的 comment_id
        total_comments_saved = 0  # 总共保存的评论数

        # 持续获取评论，直到被新请求取消或手动停止
        while True:
            if _is_fetch_cancelled(product_id):
                print(f"[fetch_jd_comments] 商品 {product_id} 已被新请求取消，退出")
                break
            scroll_amount = random.randint(10000, 100000)
            popup.scroll.down(scroll_amount)
            print(f"弹窗向下滚动了 {scroll_amount} 像素")

            time.sleep(random.uniform(3, 5))
            if _is_fetch_cancelled(product_id):
                break

            resp = page.listen.wait(timeout=5)
            if resp and 'getCommentListPage' in resp.request.postData:
                json_data = resp.response.body
                if 'result' in json_data and 'floors' in json_data['result']:
                    comment_floor = json_data['result']['floors'][2]
                    if 'data' in comment_floor and isinstance(comment_floor['data'], list):
                        batch_comments = comment_floor['data']

                        # 提取这批评论中的新评论
                        fresh_comments = []
                        for comment in batch_comments:
                            comment_info = comment.get('commentInfo', {})
                            comment_id = comment_info.get('commentId', '')
                            comment_score = comment_info.get('commentScore', '')  # 获取评分字段

                            if not comment_id:
                                continue

                            # 只保留五星好评
                            if comment_score != '5':
                                print(f"跳过非五星评论：{comment_id}，评分为 {comment_score}")
                                continue

                            # 如果该评论已存在数据库或本次已收集，则跳过
                            exists_in_db = False
                            if comment_id in seen_ids:
                                exists_in_db = True
                            else:
                                session = Session()
                                exists_in_db = session.query(Comment).filter_by(comment_id=comment_id).first() is not None
                                session.close()

                            if exists_in_db:
                                print(f"评论已存在：{comment_id}")
                                continue

                            seen_ids.add(comment_id)
                            fresh_comments.append(comment)

                        if fresh_comments:
                            print(f"本次获取到 {len(fresh_comments)} 条新评论")
                            new_comments.extend(fresh_comments)
                            retry_count = 0  # 有新数据，重置重试计数器

                            # 立即保存这批评论到数据库
                            save_comments_to_db(product_id, fresh_comments)
                            total_comments_saved += len(fresh_comments)
                            print(f"已保存 {len(fresh_comments)} 条评论到数据库，总计保存 {total_comments_saved} 条评论")

                        else:
                            print("本次无新评论，继续滚动...")
                            retry_count += 1
                    else:
                        print("未找到有效的评论列表")
                        retry_count += 1
                else:
                    print("返回数据结构异常")
                    retry_count += 1
            else:
                print("未捕获到新的评论数据，继续滚动...")
                retry_count += 1
            if _is_fetch_cancelled(product_id):
                break

        print(f"爬虫已停止，共抓取到 {total_comments_saved} 条评论")
        return total_comments_saved

    except Exception as e:
        print("发生错误:", e)
        return 0


# =================== 持续爬虫后台运行函数 ===================
def continuous_crawler(product_id):
    """持续爬取评论的后台函数"""
    global crawler_running
    try:
        print(f"开始持续爬取商品 {product_id} 的评论...")
        while crawler_running:
            result = fetch_jd_comments(product_id)
            if not crawler_running:
                break
            # 如果没有获取到数据，等待一段时间再继续
            time.sleep(10)
        print(f"商品 {product_id} 的持续爬取已停止")
    except Exception as e:
        print(f"持续爬虫发生错误: {e}")
        crawler_running = False

# =================== 提取评论并保存到数据库 ===================
def save_comments_to_db(product_id, comments):
    session = Session()
    try:
        for comment in comments:
            comment_info = comment.get('commentInfo', {})
            comment_id = comment_info.get('commentId', '')

            # 如果 comment_id 为空，跳过这条评论
            if not comment_id:
                print("跳过无 comment_id 的评论")
                continue

            # 检查是否已存在该评论
            exists = session.query(Comment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在：{comment_id}")
                continue

            # 提取其他字段
            user_name = comment_info.get('userNickName', '匿名用户')
            comment_text = comment_info.get('commentData', '无评论内容')
            product_id = comment_info.get('productId', product_id)
            picture_list = comment_info.get('pictureInfoList', [])
            comment_date = comment_info.get('commentDate', '')
            picture_urls = [pic.get('largePicURL') for pic in picture_list if pic.get('largePicURL')]

            new_comment = Comment(
                product_id=product_id,
                user_name=user_name,
                comment_text=comment_text,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date
            )
            session.add(new_comment)

        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()


# =================== Flask API 接口 ===================
@app.route('/start_crawler', methods=['POST'])
def start_crawler():
    """启动持续爬虫"""
    global crawler_running, crawler_thread, current_product_id

    product_id = request.args.get('product_id')
    if not product_id:
        return jsonify({"error": "缺少 product_id"}), 400

    if crawler_running:
        return jsonify({
            "message": f"爬虫已在运行中，当前商品ID: {current_product_id}",
            "status": "already_running"
        }), 200

    try:
        with fetch_lock:
            crawler_running = True
            current_product_id = product_id
            crawler_thread = threading.Thread(target=continuous_crawler, args=(product_id,))
            crawler_thread.daemon = True
            crawler_thread.start()

        return jsonify({
            "message": f"已启动持续爬虫，商品ID: {product_id}",
            "status": "started",
            "product_id": product_id
        }), 200

    except Exception as e:
        crawler_running = False
        return jsonify({"error": str(e)}), 500


@app.route('/stop_crawler', methods=['POST'])
def stop_crawler():
    """停止持续爬虫"""
    global crawler_running, crawler_thread, current_product_id

    if not crawler_running:
        return jsonify({
            "message": "爬虫未在运行",
            "status": "not_running"
        }), 200

    try:
        with fetch_lock:
            crawler_running = False
            stopped_product_id = current_product_id
            current_product_id = None

        # 等待线程结束
        if crawler_thread and crawler_thread.is_alive():
            crawler_thread.join(timeout=10)

        return jsonify({
            "message": f"已停止持续爬虫，商品ID: {stopped_product_id}",
            "status": "stopped",
            "product_id": stopped_product_id
        }), 200

    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/crawler_status', methods=['GET'])
def crawler_status():
    """获取爬虫状态"""
    global crawler_running, current_product_id

    return jsonify({
        "running": crawler_running,
        "product_id": current_product_id,
        "status": "running" if crawler_running else "stopped"
    }), 200


@app.route('/test', methods=['GET'])
def test():
    """测试端点，验证服务器是否正常工作"""
    print("测试端点被访问")
    return jsonify({"message": "服务器运行正常", "status": "ok"}), 200


@app.route('/fetch_comments', methods=['GET', 'POST'])
def fetch_comments():
    """单次获取评论（在后台运行，立即返回）。新请求会中断所有历史请求线程，只执行本次请求。"""
    global crawler_running, active_fetch_product_id
    print(f"[fetch_comments] 收到请求，方法: {request.method}, 参数: {request.args}")
    product_id = request.args.get('product_id')

    if not product_id:
        print("[fetch_comments] 错误: 缺少 product_id")
        return jsonify({"error": "缺少 product_id"}), 400

    print(f"[fetch_comments] 开始处理商品ID: {product_id}，将中断所有历史请求后执行")

    try:
        # 立刻中断所有历史：停止持续爬虫并标记“当前任务”为新 product_id，旧线程在循环中检测到会自行退出
        with fetch_lock:
            crawler_running = False
            active_fetch_product_id = product_id

        def run_fetch():
            try:
                print(f"[后台线程] 开始获取商品 {product_id} 的评论...")
                result = fetch_jd_comments(product_id)
                print(f"[后台线程] 获取完成，结果: {result}")
            except Exception as e:
                import traceback
                error_msg = f"后台获取评论时发生错误: {e}\n{traceback.format_exc()}"
                print(f"[后台线程] {error_msg}")

        fetch_thread = threading.Thread(target=run_fetch)
        fetch_thread.daemon = True
        fetch_thread.start()
        print(f"[fetch_comments] 后台线程已启动（历史请求已标记为取消）")

        response_data = {
            "message": f"已开始获取商品 {product_id} 的评论，正在后台运行中...（已中断之前的请求）",
            "status": "started",
            "product_id": product_id,
            "note": "评论获取在后台进行，请稍后查看数据库或使用 /crawler_status 查看状态"
        }
        print(f"[fetch_comments] 返回响应: {response_data}")
        return jsonify(response_data), 200

    except Exception as e:
        import traceback
        error_msg = f"处理请求时发生错误: {e}\n{traceback.format_exc()}"
        print(f"[fetch_comments] {error_msg}")
        return jsonify({"error": str(e)}), 500


# =================== 启动服务 ===================
if __name__ == '__main__':
    try:
        app.run(host='0.0.0.0', port=5008, debug=True)
    finally:
        if 'global_page' in globals() and global_page:
            global_page.quit()
            print("浏览器已关闭")