# tb_pl/jd/tb.py

import json
import random
import re
import threading
import time
from datetime import datetime

from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base

# =================== Configuration ===================
# Path to the local Chrome executable (adjust it to match your installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# URL of the fixed product detail page to scrape. It is written one query
# parameter per line; adjacent string literals are joined by the parser into
# a single URL string.
TARGET_URL = (
    "https://detail.tmall.com/item.htm"
    "?abbucket=1"
    "&id=735141569627"
    "&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq"
    "&ns=1"
    "&priceTId=213e074d17530938630755244e1109"
    "&skuId=5667837161089"
    "&spm=a21n57.1.hoverItem.2"
    "&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D"
    "&xxc=taobaoSearch"
)
# MySQL connection settings.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file that is kept out of version control.
db_config = dict(
    host="192.168.8.88",
    port=3306,
    user="root",
    password="mysql_7sjTXH",  # replace with your own password
    database="jd",
)

# Set up database access: build the SQLAlchemy URL from db_config, then
# create the engine, the session factory bound to it, and the declarative
# base class that ORM models inherit from.
db_url = (
    "mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
    "?charset=utf8mb4"
).format(**db_config)
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()


# ORM model for scraped Taobao/Tmall product reviews.
class TaobaoComment(Base):
    """One scraped product review, stored in the ``taobao_comments`` table."""

    __tablename__ = 'taobao_comments'
    id = Column(Integer, primary_key=True)  # surrogate primary key
    product_id = Column(String(50), nullable=False)  # item the review belongs to
    user_name = Column(String(100))  # reviewer nickname
    comment_text = Column(Text)  # review body
    comment_id = Column(String(100))  # review id taken from the scraped payload
    picture_urls = Column(Text)  # JSON-encoded list of picture URLs
    # Pass the callable itself so SQLAlchemy evaluates it for every INSERT.
    # The previous ``default=time.strftime(...)`` was evaluated once at import
    # time, which stamped every row with the moment the script started.
    created_at = Column(DateTime, default=datetime.now)
    comment_date = Column(DateTime)  # review date as reported by the payload


# Create the mapped tables on the configured engine. checkfirst=True (the
# library default, spelled out here) skips tables that already exist.
Base.metadata.create_all(bind=engine, checkfirst=True)


# Module-wide browser page: created lazily on first use, then reused.
global_taobao_page = None


def get_global_taobao_browser():
    """Return the shared ChromiumPage, creating it on the first call.

    The page is built with ChromiumOptions pointing at CHROME_PATH and cached
    in the module-level ``global_taobao_page`` so later calls reuse it.
    """
    global global_taobao_page
    if global_taobao_page is not None:
        return global_taobao_page

    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_options)
    return global_taobao_page


# Matches a JSONP envelope such as ``mtopjsonppcdetail18({...})`` or
# ``mtopjsonp3({...})`` and captures the JSON object inside the parentheses.
_JSONP_WRAPPER = re.compile(r'[A-Za-z_$][\w$.]*\s*\(\s*(\{.*\})\s*\)', re.DOTALL)


def extract_json_from_mtop(raw_response) -> dict:
    """Strip the JSONP wrapper from an mtop response and return the JSON object.

    Args:
        raw_response: The response body. Normally a ``str`` such as
            ``mtopjsonppcdetail18({...})``. ``bytes`` are decoded as UTF-8,
            an already-parsed ``dict`` is returned unchanged, and a bare JSON
            object without a wrapper is parsed directly.

    Returns:
        The decoded JSON object, or an empty dict when the input is empty,
        of an unsupported type, has no recognisable payload, or the payload
        is not valid JSON.
    """
    if isinstance(raw_response, dict):
        # The body was already parsed upstream; nothing to unwrap.
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = bytes(raw_response).decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}

    text = raw_response.strip()
    if text.startswith('{'):
        # Bare JSON object, no callback wrapper.
        payload = text
    else:
        wrapped = _JSONP_WRAPPER.search(text)
        if wrapped is None:
            print("未找到有效的 JSON 数据")
            return {}
        payload = wrapped.group(1)

    try:
        return json.loads(payload)
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}


def _pick_unseen_comments(json_data, seen_ids):
    """Return the comments of ``json_data['data']['rateList']`` not yet seen.

    Ids of the returned comments are added to *seen_ids*. Returns ``None``
    when the payload does not contain a ``data.rateList`` list.
    """
    data = json_data.get('data') if isinstance(json_data, dict) else None
    rate_list = data.get('rateList') if isinstance(data, dict) else None
    if not isinstance(rate_list, list):
        return None

    fresh = []
    for comment in rate_list:
        if not isinstance(comment, dict):
            continue
        comment_id = comment.get('id', '')
        if not comment_id or comment_id in seen_ids:
            continue
        seen_ids.add(comment_id)
        fresh.append(comment)
    return fresh


def fetch_taobao_comments(max_idle_rounds=5):
    """Open TARGET_URL, scroll the review list and save every new comment batch.

    Each batch of previously unseen comments is handed to
    ``save_taobao_comments_to_db`` as soon as it is captured.

    Args:
        max_idle_rounds: Stop after this many consecutive scroll rounds that
            produced no new comment. Pass ``None`` to keep scrolling until
            the browser is closed or Ctrl+C is pressed.

    Returns:
        The list of comment dicts captured during this run (all of them have
        already been passed to ``save_taobao_comments_to_db``). The list is
        empty when the review button is not found or nothing was captured.
    """
    page = get_global_taobao_browser()
    collected = []
    try:
        # Open the fixed product page and nudge it down a little.
        page.get(TARGET_URL)
        page.scroll.down(150)

        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []

        # Start listening BEFORE clicking: the click itself triggers the first
        # review request, which would be missed if the listener started later.
        target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
        page.listen.start(target_url)
        element.click()
        time.sleep(3)

        seen_ids = set()
        idle_rounds = 0

        print("\n===============================")
        print("✅ 自动开始抓取评论，每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序，请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            page.scroll.to_bottom()
            print("已向下滚动到底部，等待接口返回数据...")

            # Count the round as idle up front; reset when new data shows up.
            idle_rounds += 1

            resp = page.listen.wait(timeout=10)
            if not (resp and target_url in resp.url):
                print("🚫 未捕获到新的评论数据，请确认是否已滚动并加载出更多评论")
                continue

            raw_body = resp.response.body
            # The body may already be a parsed dict; only unwrap raw text.
            if isinstance(raw_body, dict):
                json_data = raw_body
            else:
                json_data = extract_json_from_mtop(raw_body)

            fresh_comments = _pick_unseen_comments(json_data, seen_ids)
            if fresh_comments is None:
                print("🚫 返回数据结构异常，无法提取评论")
            elif fresh_comments:
                idle_rounds = 0
                print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论，正在保存...")
                save_taobao_comments_to_db(fresh_comments)  # persist immediately
                collected.extend(fresh_comments)
            else:
                print("⚠️ 本次无新评论，可能已抓取完毕")

        print(f"连续 {max_idle_rounds} 轮未获取到新评论，停止抓取")

    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop; exit cleanly with what we have.
        print("已手动中断，停止抓取")
    except Exception as e:
        print("发生错误:", e)

    return collected


def _normalize_comment_id(comment):
    """Return the comment's ``id`` as a string ('' when missing or None)."""
    raw_id = comment.get('id')
    return '' if raw_id is None else str(raw_id)


def save_taobao_comments_to_db(comments, product_id="735141569627"):
    """Insert the given comments into ``taobao_comments``, skipping known ids.

    Args:
        comments: Iterable of comment dicts. The keys read are ``id``,
            ``feedback``, ``userNick``, ``feedPicPathList`` and
            ``feedbackDate``.
        product_id: Product id stored with every row. Defaults to the id of
            the product in TARGET_URL.

    The whole batch is committed in one transaction. On any error the
    transaction is rolled back and the error is printed; nothing is raised.
    """
    if not comments:
        return

    session = Session()
    try:
        # One IN query for the whole batch instead of one SELECT per comment.
        incoming_ids = list({_normalize_comment_id(c) for c in comments})
        known_ids = {
            row[0]
            for row in session.query(TaobaoComment.comment_id)
            .filter(TaobaoComment.comment_id.in_(incoming_ids))
            .all()
        }

        for comment in comments:
            comment_id = _normalize_comment_id(comment)
            if comment_id in known_ids:
                print(f"评论已存在：{comment_id}")
                continue
            # Remember the id so a duplicate inside this batch is skipped too.
            known_ids.add(comment_id)

            # The key may be present with a null value; treat that as empty.
            pic_list = comment.get('feedPicPathList') or []
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]

            new_comment = TaobaoComment(
                product_id=product_id,
                user_name=comment.get('userNick', '匿名用户'),
                comment_text=comment.get('feedback', '无评论内容'),
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                # An empty string is not a valid DATETIME value; store NULL.
                # NOTE(review): the date is otherwise passed through as
                # received -- confirm its format is accepted by the column.
                comment_date=comment.get('feedbackDate') or None,
            )
            session.add(new_comment)
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()


# =================== Script entry point ===================
if __name__ == '__main__':
    print("开始抓取评论...")

    # fetch_taobao_comments() saves every batch itself while scraping, so its
    # return value is used for reporting only. Saving it again here would
    # just re-query every row that was stored a moment ago.
    comments = None
    try:
        comments = fetch_taobao_comments()
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop the scraper; exit quietly.
        print("已手动中断抓取。")

    if comments:
        print(f"成功获取 {len(comments)} 条评论（已在抓取过程中保存到数据库）。")
    else:
        print("抓取结束，未返回评论列表（新评论在抓取过程中实时保存）。")

    print("浏览器保持打开状态，用于调试。")