import json
import random
import re
import threading
import time
from datetime import datetime
from urllib.parse import quote_plus

from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base

# =================== Configuration ===================
# Browser executable path (adjust to the local installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Fixed product detail page URL.
# NOTE(review): the "<k2=" fragment looks like an HTML-unescaped "&ltk2="
# query parameter — confirm against the URL that was originally copied.
TARGET_URL = "https://detail.tmall.com/item.htm?abbucket=1&id=735141569627<k2=1753093866331wbixx4bjhgx78xdlrpyxq&ns=1&priceTId=213e074d17530938630755244e1109&skuId=5667837161089&spm=a21n57.1.hoverItem.2&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D&xxc=taobaoSearch"

# Product id stored with every saved comment (same value as the `id`
# query parameter of TARGET_URL).
PRODUCT_ID = "735141569627"

# MySQL configuration.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # change to your own password
    "database": "jd"
}

# Initialise the database connection.  The password is URL-escaped so that
# characters such as '@' or '/' cannot corrupt the connection URL.
db_url = (
    f"mysql+pymysql://{db_config['user']}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
    f"?charset=utf8mb4"
)
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()

# Matches the JSONP wrapper "mtopjsonppcdetail<N>({...})" and captures the
# JSON object inside it.
_MTOP_JSONP_RE = re.compile(r'mtopjsonppcdetail\d+\((\{.*\})\)', re.DOTALL)


class TaobaoComment(Base):
    """ORM model for one Taobao comment row in the `taobao_comments` table."""

    __tablename__ = 'taobao_comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # JSON-encoded list of picture URLs
    # A callable default is evaluated per INSERT; the previous
    # `time.strftime(...)` value was computed once at import time, so every
    # row received the same timestamp.
    created_at = Column(DateTime, default=datetime.now)
    comment_date = Column(DateTime)


# Create the table if it does not exist yet.
Base.metadata.create_all(engine)

# Global browser instance (initialised only once).
global_taobao_page = None


def get_global_taobao_browser():
    """Return the shared ChromiumPage, creating it on first use.

    The browser is started with CHROME_PATH as its executable and cached in
    the module-level `global_taobao_page`.
    """
    global global_taobao_page
    if global_taobao_page is None:
        options = ChromiumOptions()
        options.set_browser_path(CHROME_PATH)
        global_taobao_page = ChromiumPage(options)
    return global_taobao_page


def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip the `mtopjsonppcdetail<N>(...)` wrapper and return the JSON inside.

    Args:
        raw_response: Response body. Normally the JSONP text; a `dict`
            (body already parsed) is returned unchanged and `bytes` are
            decoded as UTF-8 before matching.

    Returns:
        The decoded JSON object, or an empty dict when the wrapper is not
        found, the payload is not valid JSON, or the body has an
        unsupported type.
    """
    if isinstance(raw_response, dict):
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = raw_response.decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}

    match = _MTOP_JSONP_RE.search(raw_response)
    if not match:
        print("未找到有效的 JSON 数据")
        return {}
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}


def _select_fresh_comments(json_data, seen_ids):
    """Return comments from `json_data` whose id is not yet in `seen_ids`.

    Ids of the returned comments are added to `seen_ids`. Returns None when
    the payload does not have the `data.rateList` structure.
    """
    data = json_data.get('data') if isinstance(json_data, dict) else None
    if not isinstance(data, dict) or 'rateList' not in data:
        return None

    fresh = []
    for comment in data['rateList'] or []:
        comment_id = comment.get('id', '')
        if not comment_id or comment_id in seen_ids:
            continue
        seen_ids.add(comment_id)
        fresh.append(comment)
    return fresh


def fetch_taobao_comments(max_idle_rounds=None):
    """Open the product page and keep saving newly loaded comments.

    The page is scrolled to the bottom repeatedly; every captured response
    of the rate-list API is decoded and comments not seen before are written
    to the database immediately.

    Args:
        max_idle_rounds: Stop after this many consecutive rounds that
            produced no new comment. `None` (default) keeps looping until
            an exception occurs (e.g. the browser is closed) or the process
            is interrupted with Ctrl+C.

    Returns:
        Always an empty list: comments are persisted as they arrive rather
        than accumulated and returned.
    """
    page = get_global_taobao_browser()
    try:
        # Open the fixed product page and scroll the main page down a bit.
        page.get(TARGET_URL)
        page.scroll.down(150)

        # Start listening BEFORE clicking, otherwise the first batch of
        # comments requested by the click itself is never captured.
        target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
        page.listen.start(target_url)

        # Click the "all reviews" button.
        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []
        element.click()
        time.sleep(3)

        seen_ids = set()
        print("\n===============================")
        print("✅ 自动开始抓取评论,每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序,请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        idle_rounds = 0
        while True:
            # Scroll to the bottom to trigger loading of the next batch.
            page.scroll.to_bottom()
            print("已向下滚动到底部,等待接口返回数据...")
            resp = page.listen.wait(timeout=10)

            got_new = False
            if resp and target_url in resp.url:
                json_data = extract_json_from_mtop(resp.response.body)
                fresh_comments = _select_fresh_comments(json_data, seen_ids)
                if fresh_comments is None:
                    print("🚫 返回数据结构异常,无法提取评论")
                elif fresh_comments:
                    print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论,正在保存...")
                    save_taobao_comments_to_db(fresh_comments)  # save immediately
                    got_new = True
                else:
                    print("⚠️ 本次无新评论,可能已抓取完毕")
            else:
                print("🚫 未捕获到新的评论数据,请确认是否已滚动并加载出更多评论")

            idle_rounds = 0 if got_new else idle_rounds + 1
            if max_idle_rounds is not None and idle_rounds >= max_idle_rounds:
                break
    except Exception as e:
        print("发生错误:", e)
    return []


def save_taobao_comments_to_db(comments, product_id=PRODUCT_ID):
    """Insert comments that are not yet stored into `taobao_comments`.

    Args:
        comments: Iterable of comment dicts as returned by the rate-list
            API (keys read: `id`, `feedback`, `userNick`,
            `feedPicPathList`, `feedbackDate`).
        product_id: Value written to the `product_id` column; defaults to
            the product of TARGET_URL.

    The whole batch is committed in one transaction; on any error the
    transaction is rolled back and the error is printed.
    """
    session = Session()
    try:
        for comment in comments:
            comment_id = comment.get('id', '')
            feedback = comment.get('feedback', '无评论内容')
            user_nick = comment.get('userNick', '匿名用户')
            # The key may be present with a null value.
            pic_list = comment.get('feedPicPathList') or []
            # An empty string is not a valid DATETIME value and would make
            # the whole batch fail; store NULL instead.
            # NOTE(review): the value is passed through as a string —
            # confirm the API's date format is one MySQL accepts.
            comment_date = comment.get('feedbackDate') or None

            exists = session.query(TaobaoComment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在:{comment_id}")
                continue

            # Keep only protocol-relative URLs ("//host/path").
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]
            new_comment = TaobaoComment(
                product_id=product_id,
                user_name=user_nick,
                comment_text=feedback,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date
            )
            session.add(new_comment)
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()


# =================== Program entry point ===================
if __name__ == '__main__':
    print("开始抓取评论...")
    # Fetch comments.
    # NOTE(review): fetch_taobao_comments() saves comments while scraping
    # and returns an empty list, so the `else` branch below is the one that
    # runs even after a successful scrape.
    comments = fetch_taobao_comments()
    if comments:
        print(f"成功获取 {len(comments)} 条评论,正在保存到数据库...")
        save_taobao_comments_to_db(comments)
    else:
        print("未获取到任何评论数据。")
    print("浏览器保持打开状态,用于调试。")