Files
tb_pl/jd/tb.py
2025-08-13 16:02:21 +08:00

199 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import random
import re
import threading
import time
from datetime import datetime
from urllib.parse import quote

from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base
# =================== Configuration ===================
# Path to the local Chrome executable (adjust to the local installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# Fixed product detail page URL that the scraper opens.
TARGET_URL = "https://detail.tmall.com/item.htm?abbucket=1&id=735141569627&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq&ns=1&priceTId=213e074d17530938630755244e1109&skuId=5667837161089&spm=a21n57.1.hoverItem.2&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D&xxc=taobaoSearch"
# MySQL connection settings.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or an untracked config file instead.
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # change to your own password
    "database": "jd",
}
# Build the SQLAlchemy URL. User name and password are percent-encoded so that
# characters such as "@", ":", "/" or "%" in a credential cannot corrupt the
# URL; purely alphanumeric credentials are left unchanged by the encoding.
db_url = (
    f"mysql+pymysql://{quote(db_config['user'], safe='')}:"
    f"{quote(db_config['password'], safe='')}@{db_config['host']}:"
    f"{db_config['port']}/{db_config['database']}?charset=utf8mb4"
)
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()
# ORM model for scraped Taobao comments.
class TaobaoComment(Base):
    """One scraped review row in the ``taobao_comments`` table."""

    __tablename__ = 'taobao_comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # JSON-encoded list of picture URLs
    # Pass the callable itself (not its result) so the timestamp is taken at
    # insert time; a pre-computed value would be frozen at import time and
    # shared by every row.
    created_at = Column(DateTime, default=datetime.now)
    comment_date = Column(DateTime)
# Create the mapped tables if they do not exist yet.
Base.metadata.create_all(engine)
# Module-wide browser instance; stays None until get_global_taobao_browser()
# creates it, after which the same object is reused.
global_taobao_page = None
def get_global_taobao_browser():
    """Return the shared ``ChromiumPage``, creating it on the first call.

    The instance is stored in the module-level ``global_taobao_page`` and
    reused by every later call.
    """
    global global_taobao_page
    # Fast path: the browser has already been created.
    if global_taobao_page is not None:
        return global_taobao_page
    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_options)
    return global_taobao_page
def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip the ``mtopjsonppcdetailNN(...)`` JSONP wrapper and decode the JSON.

    Args:
        raw_response: Response body, normally text. For robustness a ``dict``
            (body already decoded by the caller) is returned unchanged,
            ``bytes`` are decoded as UTF-8, and ``None`` or any other type is
            treated like an empty string.

    Returns:
        The decoded JSON object, or ``{}`` when no wrapper is found or the
        wrapped payload is not valid JSON.
    """
    if isinstance(raw_response, dict):
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = raw_response.decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        # None / unexpected type: fall through to the "no wrapper" path
        # instead of letting re.search raise TypeError.
        raw_response = ''

    match = re.search(r'mtopjsonppcdetail\d+\((\{.*\})\)', raw_response, re.DOTALL)
    if match is None:
        print("未找到有效的 JSON 数据")
        return {}
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}
def _pick_fresh_comments(json_data, seen_ids):
    """Return the comments in *json_data* whose ids are not yet in *seen_ids*.

    Ids of the returned comments are added to *seen_ids*. Returns ``None``
    when the payload has no ``data.rateList`` entry at all; a ``null``
    ``rateList`` is treated as an empty list.
    """
    data = json_data.get('data') if isinstance(json_data, dict) else None
    if not isinstance(data, dict) or 'rateList' not in data:
        return None
    fresh = []
    for comment in data['rateList'] or []:
        comment_id = comment.get('id', '')
        if not comment_id or comment_id in seen_ids:
            continue
        seen_ids.add(comment_id)
        fresh.append(comment)
    return fresh


def fetch_taobao_comments(max_idle_rounds=None):
    """Open the product page and persist review batches as they are loaded.

    The function opens ``TARGET_URL``, clicks the "全部评价" element, then
    repeatedly scrolls to the bottom and waits for the review-list request.
    Each batch of previously unseen comments is written immediately through
    ``save_taobao_comments_to_db``.

    Args:
        max_idle_rounds: Stop after this many consecutive rounds that saved
            nothing (timeout, malformed payload, or no new comments). The
            default ``None`` keeps scrolling until the browser is closed or
            the process is interrupted.

    Returns:
        Always an empty list: comments are saved as they arrive and are not
        accumulated for the caller.
    """
    page = get_global_taobao_browser()
    target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
    try:
        # Open the fixed product page and scroll the main page down a little.
        page.get(TARGET_URL)
        page.scroll.down(150)

        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []

        # Start listening BEFORE the click: the click itself can trigger the
        # first review request, which a listener started afterwards would miss.
        page.listen.start(target_url)
        element.click()
        time.sleep(3)

        seen_ids = set()
        idle_rounds = 0
        print("\n===============================")
        print("✅ 自动开始抓取评论,每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序,请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            page.scroll.to_bottom()
            print("已向下滚动到底部,等待接口返回数据...")
            resp = page.listen.wait(timeout=10)
            saved_any = False
            if resp and target_url in resp.url:
                json_data = extract_json_from_mtop(resp.response.body)
                fresh_comments = _pick_fresh_comments(json_data, seen_ids)
                if fresh_comments is None:
                    print("🚫 返回数据结构异常,无法提取评论")
                elif fresh_comments:
                    print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论,正在保存...")
                    # Persist right away so an interruption loses nothing.
                    save_taobao_comments_to_db(fresh_comments)
                    saved_any = True
                else:
                    print("⚠️ 本次无新评论,可能已抓取完毕")
            else:
                print("🚫 未捕获到新的评论数据,请确认是否已滚动并加载出更多评论")
            idle_rounds = 0 if saved_any else idle_rounds + 1
    except Exception as e:
        # Top-level boundary: closing the browser surfaces here and ends the run.
        print("发生错误:", e)
    return []
def save_taobao_comments_to_db(comments, product_id="735141569627"):
    """Insert comments that are not yet stored into ``taobao_comments``.

    Each comment is looked up by its id first and skipped when a row already
    exists. All new rows are committed together; if anything fails the whole
    batch is rolled back and the error is printed.

    Args:
        comments: Iterable of comment dicts. Keys read: ``id``, ``feedback``,
            ``userNick``, ``feedPicPathList``, ``feedbackDate``.
        product_id: Product id stored with every row. Defaults to the id that
            was previously hard-coded here.
    """
    session = Session()
    try:
        for comment in comments:
            # The column is String(100); normalise so that lookup and insert
            # use the same textual value even if the payload carries a number.
            raw_id = comment.get('id', '')
            comment_id = '' if raw_id is None else str(raw_id)
            feedback = comment.get('feedback', '无评论内容')
            user_nick = comment.get('userNick', '匿名用户')
            # The key may be present with a null value.
            pic_list = comment.get('feedPicPathList') or []
            # An empty string is not a valid DateTime value; store NULL instead.
            comment_date = comment.get('feedbackDate') or None

            exists = session.query(TaobaoComment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在:{comment_id}")
                continue

            # Keep only protocol-relative picture URLs; skip non-string entries.
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]
            session.add(TaobaoComment(
                product_id=product_id,
                user_name=user_nick,
                comment_text=feedback,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date,
            ))
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
# =================== Script entry point ===================
if __name__ == '__main__':
    print("开始抓取评论...")
    # Run the scraper. fetch_taobao_comments() saves comments itself while it
    # runs, so the save branch below only fires if it hands back a non-empty
    # result.
    comments = fetch_taobao_comments()
    if not comments:
        print("未获取到任何评论数据。")
    else:
        print(f"成功获取 {len(comments)} 条评论,正在保存到数据库...")
        save_taobao_comments_to_db(comments)
    print("浏览器保持打开状态,用于调试。")