This commit is contained in:
2025-08-13 16:02:21 +08:00
commit fa25bfd784
11 changed files with 3403 additions and 0 deletions

198
jd/tb.py Normal file
View File

@@ -0,0 +1,198 @@
import json
import random
import re
import threading
import time
from datetime import datetime
from urllib.parse import quote

from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base
# =================== Configuration ===================
# Path to the Chrome executable (adjust to match the local installation)
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# Fixed product detail page URL that the scraper opens
TARGET_URL = "https://detail.tmall.com/item.htm?abbucket=1&id=735141569627&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq&ns=1&priceTId=213e074d17530938630755244e1109&skuId=5667837161089&spm=a21n57.1.hoverItem.2&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D&xxc=taobaoSearch"
# MySQL connection settings.
# NOTE(review): the credentials are hard-coded in source control; consider
# loading them from environment variables or an untracked config file.
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # change this to your own password
    "database": "jd",
}
# Build the SQLAlchemy connection URL. The user name and password are
# percent-encoded so that characters such as "@", ":" or "/" inside a password
# cannot be mistaken for URL delimiters. For credentials without special
# characters the resulting URL is identical to plain interpolation.
db_url = (
    f"mysql+pymysql://{quote(str(db_config['user']), safe='')}"
    f":{quote(str(db_config['password']), safe='')}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
)
# Engine, session factory and declarative base shared by the rest of the module.
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()
# ORM model for one scraped Taobao/Tmall product review.
class TaobaoComment(Base):
    """Row of the ``taobao_comments`` table.

    Columns:
        id: auto-generated integer primary key.
        product_id: id of the product the review belongs to (required).
        user_name: nickname of the reviewer.
        comment_text: review text.
        comment_id: review id as reported by the remote API.
        picture_urls: JSON-encoded list of picture URLs.
        created_at: time the row was inserted.
        comment_date: date of the review as reported by the remote API.
    """

    __tablename__ = 'taobao_comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # stored as a JSON string
    # Pass the callable itself: SQLAlchemy invokes it on every INSERT. Passing
    # the *result* of a call would freeze the timestamp at module import time,
    # so every row would carry the same value.
    created_at = Column(DateTime, default=datetime.now)
    comment_date = Column(DateTime)
# Create the tables registered on Base (if they do not exist yet)
Base.metadata.create_all(engine)
# Module-wide browser page instance (initialised only once); it starts as None
# and is assigned by get_global_taobao_browser() on first use
global_taobao_page = None
def get_global_taobao_browser():
    """Return the shared browser page, creating it on the first call.

    The page is built from ``ChromiumOptions`` pointing at ``CHROME_PATH`` and
    cached in the module-level ``global_taobao_page`` so later calls reuse it.
    """
    global global_taobao_page
    # Fast path: a page was already created by an earlier call.
    if global_taobao_page is not None:
        return global_taobao_page
    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_options)
    return global_taobao_page
def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip an mtop JSONP wrapper such as ``mtopjsonppcdetail18(...)`` and return the JSON.

    Args:
        raw_response: Response body. Normally the JSONP text; a ``dict`` (body
            already decoded by the caller/browser layer) is returned unchanged
            and ``bytes`` are decoded as UTF-8 before parsing.

    Returns:
        The decoded JSON object, or an empty dict when no wrapper is found,
        the payload is not valid JSON, or the input type is unsupported.
    """
    # Already-decoded payload: nothing to unwrap.
    if isinstance(raw_response, dict):
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = bytes(raw_response).decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}

    # Accept any "mtopjsonp..." callback name (e.g. mtopjsonp3,
    # mtopjsonppcdetail18) rather than a single hard-coded prefix.
    match = re.search(r'mtopjsonp\w*\s*\(\s*(\{.*\})\s*\)', raw_response, re.DOTALL)
    if match is None:
        print("未找到有效的 JSON 数据")
        return {}
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}
def _rate_list_from_payload(payload):
    """Return the ``data.rateList`` list of a decoded response, or None if the shape is wrong."""
    if not isinstance(payload, dict):
        return None
    data = payload.get('data')
    if not isinstance(data, dict) or 'rateList' not in data:
        return None
    rate_list = data['rateList']
    # A null / non-list rateList is treated as "no comments in this batch".
    return rate_list if isinstance(rate_list, list) else []


def _take_unseen_comments(batch, seen_ids):
    """Return the comments in ``batch`` whose id is not yet in ``seen_ids`` and record them."""
    fresh = []
    for comment in batch:
        if not isinstance(comment, dict):
            continue
        comment_id = comment.get('id', '')
        if not comment_id or comment_id in seen_ids:
            continue
        seen_ids.add(comment_id)
        fresh.append(comment)
    return fresh


def fetch_taobao_comments(max_idle_rounds=5):
    """Scrape the reviews of the product at ``TARGET_URL`` and save them batch by batch.

    The function opens the product page, clicks the "全部评价" element, then
    repeatedly scrolls to the bottom and waits for the review-list API
    response. Every batch of not-yet-seen comments is passed to
    ``save_taobao_comments_to_db`` immediately.

    Args:
        max_idle_rounds: Stop after this many consecutive scroll rounds that
            produced no new comment (timeout, malformed response, or only
            duplicates). Pass ``None`` to keep scrolling until interrupted.

    Returns:
        Always an empty list: comments are persisted as they arrive and are
        not returned to the caller. Errors are printed, not raised.
    """
    page = get_global_taobao_browser()
    target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
    try:
        # Open the fixed product page and nudge the main page down a little.
        page.get(TARGET_URL)
        page.scroll.down(150)

        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []

        # Start listening BEFORE the click: a response triggered by the click
        # itself would be lost if the listener were started afterwards.
        page.listen.start(target_url)
        element.click()
        time.sleep(3)

        seen_ids = set()
        print("\n===============================")
        print("✅ 自动开始抓取评论,每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序,请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        idle_rounds = 0
        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            page.scroll.to_bottom()
            print("已向下滚动到底部,等待接口返回数据...")
            resp = page.listen.wait(timeout=10)
            if not (resp and target_url in resp.url):
                print("🚫 未捕获到新的评论数据,请确认是否已滚动并加载出更多评论")
                idle_rounds += 1
                continue

            raw_body = resp.response.body
            # The body may already be a decoded dict; only unwrap text bodies.
            if isinstance(raw_body, dict):
                json_data = raw_body
            else:
                json_data = extract_json_from_mtop(raw_body)

            batch_comments = _rate_list_from_payload(json_data)
            if batch_comments is None:
                print("🚫 返回数据结构异常,无法提取评论")
                idle_rounds += 1
                continue

            fresh_comments = _take_unseen_comments(batch_comments, seen_ids)
            if fresh_comments:
                print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论,正在保存...")
                save_taobao_comments_to_db(fresh_comments)  # persist immediately
                idle_rounds = 0
            else:
                print("⚠️ 本次无新评论,可能已抓取完毕")
                idle_rounds += 1

        # Reached only when the idle limit was hit.
        page.listen.stop()
        print(f"🛑 连续 {max_idle_rounds} 轮未获取到新评论,已停止抓取")
    except Exception as e:
        # Top-level boundary of the scraping run: report and fall through.
        print("发生错误:", e)
    return []
# Date layouts tried for the API's ``feedbackDate`` field.
# NOTE(review): this list is a best guess — confirm against live API responses.
_FEEDBACK_DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y年%m月%d日',
)


def _parse_feedback_date(raw_value):
    """Convert a raw ``feedbackDate`` value to ``datetime``, or None if empty/unparseable."""
    if isinstance(raw_value, datetime):
        return raw_value
    if not raw_value:
        return None
    text = str(raw_value).strip()
    for fmt in _FEEDBACK_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def save_taobao_comments_to_db(comments, product_id="735141569627"):
    """Insert comments that are not yet stored into ``taobao_comments``.

    Args:
        comments: Iterable of comment dicts. The keys read are ``id``,
            ``feedback``, ``userNick``, ``feedPicPathList`` and
            ``feedbackDate``.
        product_id: Product id stored with every row; defaults to the product
            this script was written for.

    The whole batch is committed in one transaction. On any error the
    transaction is rolled back and the error is printed, not raised.
    """
    session = Session()
    try:
        for comment in comments or ():
            raw_id = comment.get('id', '')
            # The column is a string; normalise so lookups and inserts agree.
            comment_id = '' if raw_id is None else str(raw_id)
            feedback = comment.get('feedback', '无评论内容')
            user_nick = comment.get('userNick', '匿名用户')
            # The key may be present with a null value.
            pic_list = comment.get('feedPicPathList') or []
            raw_date = comment.get('feedbackDate', '')

            exists = session.query(TaobaoComment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在:{comment_id}")
                continue

            # An empty string is not a valid DATETIME value and would make the
            # whole batch fail; store NULL when the date is missing/unparseable.
            comment_date = _parse_feedback_date(raw_date)
            if raw_date and comment_date is None:
                print(f"无法解析评论日期,已置空: {raw_date!r}")

            # Only protocol-relative URLs ("//host/...") are kept.
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]
            new_comment = TaobaoComment(
                product_id=product_id,
                user_name=user_nick,
                comment_text=feedback,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date,
            )
            session.add(new_comment)
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
# =================== Program entry point ===================
if __name__ == '__main__':
    print("开始抓取评论...")
    try:
        # Fetch the comments.
        comments = fetch_taobao_comments()
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop the scraper. KeyboardInterrupt
        # is not an Exception subclass, so it escapes the handler inside
        # fetch_taobao_comments(); catch it here to exit without a traceback.
        print("\n已通过 Ctrl+C 中断抓取。")
    else:
        # NOTE(review): fetch_taobao_comments() saves each batch itself while
        # scraping, so a falsy result here does not mean nothing was stored.
        if comments:
            print(f"成功获取 {len(comments)} 条评论,正在保存到数据库...")
            save_taobao_comments_to_db(comments)
        else:
            print("未获取到任何评论数据。")
    print("浏览器保持打开状态,用于调试。")