199 lines
6.8 KiB
Python
199 lines
6.8 KiB
Python
import time
|
||
import random
|
||
import re
|
||
import json
|
||
import threading
|
||
|
||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
|
||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||
|
||
# =================== Configuration ===================
# Path to the Chrome executable (adjust to match the local installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Fixed product detail page URL.
# The query string used to read "id=735141569627<k2=...": the "&lt" at the
# start of the "&ltk2" parameter had been decoded as an HTML entity, which
# fused two parameters together and corrupted the item id.
TARGET_URL = "https://detail.tmall.com/item.htm?abbucket=1&id=735141569627&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq&ns=1&priceTId=213e074d17530938630755244e1109&skuId=5667837161089&spm=a21n57.1.hoverItem.2&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D&xxc=taobaoSearch"

# MySQL connection settings.
# NOTE(review): credentials are hard-coded in source control; consider
# loading them from the environment or an untracked config file.
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # change to your own password
    "database": "jd",
}
|
||
|
||
# Set up the database connection: build the SQLAlchemy URL from db_config,
# then create the engine, the session factory and the declarative base.
db_url = (
    "mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
    "?charset=utf8mb4"
).format(**db_config)
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()
|
||
|
||
|
||
# Taobao comment ORM model.
class TaobaoComment(Base):
    """ORM mapping for the ``taobao_comments`` table."""

    __tablename__ = 'taobao_comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # holds a JSON-encoded string
    # The default must be a callable so the timestamp is computed for each
    # insert; a plain value would be evaluated once at import time and then
    # reused for every row.
    created_at = Column(
        DateTime,
        default=lambda: time.strftime('%Y-%m-%d %H:%M:%S'),
    )
    comment_date = Column(DateTime)
|
||
|
||
|
||
# Create the table when it does not exist yet.
Base.metadata.create_all(bind=engine)


# Shared browser page instance (initialised only once).
global_taobao_page = None
|
||
|
||
|
||
def get_global_taobao_browser():
    """Return the shared ``ChromiumPage``, creating it on the first call.

    The page is built with ``CHROME_PATH`` as the browser executable and is
    cached in the module-level ``global_taobao_page`` for later calls.
    """
    global global_taobao_page

    # Reuse the cached page when one has already been created.
    if global_taobao_page is not None:
        return global_taobao_page

    browser_opts = ChromiumOptions()
    browser_opts.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_opts)
    return global_taobao_page
|
||
|
||
|
||
# Matches a JSONP wrapper such as ``mtopjsonppcdetail18({...})`` and captures
# the JSON object handed to the callback.
_JSONP_WRAPPER_RE = re.compile(
    r'[A-Za-z_$][\w$.]*\s*\(\s*(\{.*\})\s*\)',
    re.DOTALL,
)


def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip a JSONP wrapper such as ``mtopjsonppcdetail18(...)`` and parse the JSON.

    Args:
        raw_response: The response body. Normally a string, but a ``dict``
            (already parsed), ``bytes``/``bytearray`` (decoded as UTF-8) or
            ``None`` are tolerated as well.

    Returns:
        The decoded JSON object, or an empty dict when no JSON object can be
        found or it fails to parse. This function never raises on bad input.
    """
    # An already-parsed body needs no further work.
    if isinstance(raw_response, dict):
        return raw_response

    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = bytes(raw_response).decode('utf-8', errors='replace')

    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}

    text = raw_response.strip()
    if text.startswith('{'):
        # Plain JSON without any callback wrapper.
        payload = text
    else:
        match = _JSONP_WRAPPER_RE.search(text)
        if not match:
            print("未找到有效的 JSON 数据")
            return {}
        payload = match.group(1)

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}

    return parsed if isinstance(parsed, dict) else {}
|
||
|
||
|
||
def _pick_unseen_comments(batch, seen_ids):
    """Return the comments in *batch* whose ``id`` is not yet in *seen_ids*.

    Every returned comment's id is added to *seen_ids*. Comments without an
    id are skipped, and a ``None`` batch is treated as empty.
    """
    unseen = []
    for comment in batch or []:
        comment_id = comment.get('id', '')
        if not comment_id or comment_id in seen_ids:
            continue
        seen_ids.add(comment_id)
        unseen.append(comment)
    return unseen


def fetch_taobao_comments(max_idle_rounds=None):
    """Open the product page and save review batches as they are loaded.

    The function opens ``TARGET_URL``, clicks the "all reviews" button, then
    repeatedly scrolls to the bottom and waits for the review-list request.
    New comments from each captured response are written to the database
    straight away through ``save_taobao_comments_to_db``.

    Args:
        max_idle_rounds: Stop after this many consecutive rounds that saved
            no new comment. ``None`` (the default) keeps looping until the
            browser is closed or the process is interrupted.

    Returns:
        Always an empty list: comments are persisted as they arrive rather
        than handed back to the caller. Errors are printed, not raised.
    """
    page = get_global_taobao_browser()
    try:
        # Open the fixed product page.
        page.get(TARGET_URL)

        # Scroll the main page down a little.
        page.scroll.down(150)

        # Locate the "all reviews" button.
        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []

        # Start listening BEFORE the click: the click itself triggers the
        # first review-list request, which would otherwise go uncaptured.
        target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
        page.listen.start(target_url)
        element.click()
        time.sleep(3)

        seen_ids = set()

        print("\n===============================")
        print("✅ 自动开始抓取评论,每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序,请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        idle_rounds = 0
        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            # Scroll to the bottom so the page requests the next batch.
            page.scroll.to_bottom()
            print("已向下滚动到底部,等待接口返回数据...")

            saved_new = False
            resp = page.listen.wait(timeout=10)
            if resp and target_url in resp.url:
                json_data = extract_json_from_mtop(resp.response.body)
                data = json_data.get('data') if isinstance(json_data, dict) else None

                if isinstance(data, dict) and 'rateList' in data:
                    fresh_comments = _pick_unseen_comments(data['rateList'], seen_ids)
                    if fresh_comments:
                        print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论,正在保存...")
                        # Persist immediately so an interruption loses nothing.
                        save_taobao_comments_to_db(fresh_comments)
                        saved_new = True
                    else:
                        print("⚠️ 本次无新评论,可能已抓取完毕")
                else:
                    print("🚫 返回数据结构异常,无法提取评论")
            else:
                print("🚫 未捕获到新的评论数据,请确认是否已滚动并加载出更多评论")

            idle_rounds = 0 if saved_new else idle_rounds + 1

    except Exception as e:
        print("发生错误:", e)

    # Comments were saved in real time, so there is nothing to return.
    return []
|
||
|
||
|
||
|
||
# Date layouts tried when parsing a comment's ``feedbackDate`` string.
# NOTE(review): the upstream format is not visible in this file; confirm
# against a real response.
_COMMENT_DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d',
    '%Y年%m月%d日 %H:%M',
    '%Y年%m月%d日',
)


def _parse_comment_date(raw_date):
    """Convert a raw ``feedbackDate`` value into something a DateTime column accepts.

    Empty values become ``None`` (NULL) instead of an invalid empty string.
    Strings in a recognised layout become ``datetime`` objects. Anything
    else is passed through unchanged.
    """
    # Local import keeps this block self-contained; the file's import
    # section does not provide ``datetime``.
    from datetime import datetime

    if not raw_date:
        return None
    if not isinstance(raw_date, str):
        return raw_date

    text = raw_date.strip()
    if not text:
        return None
    for fmt in _COMMENT_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return raw_date


def save_taobao_comments_to_db(comments, product_id="735141569627"):
    """Insert the comments that are not stored yet into ``taobao_comments``.

    Args:
        comments: Iterable of comment dicts. The keys read are ``id``,
            ``feedback``, ``userNick``, ``feedPicPathList`` and
            ``feedbackDate``. Empty or ``None`` input is a no-op.
        product_id: Product id stored with every row. Defaults to the id of
            the fixed product page.

    A comment whose ``comment_id`` already exists in the table is skipped.
    All new rows are committed together. On any error the whole batch is
    rolled back and the error is printed rather than raised.
    """
    if not comments:
        return

    session = Session()
    try:
        for comment in comments:
            comment_id = comment.get('id', '')

            already_saved = session.query(TaobaoComment).filter_by(comment_id=comment_id).first()
            if already_saved:
                print(f"评论已存在:{comment_id}")
                continue

            # The picture list may be missing or null; keep only the
            # protocol-relative URLs, as before.
            pic_list = comment.get('feedPicPathList') or []
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]

            session.add(TaobaoComment(
                product_id=product_id,
                user_name=comment.get('userNick', '匿名用户'),
                comment_text=comment.get('feedback', '无评论内容'),
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=_parse_comment_date(comment.get('feedbackDate', '')),
            ))
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
|
||
|
||
|
||
# =================== Script entry point ===================
if __name__ == '__main__':
    print("开始抓取评论...")

    # Run the scraper; any comments it hands back are written to the database.
    comments = fetch_taobao_comments()
    if not comments:
        print("未获取到任何评论数据。")
    else:
        print(f"成功获取 {len(comments)} 条评论,正在保存到数据库...")
        save_taobao_comments_to_db(comments)

    print("浏览器保持打开状态,用于调试。")
|