Files
tb_pl/jd/jd.py
2026-04-26 13:39:19 +08:00

429 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import random
import threading
import time
from datetime import datetime

from flask import Flask, request, jsonify
from DrissionPage import ChromiumPage, ChromiumOptions
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base, sessionmaker
# =================== Configuration ===================
# Browser executable path (adjust to the local installation)
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# MySQL connection settings
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # change to your own password
    "database": "jd"
}
# Flask application instance
app = Flask(__name__)
# Lock guarding crawler start/stop state transitions
fetch_lock = threading.Lock()
# Global crawler control state (read/written by the Flask routes and threads)
crawler_running = False
crawler_thread = None
current_product_id = None
# product_id of the fetch task currently allowed to run; a new request
# overwrites it and older worker threads exit once they detect the mismatch
active_fetch_product_id = None
# Database engine / session factory
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()
# ORM model for comments
class Comment(Base):
    """ORM model for a single JD product comment (table ``comments``)."""
    __tablename__ = 'comments'
    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    # JSON-encoded list of large picture URLs
    picture_urls = Column(Text)
    # BUG FIX: the original used default=time.strftime(...), which is evaluated
    # ONCE at import time, stamping every row with the process start time.
    # Passing the callable makes SQLAlchemy evaluate it per INSERT.
    created_at = Column(DateTime, default=datetime.now)
    comment_date = Column(DateTime, default=datetime.now)
# Create the table if it does not exist yet
Base.metadata.create_all(engine)
# =================== Core crawler functions ===================
# Global browser instance (initialised once, reused across requests)
global_page = None
def get_global_browser():
    """Lazily create and return the process-wide ChromiumPage instance."""
    global global_page
    if global_page is not None:
        return global_page
    browser_opts = ChromiumOptions()
    browser_opts.set_browser_path(CHROME_PATH)
    global_page = ChromiumPage(browser_opts)
    return global_page
def _is_fetch_cancelled(product_id):
    """Return True when a newer request replaced this task's product_id
    (only the task for the most recent request is allowed to keep running)."""
    global active_fetch_product_id
    if active_fetch_product_id is None:
        return False
    return active_fetch_product_id != product_id
def fetch_jd_comments(product_id):
    """Scrape five-star comments for ``product_id`` from the JD item page.

    Drives the shared browser through the review popup, listens for the
    comment-list XHR, filters to new five-star comments and saves each
    batch via save_comments_to_db. Runs until cancelled by a newer request
    (see _is_fetch_cancelled). Returns the number of comments saved, or 0
    on cancellation/error.
    """
    global active_fetch_product_id
    page = get_global_browser()  # reuse the shared browser instance
    try:
        # Open the product page
        page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
        time.sleep(random.uniform(5, 8))
        if _is_fetch_cancelled(product_id):
            return 0
        # Scroll the main page down a little to trigger lazy content
        page.scroll.down(150)
        time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0
        # Click the "买家赞不绝口" (buyer praise) entry to open the review popup;
        # fall back to the "好评率" (positive-rate) element on other layouts
        element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
        if element1:
            element1.click()
            time.sleep(random.uniform(3, 5))
        else:
            element1 = page.ele('xpath=//div[contains(text(), "好评率")]')
            if element1:
                element1.click()
                time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0
        # Click the "当前商品" (current product) tab if present
        element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
        if element2:
            element2.click()
            time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0
        # Locate the scrollable popup region that holds the review list
        popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
        if not popup:
            return 0
        # Click the "视频" (video) filter if present
        element3 = page.ele('xpath=//div[contains(text(), "视频")]')
        if element3:
            element3.click()
            time.sleep(random.uniform(3, 5))
        if _is_fetch_cancelled(product_id):
            return 0
        # Start capturing the comment-list XHR traffic
        page.listen.start('https://api.m.jd.com/client.action')
        retry_count = 0
        new_comments = []  # accumulated new comments (kept for the whole run)
        seen_ids = set()  # comment_ids already handled in this run
        total_comments_saved = 0  # running total persisted to the database
        # Keep fetching until cancelled by a newer request or stopped manually
        while True:
            if _is_fetch_cancelled(product_id):
                print(f"[fetch_jd_comments] 商品 {product_id} 已被新请求取消,退出")
                break
            # Randomised scroll distance to trigger the next page load
            scroll_amount = random.randint(10000, 100000)
            popup.scroll.down(scroll_amount)
            print(f"弹窗向下滚动了 {scroll_amount} 像素")
            time.sleep(random.uniform(3, 5))
            if _is_fetch_cancelled(product_id):
                break
            resp = page.listen.wait(timeout=5)
            # NOTE(review): assumes resp.request.postData is a string when a
            # response arrives — TODO confirm DrissionPage never returns None/dict here
            if resp and 'getCommentListPage' in resp.request.postData:
                json_data = resp.response.body
                if 'result' in json_data and 'floors' in json_data['result']:
                    # floors[2] appears to hold the comment list on this page
                    # layout — fragile, verify if JD changes the response shape
                    comment_floor = json_data['result']['floors'][2]
                    if 'data' in comment_floor and isinstance(comment_floor['data'], list):
                        batch_comments = comment_floor['data']
                        # Collect the genuinely new comments in this batch
                        fresh_comments = []
                        for comment in batch_comments:
                            comment_info = comment.get('commentInfo', {})
                            comment_id = comment_info.get('commentId', '')
                            comment_score = comment_info.get('commentScore', '')  # rating field
                            if not comment_id:
                                continue
                            # Keep five-star reviews only
                            if comment_score != '5':
                                print(f"跳过非五星评论:{comment_id},评分为 {comment_score}")
                                continue
                            # Skip if already collected this run or already in the DB
                            exists_in_db = False
                            if comment_id in seen_ids:
                                exists_in_db = True
                            else:
                                session = Session()
                                exists_in_db = session.query(Comment).filter_by(comment_id=comment_id).first() is not None
                                session.close()
                            if exists_in_db:
                                print(f"评论已存在:{comment_id}")
                                continue
                            seen_ids.add(comment_id)
                            fresh_comments.append(comment)
                        if fresh_comments:
                            print(f"本次获取到 {len(fresh_comments)} 条新评论")
                            new_comments.extend(fresh_comments)
                            retry_count = 0  # got new data, reset the retry counter
                            # Persist this batch immediately
                            save_comments_to_db(product_id, fresh_comments)
                            total_comments_saved += len(fresh_comments)
                            print(f"已保存 {len(fresh_comments)} 条评论到数据库,总计保存 {total_comments_saved} 条评论")
                        else:
                            print("本次无新评论,继续滚动...")
                            retry_count += 1
                    else:
                        print("未找到有效的评论列表")
                        retry_count += 1
                else:
                    print("返回数据结构异常")
                    retry_count += 1
            else:
                print("未捕获到新的评论数据,继续滚动...")
                retry_count += 1
            # NOTE(review): retry_count is incremented but never checked, so the
            # loop only ends via cancellation — confirm this is intentional
            if _is_fetch_cancelled(product_id):
                break
        print(f"爬虫已停止,共抓取到 {total_comments_saved} 条评论")
        return total_comments_saved
    except Exception as e:
        print("发生错误:", e)
        return 0
# =================== Continuous background crawler ===================
def continuous_crawler(product_id):
    """Background loop: repeatedly fetch comments for ``product_id``.

    Runs in a daemon thread started by /start_crawler and keeps calling
    fetch_jd_comments while the global ``crawler_running`` flag is set.
    On an unexpected error the flag is cleared so /crawler_status reflects
    the stop.
    """
    global crawler_running
    try:
        print(f"开始持续爬取商品 {product_id} 的评论...")
        while crawler_running:
            # FIX: the original bound the return value to an unused local.
            fetch_jd_comments(product_id)
            if not crawler_running:
                break
            # Unconditional pause between rounds so the site is not hammered
            # (the original comment claimed this only happened on empty results).
            time.sleep(10)
        print(f"商品 {product_id} 的持续爬取已停止")
    except Exception as e:
        print(f"持续爬虫发生错误: {e}")
        crawler_running = False
# =================== Extract comments and persist them ===================
def save_comments_to_db(product_id, comments):
    """Persist a batch of raw JD comment dicts, skipping duplicates.

    Args:
        product_id: fallback product id used when a comment carries none.
        comments: list of raw comment dicts as captured from the JD API.

    All rows are committed in one transaction; on any error the whole batch
    is rolled back and the error is printed (best-effort, no raise).
    """
    session = Session()
    try:
        for comment in comments:
            comment_info = comment.get('commentInfo', {})
            comment_id = comment_info.get('commentId', '')
            # Skip comments that carry no comment_id
            if not comment_id:
                print("跳过无 comment_id 的评论")
                continue
            # Skip comments already stored
            if session.query(Comment).filter_by(comment_id=comment_id).first():
                print(f"评论已存在:{comment_id}")
                continue
            user_name = comment_info.get('userNickName', '匿名用户')
            comment_text = comment_info.get('commentData', '无评论内容')
            # FIX: the original rebound the *parameter* product_id here, so a
            # later comment without a productId silently inherited the previous
            # comment's id; use a per-row variable instead.
            row_product_id = comment_info.get('productId', product_id)
            picture_list = comment_info.get('pictureInfoList', [])
            comment_date = comment_info.get('commentDate', '')
            picture_urls = [pic.get('largePicURL') for pic in picture_list if pic.get('largePicURL')]
            new_comment = Comment(
                product_id=row_product_id,
                user_name=user_name,
                comment_text=comment_text,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                # FIX: an empty string is not a valid DATETIME value; store
                # NULL instead so the column default applies.
                comment_date=comment_date or None
            )
            session.add(new_comment)
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
# =================== Flask API 接口 ===================
@app.route('/start_crawler', methods=['POST'])
def start_crawler():
    """Start the continuous crawler for ?product_id=... (no-op if running).

    Returns 400 when product_id is missing, 200 with status
    "already_running" or "started" otherwise.
    """
    global crawler_running, crawler_thread, current_product_id
    product_id = request.args.get('product_id')
    if not product_id:
        return jsonify({"error": "缺少 product_id"}), 400
    try:
        # FIX: the running check and the state transition must happen under
        # the same lock; the original checked crawler_running before taking
        # fetch_lock, so two concurrent requests could both pass the check
        # and start two crawler threads (check-then-act race).
        with fetch_lock:
            if crawler_running:
                return jsonify({
                    "message": f"爬虫已在运行中当前商品ID: {current_product_id}",
                    "status": "already_running"
                }), 200
            crawler_running = True
            current_product_id = product_id
            crawler_thread = threading.Thread(target=continuous_crawler, args=(product_id,))
            crawler_thread.daemon = True
            crawler_thread.start()
        return jsonify({
            "message": f"已启动持续爬虫商品ID: {product_id}",
            "status": "started",
            "product_id": product_id
        }), 200
    except Exception as e:
        crawler_running = False
        return jsonify({"error": str(e)}), 500
@app.route('/stop_crawler', methods=['POST'])
def stop_crawler():
    """Stop the continuous crawler and wait (up to 10s) for its thread.

    Returns 200 with status "not_running" when idle, "stopped" otherwise.
    """
    global crawler_running, crawler_thread, current_product_id
    try:
        # FIX: read-and-clear under the lock; the original checked
        # crawler_running before acquiring fetch_lock, racing with
        # /start_crawler.
        with fetch_lock:
            if not crawler_running:
                return jsonify({
                    "message": "爬虫未在运行",
                    "status": "not_running"
                }), 200
            crawler_running = False
            stopped_product_id = current_product_id
            current_product_id = None
        # Join outside the lock so other endpoints are not blocked while waiting
        if crawler_thread and crawler_thread.is_alive():
            crawler_thread.join(timeout=10)
        return jsonify({
            "message": f"已停止持续爬虫商品ID: {stopped_product_id}",
            "status": "stopped",
            "product_id": stopped_product_id
        }), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route('/crawler_status', methods=['GET'])
def crawler_status():
    """Report whether the continuous crawler is running and for which product."""
    state = "running" if crawler_running else "stopped"
    payload = {
        "running": crawler_running,
        "product_id": current_product_id,
        "status": state,
    }
    return jsonify(payload), 200
@app.route('/test', methods=['GET'])
def test():
    """Health-check endpoint confirming the server is responsive."""
    print("测试端点被访问")
    payload = {"message": "服务器运行正常", "status": "ok"}
    return jsonify(payload), 200
@app.route('/fetch_comments', methods=['GET', 'POST'])
def fetch_comments():
    """One-shot comment fetch, run in the background with an immediate reply.

    A new request cancels every previous fetch: the continuous crawler flag
    is cleared and active_fetch_product_id is repointed under fetch_lock, so
    older worker threads exit at their next cancellation check while only
    this request's thread keeps running.
    """
    global crawler_running, active_fetch_product_id
    print(f"[fetch_comments] 收到请求,方法: {request.method}, 参数: {request.args}")
    product_id = request.args.get('product_id')
    if not product_id:
        print("[fetch_comments] 错误: 缺少 product_id")
        return jsonify({"error": "缺少 product_id"}), 400
    print(f"[fetch_comments] 开始处理商品ID: {product_id},将中断所有历史请求后执行")
    try:
        # Cancel all previous work immediately: stop the continuous crawler and
        # mark the "current task" with the new product_id; old threads detect
        # the mismatch in their loops and exit on their own.
        with fetch_lock:
            crawler_running = False
            active_fetch_product_id = product_id
        def run_fetch():
            # Worker body: runs fetch_jd_comments and logs the outcome;
            # errors are printed with a traceback, never propagated.
            try:
                print(f"[后台线程] 开始获取商品 {product_id} 的评论...")
                result = fetch_jd_comments(product_id)
                print(f"[后台线程] 获取完成,结果: {result}")
            except Exception as e:
                import traceback
                error_msg = f"后台获取评论时发生错误: {e}\n{traceback.format_exc()}"
                print(f"[后台线程] {error_msg}")
        # Daemon thread so a hanging fetch cannot block interpreter shutdown
        fetch_thread = threading.Thread(target=run_fetch)
        fetch_thread.daemon = True
        fetch_thread.start()
        print(f"[fetch_comments] 后台线程已启动(历史请求已标记为取消)")
        response_data = {
            "message": f"已开始获取商品 {product_id} 的评论,正在后台运行中...(已中断之前的请求)",
            "status": "started",
            "product_id": product_id,
            "note": "评论获取在后台进行,请稍后查看数据库或使用 /crawler_status 查看状态"
        }
        print(f"[fetch_comments] 返回响应: {response_data}")
        return jsonify(response_data), 200
    except Exception as e:
        import traceback
        error_msg = f"处理请求时发生错误: {e}\n{traceback.format_exc()}"
        print(f"[fetch_comments] {error_msg}")
        return jsonify({"error": str(e)}), 500
# =================== Service entry point ===================
if __name__ == '__main__':
    try:
        app.run(host='0.0.0.0', port=5008, debug=True)
    finally:
        # Shut down the shared browser, if one was ever created.
        browser = globals().get('global_page')
        if browser:
            browser.quit()
            print("浏览器已关闭")