1
This commit is contained in:
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
17
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
17
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@@ -0,0 +1,17 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="unused" enabled="true" level="ERROR" enabled_by_default="true" editorAttributes="ERRORS_ATTRIBUTES" checkParameterExcludingHierarchy="false">
|
||||
<option name="LOCAL_VARIABLE" value="true" />
|
||||
<option name="FIELD" value="true" />
|
||||
<option name="METHOD" value="true" />
|
||||
<option name="CLASS" value="true" />
|
||||
<option name="PARAMETER" value="true" />
|
||||
<option name="REPORT_PARAMETER_FOR_PUBLIC_METHODS" value="true" />
|
||||
<option name="ADD_MAINS_TO_ENTRIES" value="true" />
|
||||
<option name="ADD_APPLET_TO_ENTRIES" value="true" />
|
||||
<option name="ADD_SERVLET_TO_ENTRIES" value="true" />
|
||||
<option name="ADD_NONJAVA_TO_ENTRIES" value="true" />
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
10
.idea/jdpl.iml
generated
Normal file
10
.idea/jdpl.iml
generated
Normal file
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.12 (jdpl)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (jdpl)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/jdpl.iml" filepath="$PROJECT_DIR$/.idea/jdpl.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
262
jd/jd.py
Normal file
262
jd/jd.py
Normal file
@@ -0,0 +1,262 @@
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
import threading
|
||||
|
||||
from flask import Flask, request, jsonify
|
||||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
# =================== 配置部分 ===================
|
||||
# 浏览器路径(请根据本地实际路径修改)
|
||||
# Path to the local Chrome executable (adjust to the local installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# MySQL connection settings.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or an untracked config file instead.
db_config = dict(
    host="192.168.8.88",
    port=3306,
    user="root",
    password="mysql_7sjTXH",  # replace with your own password
    database="jd",
)

# Flask application object that the route decorators attach to.
app = Flask(__name__)
# Module-level lock shared by the request handlers.
fetch_lock = threading.Lock()

# Database engine, session factory and declarative base used by the models.
db_url = (
    "mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
    "?charset=utf8mb4"
).format(**db_config)
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)

Base = declarative_base()
|
||||
|
||||
|
||||
# 定义评论模型
|
||||
def _current_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime('%Y-%m-%d %H:%M:%S')


class Comment(Base):
    """ORM model for one scraped review, stored in the ``comments`` table."""

    __tablename__ = 'comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # stores a JSON string
    # The defaults are callables so the timestamp is computed for every
    # INSERT. Passing the formatted string itself would evaluate it once at
    # import time and stamp every row with the process start time.
    created_at = Column(DateTime, default=_current_timestamp)
    comment_date = Column(DateTime, default=_current_timestamp)


# Create the table if it does not exist yet.
Base.metadata.create_all(engine)
|
||||
# =================== 核心爬虫函数 ===================
|
||||
# 全局浏览器实例(只初始化一次)
|
||||
# Shared browser page; stays None until the first call to get_global_browser().
global_page = None


def get_global_browser():
    """Return the shared ChromiumPage, creating it on the first call.

    The page is built with ``CHROME_PATH`` as the browser executable and is
    cached in the module-level ``global_page`` for later calls.
    """
    global global_page
    if global_page is not None:
        return global_page

    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_page = ChromiumPage(browser_options)
    return global_page
|
||||
|
||||
|
||||
def _click_and_pause(page, xpath):
    """Click the first element matching *xpath* and pause 3-5 s; return True if it was found."""
    element = page.ele(xpath)
    if not element:
        return False
    element.click()
    time.sleep(random.uniform(3, 5))
    return True


def _is_comment_list_packet(packet):
    """Return True when a captured packet's post data mentions ``getCommentListPage``."""
    if not packet:
        return False
    try:
        return 'getCommentListPage' in packet.request.postData
    except TypeError:
        # postData is not a container (presumably a request without a body);
        # treat it as "not the comment-list call" instead of aborting.
        return False


def _extract_comment_batch(body):
    """Locate the comment list under ``result.floors[2].data``.

    Returns ``(batch, None)`` on success, or ``(None, message)`` where
    *message* is the text to log when the structure is not as expected.
    """
    result = body.get('result') if isinstance(body, dict) else None
    if not isinstance(result, dict) or 'floors' not in result:
        return None, "返回数据结构异常"

    floors = result['floors']
    # The comment list is read from the third floor; a shorter list used to
    # raise IndexError and abort the whole scrape.
    if not isinstance(floors, (list, tuple)) or len(floors) < 3:
        return None, "返回数据结构异常"

    floor = floors[2]
    if not isinstance(floor, dict) or not isinstance(floor.get('data'), list):
        return None, "未找到有效的评论列表"
    return floor['data'], None


def _select_new_five_star(batch, seen_ids):
    """Return the five-star comments in *batch* that are neither in *seen_ids* nor in the DB.

    IDs of the returned comments are added to *seen_ids*.
    """
    fresh = []
    # One session for the whole batch, always closed, instead of one
    # (potentially leaked) session per comment.
    session = Session()
    try:
        for comment in batch:
            info = comment.get('commentInfo') or {}
            comment_id = info.get('commentId', '')
            score = info.get('commentScore', '')

            if not comment_id:
                continue

            # Keep five-star reviews only; str() also accepts a numeric 5.
            if str(score) != '5':
                print(f"跳过非五星评论:{comment_id},评分为 {score}")
                continue

            already_known = comment_id in seen_ids or (
                session.query(Comment).filter_by(comment_id=comment_id).first() is not None
            )
            if already_known:
                print(f"评论已存在:{comment_id}")
                continue

            seen_ids.add(comment_id)
            fresh.append(comment)
    finally:
        session.close()
    return fresh


def fetch_jd_comments(product_id):
    """Scrape new five-star comments for a JD product through the shared browser.

    Opens the product page, clicks through to the review popup, then scrolls
    the popup while listening for ``client.action`` requests whose post data
    mentions ``getCommentListPage``.

    Args:
        product_id: JD item ID, interpolated into the product page URL.

    Returns:
        A list of at most 10 raw comment dicts that are rated five stars and
        not yet present in the ``comments`` table. An empty list is returned
        when the review popup cannot be found. If an error occurs part-way,
        the comments collected up to that point are returned.
    """
    target_count = 10
    max_retries = 10  # consecutive scrolls without new comments before giving up
    new_comments = []

    page = get_global_browser()
    try:
        # Open the product page.
        page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
        time.sleep(random.uniform(5, 8))

        # Scroll the main page down a little.
        page.scroll.down(150)
        time.sleep(random.uniform(3, 5))

        # Open the reviews: try the first label, fall back to the second.
        if not _click_and_pause(page, 'xpath=//div[contains(text(), "买家赞不绝口")]'):
            _click_and_pause(page, 'xpath=//div[contains(text(), "好评率")]')

        # Restrict the reviews to the current product.
        _click_and_pause(page, 'xpath=//div[contains(text(), "当前商品")]')

        # Locate the scrollable popup area.
        popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
        if not popup:
            return []

        # Switch to the tab whose label contains "视频".
        _click_and_pause(page, 'xpath=//div[contains(text(), "视频")]')

        # Start capturing the API requests.
        page.listen.start('https://api.m.jd.com/client.action')

        retry_count = 0
        seen_ids = set()  # comment IDs already collected in this run

        while retry_count < max_retries and len(new_comments) < target_count:
            scroll_amount = random.randint(10000, 100000)
            popup.scroll.down(scroll_amount)
            print(f"弹窗向下滚动了 {scroll_amount} 像素")

            time.sleep(random.uniform(3, 5))

            resp = page.listen.wait(timeout=5)
            if not _is_comment_list_packet(resp):
                print("未捕获到新的评论数据,继续滚动...")
                retry_count += 1
                continue

            batch, problem = _extract_comment_batch(resp.response.body)
            if batch is None:
                print(problem)
                retry_count += 1
                continue

            fresh_comments = _select_new_five_star(batch, seen_ids)
            if fresh_comments:
                print(f"本次获取到 {len(fresh_comments)} 条新评论")
                new_comments.extend(fresh_comments)
                retry_count = 0  # new data arrived, reset the retry counter
            else:
                print("本次无新评论,继续滚动...")
                retry_count += 1

        print(f"共抓取到 {len(new_comments)} 条新评论(最多需要10条)")
        return new_comments[:target_count]

    except Exception as e:
        print("发生错误:", e)
        # Keep what was already collected instead of discarding it.
        return new_comments[:target_count]
|
||||
|
||||
|
||||
|
||||
# =================== 提取评论并保存到数据库 ===================
|
||||
def save_comments_to_db(product_id, comments):
    """Persist scraped JD comments, skipping entries without an ID or already stored.

    All new rows are committed together; if anything fails the whole batch is
    rolled back and the error is printed.

    Args:
        product_id: Fallback product ID for comments that carry no
            ``productId`` of their own.
        comments: Iterable of raw comment dicts, each holding a
            ``commentInfo`` mapping.

    Returns:
        The number of rows written, or 0 when the batch was rolled back.
    """
    session = Session()
    saved = 0
    try:
        for comment in comments:
            info = comment.get('commentInfo') or {}
            comment_id = info.get('commentId', '')

            # A comment without an ID cannot be de-duplicated; skip it.
            if not comment_id:
                print("跳过无 comment_id 的评论")
                continue

            # Skip comments that are already stored.
            if session.query(Comment).filter_by(comment_id=comment_id).first():
                print(f"评论已存在:{comment_id}")
                continue

            picture_list = info.get('pictureInfoList') or []
            picture_urls = [
                pic.get('largePicURL') for pic in picture_list if pic.get('largePicURL')
            ]

            fields = {
                # Resolved per comment into its own key: rebinding the
                # ``product_id`` parameter here would leak one comment's
                # product ID into the following ones.
                'product_id': info.get('productId', product_id),
                'user_name': info.get('userNickName', '匿名用户'),
                'comment_text': info.get('commentData', '无评论内容'),
                'comment_id': comment_id,
                'picture_urls': json.dumps(picture_urls, ensure_ascii=False),
            }
            # Only pass a date when one is present; an empty string is not a
            # usable value for the DateTime column, and omitting the field
            # lets the column default apply.
            comment_date = info.get('commentDate', '')
            if comment_date:
                fields['comment_date'] = comment_date

            session.add(Comment(**fields))
            saved += 1

        session.commit()
        return saved
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
        return 0
    finally:
        session.close()
|
||||
|
||||
|
||||
# =================== Flask API 接口 ===================
|
||||
@app.route('/fetch_comments', methods=['POST'])
def fetch_comments():
    """Scrape and store comments for the product given by ``product_id``.

    ``product_id`` is read from the query string or form data, and otherwise
    from a JSON body.

    Responses:
        200: comments were fetched and handed to the database layer.
        400: ``product_id`` is missing or is not purely numeric.
        404: the scrape produced no comments.
        500: an unexpected error occurred.
    """
    # request.values merges the query string and form data; the query string
    # alone was the only source before, so existing callers keep working.
    product_id = request.values.get('product_id')
    if not product_id:
        payload = request.get_json(silent=True)
        if isinstance(payload, dict):
            product_id = payload.get('product_id')

    product_id = '' if product_id is None else str(product_id).strip()
    if not product_id:
        return jsonify({"error": "缺少 product_id"}), 400
    # The value is interpolated into the URL opened by the browser, so only
    # plain ASCII digits are accepted.
    if not (product_id.isascii() and product_id.isdigit()):
        return jsonify({"error": "product_id 格式不正确"}), 400

    try:
        with fetch_lock:  # serialize access to the shared browser
            comments = fetch_jd_comments(product_id)
            if not comments:
                return jsonify({"message": "未获取到评论数据"}), 404

            save_comments_to_db(product_id, comments)

        return jsonify({
            "message": f"成功保存 {len(comments)} 条评论",
            "product_id": product_id
        }), 200

    except Exception as e:
        # -200 is not a valid HTTP status; report failures as a server error.
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
|
||||
# =================== 启动服务 ===================
|
||||
if __name__ == '__main__':
    try:
        # Debug mode stays off: the server listens on all interfaces, and the
        # Werkzeug interactive debugger lets anyone who can reach it run
        # arbitrary code.
        app.run(host='0.0.0.0', port=5000, debug=False)
    finally:
        # Close the shared browser if one was ever started.
        if global_page:
            global_page.quit()
            print("浏览器已关闭")
|
||||
2737
jd/response.txt
Normal file
2737
jd/response.txt
Normal file
File diff suppressed because it is too large
Load Diff
198
jd/tb.py
Normal file
198
jd/tb.py
Normal file
@@ -0,0 +1,198 @@
|
||||
import time
|
||||
import random
|
||||
import re
|
||||
import json
|
||||
import threading
|
||||
|
||||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
|
||||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||||
|
||||
# =================== 配置部分 ===================
|
||||
# 浏览器路径(请根据本地实际路径修改)
|
||||
# Path to the local Chrome executable (adjust to the local installation).
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Fixed product detail page to scrape.
# The "&ltk2=" parameter had been mangled into "<k2=" (an HTML-entity
# decoding artifact), which also swallowed the separator after the item id.
TARGET_URL = (
    "https://detail.tmall.com/item.htm"
    "?abbucket=1"
    "&id=735141569627"
    "&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq"
    "&ns=1"
    "&priceTId=213e074d17530938630755244e1109"
    "&skuId=5667837161089"
    "&spm=a21n57.1.hoverItem.2"
    "&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D"
    "&xxc=taobaoSearch"
)

# MySQL connection settings.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or an untracked config file instead.
db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # replace with your own password
    "database": "jd"
}

# Database engine, session factory and declarative base used by the models.
db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=engine)
Base = declarative_base()
|
||||
|
||||
|
||||
# 定义淘宝评论模型
|
||||
def _current_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime('%Y-%m-%d %H:%M:%S')


class TaobaoComment(Base):
    """ORM model for one scraped review, stored in the ``taobao_comments`` table."""

    __tablename__ = 'taobao_comments'

    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # stores a JSON string
    # The default is a callable so the timestamp is computed for every
    # INSERT. Passing the formatted string itself would evaluate it once at
    # import time and stamp every row with the process start time.
    created_at = Column(DateTime, default=_current_timestamp)
    comment_date = Column(DateTime)


# Create the table if it does not exist yet.
Base.metadata.create_all(engine)
|
||||
|
||||
|
||||
# 全局浏览器实例(只初始化一次)
|
||||
# Shared browser page; stays None until get_global_taobao_browser() is first called.
global_taobao_page = None


def get_global_taobao_browser():
    """Return the shared ChromiumPage, creating it on the first call.

    The page is built with ``CHROME_PATH`` as the browser executable and is
    cached in the module-level ``global_taobao_page`` for later calls.
    """
    global global_taobao_page
    if global_taobao_page is not None:
        return global_taobao_page

    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_options)
    return global_taobao_page
|
||||
|
||||
|
||||
# Matches a JSONP wrapper such as ``mtopjsonppcdetail18({...})`` and captures
# the JSON object between the parentheses.
_JSONP_OBJECT = re.compile(r'[\w$.]+\s*\(\s*(\{.*\})\s*\)', re.DOTALL)


def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip a JSONP wrapper such as ``mtopjsonppcdetail18(...)`` and return the JSON object.

    Besides wrapped text, the function accepts bare JSON object text,
    ``bytes`` (decoded as UTF-8) and an already-parsed ``dict`` (returned
    unchanged), so a pre-parsed response body no longer raises ``TypeError``.

    Args:
        raw_response: Response body, normally JSONP text.

    Returns:
        The decoded JSON object, or ``{}`` when no JSON object can be
        extracted or decoding fails (a message is printed in both cases).
    """
    if isinstance(raw_response, dict):
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = raw_response.decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}

    text = raw_response.strip()
    if text.startswith('{'):
        # Bare JSON without a callback wrapper.
        payload = text
    else:
        match = _JSONP_OBJECT.search(text)
        if not match:
            print("未找到有效的 JSON 数据")
            return {}
        payload = match.group(1)

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}

    if not isinstance(parsed, dict):
        print("未找到有效的 JSON 数据")
        return {}
    return parsed
|
||||
|
||||
|
||||
def fetch_taobao_comments(max_idle_rounds=5):
    """Scrape comments from ``TARGET_URL`` and save each new batch as it arrives.

    Opens the product page, clicks the "全部评价" element, then repeatedly
    scrolls to the bottom while listening for the rate-list API response.
    Every batch of unseen comments is saved immediately via
    ``save_taobao_comments_to_db``.

    Args:
        max_idle_rounds: Stop after this many consecutive scrolls that yield
            no new comments. Pass ``None`` to keep scrolling until
            interrupted.

    Returns:
        The list of comment dicts collected (and already saved) in this run.
        It is empty when the review button is not found, and holds whatever
        was collected so far when an error interrupts the run.
    """
    collected = []
    page = get_global_taobao_browser()
    try:
        # Open the fixed product page.
        page.get(TARGET_URL)

        # Scroll the main page down a little.
        page.scroll.down(150)

        # Open the full review list.
        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []
        element.click()
        time.sleep(3)

        # Start capturing the rate-list API requests.
        target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
        page.listen.start(target_url)

        seen_ids = set()

        print("\n===============================")
        print("✅ 自动开始抓取评论,每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序,请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")

        idle_rounds = 0
        # Bounded by consecutive empty rounds; without a bound the loop would
        # keep scrolling forever once every comment had been seen.
        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            page.scroll.to_bottom()
            print("已向下滚动到底部,等待接口返回数据...")

            resp = page.listen.wait(timeout=10)
            if not (resp and target_url in resp.url):
                print("🚫 未捕获到新的评论数据,请确认是否已滚动并加载出更多评论")
                idle_rounds += 1
                continue

            raw_body = resp.response.body
            # The body may already be parsed; only unwrap it when it is not.
            json_data = raw_body if isinstance(raw_body, dict) else extract_json_from_mtop(raw_body)

            data = json_data.get('data') if isinstance(json_data, dict) else None
            if not isinstance(data, dict) or 'rateList' not in data:
                print("🚫 返回数据结构异常,无法提取评论")
                idle_rounds += 1
                continue

            fresh_comments = []
            for comment in data['rateList'] or []:
                comment_id = comment.get('id', '')
                if not comment_id or comment_id in seen_ids:
                    continue
                seen_ids.add(comment_id)
                fresh_comments.append(comment)

            if fresh_comments:
                print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论,正在保存...")
                save_taobao_comments_to_db(fresh_comments)  # save right away
                collected.extend(fresh_comments)
                idle_rounds = 0
            else:
                print("⚠️ 本次无新评论,可能已抓取完毕")
                idle_rounds += 1

        return collected

    except Exception as e:
        print("发生错误:", e)
        return collected
|
||||
|
||||
|
||||
|
||||
def save_taobao_comments_to_db(comments, product_id=None):
    """Persist scraped Taobao comments, skipping entries without an ID or already stored.

    All new rows are committed together; if anything fails the whole batch is
    rolled back and the error is printed.

    Args:
        comments: Iterable of raw comment dicts from the rate-list response.
        product_id: Product ID stored with every row. When omitted it is
            taken from the ``id`` query parameter of ``TARGET_URL``, falling
            back to the previously hard-coded value.
    """
    if product_id is None:
        id_match = re.search(r'[?&]id=(\d+)', TARGET_URL)
        product_id = id_match.group(1) if id_match else "735141569627"

    session = Session()
    try:
        for comment in comments:
            comment_id = comment.get('id', '')
            # A comment without an ID cannot be de-duplicated; skip it.
            if not comment_id:
                print("跳过无 comment_id 的评论")
                continue

            if session.query(TaobaoComment).filter_by(comment_id=comment_id).first():
                print(f"评论已存在:{comment_id}")
                continue

            # Keep protocol-relative URLs only; tolerate a missing list and
            # non-string entries.
            pic_list = comment.get('feedPicPathList') or []
            picture_urls = [
                url for url in pic_list
                if isinstance(url, str) and url.startswith('//')
            ]

            session.add(TaobaoComment(
                product_id=product_id,
                user_name=comment.get('userNick', '匿名用户'),
                comment_text=comment.get('feedback', '无评论内容'),
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                # Store NULL rather than an empty string in the DateTime column.
                comment_date=comment.get('feedbackDate') or None,
            ))
            print(f"正在写入评论: {comment_id}")

        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
|
||||
|
||||
|
||||
# =================== 主程序入口 ===================
|
||||
if __name__ == '__main__':
    print("开始抓取评论...")

    try:
        comments = fetch_taobao_comments()
    except KeyboardInterrupt:
        # The scraper tells the user to stop it with Ctrl+C, so exit quietly
        # instead of dumping a traceback.
        comments = None
        print("用户中断脚本执行")

    # fetch_taobao_comments() saves each batch itself, so nothing is saved
    # again here; only the outcome is reported.
    if comments:
        print(f"成功获取 {len(comments)} 条评论")
    else:
        print("未获取到任何评论数据。")

    print("浏览器保持打开状态,用于调试。")
|
||||
144
jd/test.py
Normal file
144
jd/test.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||||
|
||||
# 设置浏览器路径(请替换为本地 Chrome 的实际安装路径)
|
||||
chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Configure and start a browser that uses the given executable.
options = ChromiumOptions()
options.set_browser_path(chrome_path)
page = ChromiumPage(options)


def _click_div_containing(label, missing_message):
    """Click the first <div> whose text contains *label*; print *missing_message* if absent."""
    element = page.ele(f'xpath=//div[contains(text(), "{label}")]')
    if not element:
        print(missing_message)
        return
    print(f"找到元素:{element.text}")
    element.click()
    time.sleep(random.uniform(2, 4))


def _is_comment_list_packet(packet):
    """Return True when a captured packet's post data mentions ``getCommentListPage``."""
    if not packet:
        return False
    try:
        return 'getCommentListPage' in packet.request.postData
    except TypeError:
        # postData is not a container (presumably a request without a body).
        return False


def _print_comments(body):
    """Print the comments that carry pictures; return True if a comment list was found."""
    result = body.get('result') if isinstance(body, dict) else None
    if not isinstance(result, dict) or 'floors' not in result:
        print("返回数据中不包含评论楼层信息。")
        return False

    floors = result['floors']
    # The comment list is read from the third floor (index 2).
    floor = floors[2] if isinstance(floors, (list, tuple)) and len(floors) > 2 else None
    if not isinstance(floor, dict) or not isinstance(floor.get('data'), list):
        print("未找到有效的评论数据或数据格式异常。")
        return False

    comments = floor['data']
    print(f"成功提取到 {len(comments)} 条评论:\n")

    for idx, comment in enumerate(comments, 1):
        comment_info = comment.get('commentInfo') or {}

        user_name = comment_info.get('userNickName', '匿名用户')
        comment_text = comment_info.get('commentData', '无评论内容')
        comment_id = comment_info.get('commentId', '评价ID')
        product_id = comment_info.get('productId', '商品ID')
        # Collect every picture URL of the comment.
        picture_list = comment_info.get('pictureInfoList') or []
        picture_urls = [pic.get('largePicURL') for pic in picture_list if pic.get('largePicURL')]

        if picture_urls:
            print(f"第 {idx} 条评论:")
            print(f"用户名:{user_name}")
            # This value is the comment ID; it used to be labelled as the score.
            print(f"评论ID:{comment_id}")
            print(f"商品ID:{product_id}")
            print(f"评论内容:{comment_text}")
            print(f"图片链接:{picture_urls}\n")
    return True


try:
    # Open the JD product page.
    page.get('https://item.jd.com/100104238904.html#crumb-wrap')

    while True:  # keep running until interrupted
        try:
            # Wait for the page to load (a captcha can be solved by hand here).
            time.sleep(random.uniform(1, 6))

            # Scroll the main page down a little.
            page.scroll.down(150)
            time.sleep(random.uniform(1, 3))

            _click_div_containing("买家赞不绝口", "未找到第一个元素")
            _click_div_containing("当前商品", "未找到第二个元素")
            _click_div_containing("视频", "未找到第三个元素")

            # Locate the scrollable popup area.
            popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
            if popup:
                # Start capturing the target API requests.
                page.listen.start('https://api.m.jd.com/client.action')

                max_retries = 5  # maximum number of unsuccessful attempts
                retry_count = 0
                success = False

                while retry_count < max_retries and not success:
                    # Scroll the popup by a random amount.
                    scroll_amount = random.randint(1000, 4000)
                    popup.scroll.down(scroll_amount)
                    print(f"弹窗向下滚动了 {scroll_amount} 像素")

                    # Pause after scrolling to mimic a real user.
                    time.sleep(random.uniform(1, 3))

                    # Wait for a new comment-list request.
                    resp = page.listen.wait(timeout=5)

                    if _is_comment_list_packet(resp):
                        print("成功捕获到新的评论数据请求!")
                        try:
                            # Record success so the loop stops and the failure
                            # message below is not printed after a good read.
                            success = _print_comments(resp.response.body)
                        except Exception as e:
                            print("处理评论数据时出错:", e)
                    else:
                        print("未捕获到新的评论数据,继续滚动...")

                    if not success:
                        retry_count += 1

                if not success:
                    print("多次滚动后仍未获取到有效评论数据,请检查页面结构或网络请求状态。")
            else:
                print("未找到弹窗元素")

            # Random wait before the next round to mimic a real user.
            wait_time = random.uniform(5, 10)
            print(f"等待 {wait_time:.2f} 秒后继续下一轮操作...")
            time.sleep(wait_time)

        except Exception as inner_e:
            print(f"内部循环发生错误:{inner_e}")
            time.sleep(5)  # wait briefly after an error before continuing

except KeyboardInterrupt:
    print("用户中断脚本执行")
except Exception as outer_e:
    print(f"外部异常:{outer_e}")
finally:
    print("正在关闭浏览器...")
    # NOTE(review): the call to page.quit() was disabled in the original, so
    # the browser stays open despite the message above; confirm the intent.
|
||||
Reference in New Issue
Block a user