1

2025-08-13 16:02:21 +08:00
commit fa25bfd784
11 changed files with 3403 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
 # 默认忽略的文件
 /shelf/
 /workspace.xml
 # 基于编辑器的 HTTP 客户端请求
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,17 @@
 <component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="unused" enabled="true" level="ERROR" enabled_by_default="true" editorAttributes="ERRORS_ATTRIBUTES" checkParameterExcludingHierarchy="false">
      <option name="LOCAL_VARIABLE" value="true" />
      <option name="FIELD" value="true" />
      <option name="METHOD" value="true" />
      <option name="CLASS" value="true" />
      <option name="PARAMETER" value="true" />
      <option name="REPORT_PARAMETER_FOR_PUBLIC_METHODS" value="true" />
      <option name="ADD_MAINS_TO_ENTRIES" value="true" />
      <option name="ADD_APPLET_TO_ENTRIES" value="true" />
      <option name="ADD_SERVLET_TO_ENTRIES" value="true" />
      <option name="ADD_NONJAVA_TO_ENTRIES" value="true" />
    </inspection_tool>
  </profile>
 </component>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/jdpl.iml
+++ b/.idea/jdpl.iml
@@ -0,0 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.12 (jdpl)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (jdpl)" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/jdpl.iml" filepath="$PROJECT_DIR$/.idea/jdpl.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
 </project>
--- a/jd/jd.py
+++ b/jd/jd.py
@@ -0,0 +1,262 @@
 import time
 import random
 import json
 import threading
 from flask import Flask, request, jsonify
 from DrissionPage import ChromiumPage, ChromiumOptions
 from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 # =================== Configuration ===================
 # Browser executable path (adjust to the local installation)
 CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
 # MySQL connection settings
 # NOTE(review): credentials are hard-coded in source; consider loading them from the environment.
 db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # replace with your own password
    "database": "jd"
 }
 # Flask application instance
 app = Flask(__name__)
 # Lock held by the /fetch_comments handler while a scrape is running
 fetch_lock = threading.Lock()
 # SQLAlchemy engine, session factory and declarative base built from db_config
 db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
 engine = create_engine(db_url, echo=False)
 Session = sessionmaker(bind=engine)
 Base = declarative_base()
 # ORM model for a scraped JD review
 class Comment(Base):
    """One JD product review stored in the ``comments`` table."""
    __tablename__ = 'comments'
    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # JSON-encoded list of picture URLs
    # The defaults are callables so the timestamp is computed for each INSERT;
    # a plain strftime() result would be frozen at module import time.
    created_at = Column(DateTime, default=lambda: time.strftime('%Y-%m-%d %H:%M:%S'))
    comment_date = Column(DateTime, default=lambda: time.strftime('%Y-%m-%d %H:%M:%S'))
 # Create the tables if they do not exist yet
 Base.metadata.create_all(engine)
 # =================== Core scraper functions ===================
 # Shared browser instance; stays None until get_global_browser() creates it
 global_page = None
 def get_global_browser():
    """Return the shared ChromiumPage, creating it on the first call."""
    global global_page
    if global_page is not None:
        return global_page
    # First use: start Chrome from the configured executable path.
    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_page = ChromiumPage(browser_options)
    return global_page
 def _jd_comment_in_db(comment_id):
    """Return True if a row with this comment_id already exists in the comments table."""
    session = Session()
    try:
        return session.query(Comment).filter_by(comment_id=comment_id).first() is not None
    finally:
        # Release the connection even when the query raises.
        session.close()
 def _jd_comment_floor(json_data):
    """Return the third entry of ``result.floors`` from a response body, or None if it is absent."""
    if not isinstance(json_data, dict):
        return None
    result = json_data.get('result')
    if not isinstance(result, dict):
        return None
    floors = result.get('floors')
    if not isinstance(floors, (list, tuple)) or len(floors) < 3:
        return None
    return floors[2]
 def fetch_jd_comments(product_id):
    """Collect up to ten new five-star reviews for a JD product.

    Opens the product page in the shared browser, clicks through to the
    review popup, then scrolls the popup while listening for
    ``getCommentListPage`` responses. A review is kept when it has a comment
    id, its score is 5, and it is neither already collected in this call nor
    already stored in the database.

    Args:
        product_id: JD product id used to build the item page URL.

    Returns:
        A list of at most ten raw comment dicts as delivered by the API;
        an empty list when the popup is not found or any error occurs.
    """
    page = get_global_browser()  # reuse the shared browser
    listening = False
    try:
        # Open the product page
        page.get(f'https://item.jd.com/{product_id}.html#crumb-wrap')
        time.sleep(random.uniform(5, 8))
        # Scroll the main page down a little
        page.scroll.down(150)
        time.sleep(random.uniform(3, 5))
        # Open the review area: try the first label, fall back to the second
        element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
        if not element1:
            element1 = page.ele('xpath=//div[contains(text(), "好评率")]')
        if element1:
            element1.click()
            time.sleep(random.uniform(3, 5))
        # Restrict reviews to the current product
        element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
        if element2:
            element2.click()
            time.sleep(random.uniform(3, 5))
        # Locate the scrollable popup area
        popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
        if not popup:
            return []
        # Switch to the video tab
        element3 = page.ele('xpath=//div[contains(text(), "视频")]')
        if element3:
            element3.click()
            time.sleep(random.uniform(3, 5))
        # Start capturing the API traffic
        page.listen.start('https://api.m.jd.com/client.action')
        listening = True
        wanted = 10  # number of new comments to collect
        max_retries = 10  # consecutive rounds without new data before giving up
        retry_count = 0
        new_comments = []  # new comments collected so far
        seen_ids = set()  # comment ids already accepted in this call
        while retry_count < max_retries and len(new_comments) < wanted:
            scroll_amount = random.randint(10000, 100000)
            popup.scroll.down(scroll_amount)
            print(f"弹窗向下滚动了 {scroll_amount} 像素")
            time.sleep(random.uniform(3, 5))
            resp = page.listen.wait(timeout=5)
            # postData may be missing; treat that as "not a comment request"
            post_data = (resp.request.postData or '') if resp else ''
            if 'getCommentListPage' not in post_data:
                print("未捕获到新的评论数据，继续滚动...")
                retry_count += 1
                continue
            comment_floor = _jd_comment_floor(resp.response.body)
            if comment_floor is None:
                print("返回数据结构异常")
                retry_count += 1
                continue
            batch_comments = comment_floor.get('data') if isinstance(comment_floor, dict) else None
            if not isinstance(batch_comments, list):
                print("未找到有效的评论列表")
                retry_count += 1
                continue
            # Keep only the comments of this batch that are actually new
            fresh_comments = []
            for comment in batch_comments:
                comment_info = comment.get('commentInfo', {})
                comment_id = comment_info.get('commentId', '')
                comment_score = comment_info.get('commentScore', '')
                if not comment_id:
                    continue
                # Five-star reviews only; str() tolerates a numeric score
                if str(comment_score) != '5':
                    print(f"跳过非五星评论：{comment_id}，评分为 {comment_score}")
                    continue
                # Skip comments already collected in this call or already stored
                if comment_id in seen_ids or _jd_comment_in_db(comment_id):
                    print(f"评论已存在：{comment_id}")
                    continue
                seen_ids.add(comment_id)
                fresh_comments.append(comment)
            if fresh_comments:
                print(f"本次获取到 {len(fresh_comments)} 条新评论")
                new_comments.extend(fresh_comments)
                retry_count = 0  # progress was made, reset the idle counter
            else:
                print("本次无新评论，继续滚动...")
                retry_count += 1
        print(f"共抓取到 {len(new_comments)} 条新评论（最多需要10条）")
        return new_comments[:wanted]
    except Exception as e:
        print("发生错误:", e)
        return []
    finally:
        # Stop the listener so repeated calls on the shared browser do not leave it running
        if listening:
            try:
                page.listen.stop()
            except Exception as stop_error:
                print("发生错误:", stop_error)
 # =================== Extract comment fields and persist them ===================
 def save_comments_to_db(product_id, comments):
    """Insert the given raw JD comments into the ``comments`` table.

    Comments without a comment id, or whose id is already stored, are
    skipped. All inserts share one transaction: on any error the whole
    batch is rolled back and the error is printed.

    Args:
        product_id: Fallback product id for comments that carry no
            ``productId`` of their own.
        comments: Iterable of raw comment dicts, each with a ``commentInfo`` dict.
    """
    session = Session()
    try:
        for comment in comments:
            comment_info = comment.get('commentInfo', {})
            comment_id = comment_info.get('commentId', '')
            # Nothing to deduplicate on without an id
            if not comment_id:
                print("跳过无 comment_id 的评论")
                continue
            # Skip comments that are already stored
            exists = session.query(Comment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在：{comment_id}")
                continue
            user_name = comment_info.get('userNickName', '匿名用户')
            comment_text = comment_info.get('commentData', '无评论内容')
            # Use a per-row name so one comment's productId cannot replace the
            # caller-supplied fallback for the comments that follow it.
            row_product_id = comment_info.get('productId') or product_id
            picture_list = comment_info.get('pictureInfoList') or []
            # None (rather than '') leaves the column to its default instead of
            # writing an empty string into a DateTime column.
            comment_date = comment_info.get('commentDate') or None
            picture_urls = [
                pic.get('largePicURL')
                for pic in picture_list
                if isinstance(pic, dict) and pic.get('largePicURL')
            ]
            new_comment = Comment(
                product_id=row_product_id,
                user_name=user_name,
                comment_text=comment_text,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date
            )
            session.add(new_comment)
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
 # =================== Flask API endpoint ===================
 @app.route('/fetch_comments', methods=['POST'])
 def fetch_comments():
    """Scrape and store reviews for the product named in the request.

    ``product_id`` is read from the query string or form body, falling back
    to a JSON body. Scraping and saving run under ``fetch_lock``.

    Returns:
        A JSON response with status 200 on success, 400 when ``product_id``
        is missing, 404 when no comments were obtained, and 500 on an
        unexpected error.
    """
    # request.values merges the query string and the form body
    product_id = request.values.get('product_id')
    if not product_id:
        payload = request.get_json(silent=True)
        if isinstance(payload, dict) and payload.get('product_id'):
            product_id = str(payload['product_id'])
    if not product_id:
        return jsonify({"error": "缺少 product_id"}), 400
    try:
        with fetch_lock:  # serialize scrapes: they share one browser
            comments = fetch_jd_comments(product_id)
            if not comments:
                return jsonify({"message": "未获取到评论数据"}), 404
            save_comments_to_db(product_id, comments)
        return jsonify({
            "message": f"成功保存 {len(comments)} 条评论",
            "product_id": product_id
        }), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500
 # =================== Start the service ===================
 if __name__ == '__main__':
    try:
        # Debug mode stays off: the server listens on all interfaces, and the
        # Werkzeug debugger must not be reachable from the network. Without the
        # reloader the app also runs in this process, so the cleanup below
        # sees the browser that was actually started.
        app.run(host='0.0.0.0', port=5000, debug=False)
    finally:
        if global_page is not None:
            global_page.quit()
            print("浏览器已关闭")
--- a/jd/response.txt
+++ b/jd/response.txt
--- a/jd/tb.py
+++ b/jd/tb.py
@@ -0,0 +1,198 @@
 import time
 import random
 import re
 import json
 import threading
 from DrissionPage import ChromiumPage, ChromiumOptions
 from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
 from sqlalchemy.orm import sessionmaker, declarative_base
 # =================== Configuration ===================
 # Browser executable path (adjust to the local installation)
 CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
 # Fixed product detail page URL
 TARGET_URL = "https://detail.tmall.com/item.htm?abbucket=1&id=735141569627&ltk2=1753093866331wbixx4bjhgx78xdlrpyxq&ns=1&priceTId=213e074d17530938630755244e1109&skuId=5667837161089&spm=a21n57.1.hoverItem.2&utparam=%7B%22aplus_abtest%22%3A%228c55408acbff553514850c28e821c3b4%22%7D&xxc=taobaoSearch"
 # MySQL connection settings
 # NOTE(review): credentials are hard-coded in source; consider loading them from the environment.
 db_config = {
    "host": "192.168.8.88",
    "port": 3306,
    "user": "root",
    "password": "mysql_7sjTXH",  # replace with your own password
    "database": "jd"
 }
 # SQLAlchemy engine, session factory and declarative base built from db_config
 db_url = f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4"
 engine = create_engine(db_url, echo=False)
 Session = sessionmaker(bind=engine)
 Base = declarative_base()
 # ORM model for a scraped Taobao/Tmall review
 class TaobaoComment(Base):
    """One Taobao/Tmall product review stored in the ``taobao_comments`` table."""
    __tablename__ = 'taobao_comments'
    id = Column(Integer, primary_key=True)
    product_id = Column(String(50), nullable=False)
    user_name = Column(String(100))
    comment_text = Column(Text)
    comment_id = Column(String(100))
    picture_urls = Column(Text)  # JSON-encoded list of picture URLs
    # The default is a callable so the timestamp is computed for each INSERT;
    # a plain strftime() result would be frozen at module import time.
    created_at = Column(DateTime, default=lambda: time.strftime('%Y-%m-%d %H:%M:%S'))
    comment_date = Column(DateTime)
 # Create the tables if they do not exist yet
 Base.metadata.create_all(engine)
 # Shared browser instance; stays None until get_global_taobao_browser() creates it
 global_taobao_page = None
 def get_global_taobao_browser():
    """Return the shared ChromiumPage, creating it on the first call."""
    global global_taobao_page
    if global_taobao_page is not None:
        return global_taobao_page
    # First use: start Chrome from the configured executable path.
    browser_options = ChromiumOptions()
    browser_options.set_browser_path(CHROME_PATH)
    global_taobao_page = ChromiumPage(browser_options)
    return global_taobao_page
 def extract_json_from_mtop(raw_response: str) -> dict:
    """Strip an mtop JSONP wrapper such as ``mtopjsonppcdetail18(...)`` and return the JSON object.

    Args:
        raw_response: Response body. Normally the JSONP text; a dict is
            returned unchanged, bytes are decoded as UTF-8, and any other
            type is treated as "no data".

    Returns:
        The decoded JSON object, or an empty dict when no wrapper is found
        or the payload is not valid JSON.
    """
    if isinstance(raw_response, dict):
        # Already decoded upstream; nothing to unwrap.
        return raw_response
    if isinstance(raw_response, (bytes, bytearray)):
        raw_response = raw_response.decode('utf-8', errors='replace')
    if not isinstance(raw_response, str):
        print("未找到有效的 JSON 数据")
        return {}
    # Accept any callback name starting with "mtopjsonp" (e.g. mtopjsonp3, mtopjsonppcdetail18).
    match = re.search(r'mtopjsonp\w*\s*\((\{.*\})\)', raw_response, re.DOTALL)
    if not match:
        print("未找到有效的 JSON 数据")
        return {}
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print("JSON 解析失败:", e)
        return {}
 def fetch_taobao_comments(max_idle_rounds=None):
    """Scroll the fixed product page and save each batch of new reviews as it arrives.

    Opens ``TARGET_URL``, clicks the "all reviews" element, then repeatedly
    scrolls to the bottom and waits for the rate-list API response. New
    comments (by ``id``) are passed to ``save_taobao_comments_to_db``
    immediately.

    Args:
        max_idle_rounds: Stop after this many consecutive rounds without new
            comments. ``None`` (the default) keeps scrolling until an error
            occurs or the process is interrupted.

    Returns:
        The comments collected (already saved) when the loop ends because of
        ``max_idle_rounds``; an empty list when the review button is missing
        or an error occurs.
    """
    page = get_global_taobao_browser()
    collected = []
    listening = False
    try:
        # Open the fixed product page and scroll the main page down a little
        page.get(TARGET_URL)
        page.scroll.down(150)
        # Click the "all reviews" element
        element = page.ele('xpath=//div[contains(text(), "全部评价")]')
        if not element:
            print("未找到评价按钮")
            return []
        element.click()
        time.sleep(3)
        # Start capturing the rate-list API traffic
        target_url = 'https://h5api.m.tmall.com/h5/mtop.taobao.rate.detaillist.get/6.0/?jsv=2.7.5'
        page.listen.start(target_url)
        listening = True
        seen_ids = set()
        idle_rounds = 0
        print("\n===============================")
        print("✅ 自动开始抓取评论，每次滚动到底部后自动保存新评论...")
        print("🚫 如需停止程序，请手动关闭浏览器或使用 Ctrl+C 中断")
        print("===============================\n")
        while max_idle_rounds is None or idle_rounds < max_idle_rounds:
            page.scroll.to_bottom()
            print("已向下滚动到底部，等待接口返回数据...")
            resp = page.listen.wait(timeout=10)
            fresh_comments = []
            if resp and target_url in resp.url:
                json_data = extract_json_from_mtop(resp.response.body)
                data = json_data.get('data') if isinstance(json_data, dict) else None
                rate_list = data.get('rateList') if isinstance(data, dict) else None
                if isinstance(rate_list, list):
                    for comment in rate_list:
                        comment_id = comment.get('id', '')
                        if not comment_id or comment_id in seen_ids:
                            continue
                        seen_ids.add(comment_id)
                        fresh_comments.append(comment)
                    if fresh_comments:
                        print(f"✅ 本次获取到 {len(fresh_comments)} 条新评论，正在保存...")
                        save_taobao_comments_to_db(fresh_comments)  # persist right away
                        collected.extend(fresh_comments)
                    else:
                        print("⚠️ 本次无新评论，可能已抓取完毕")
                else:
                    print("🚫 返回数据结构异常，无法提取评论")
            else:
                print("🚫 未捕获到新的评论数据，请确认是否已滚动并加载出更多评论")
            # Count consecutive rounds that produced nothing new
            idle_rounds = 0 if fresh_comments else idle_rounds + 1
        return collected
    except Exception as e:
        print("发生错误:", e)
        return []
    finally:
        # Stop the listener so it does not keep running on the shared browser
        if listening:
            try:
                page.listen.stop()
            except Exception as stop_error:
                print("发生错误:", stop_error)
 def save_taobao_comments_to_db(comments, product_id="735141569627"):
    """Insert the given raw Taobao comments into the ``taobao_comments`` table.

    Comments without an id, or whose id is already stored, are skipped. All
    inserts share one transaction: on any error the whole batch is rolled
    back and the error is printed.

    Args:
        comments: Iterable of raw comment dicts from the rate-list API.
        product_id: Product id stored with every row; defaults to the id
            that was previously hard-coded here.
    """
    session = Session()
    try:
        for comment in comments:
            comment_id = comment.get('id', '')
            # Nothing to deduplicate on without an id
            if not comment_id:
                continue
            feedback = comment.get('feedback', '无评论内容')
            user_nick = comment.get('userNick', '匿名用户')
            pic_list = comment.get('feedPicPathList') or []
            # None (rather than '') stores NULL instead of an empty string in a DateTime column
            comment_date = comment.get('feedbackDate') or None
            exists = session.query(TaobaoComment).filter_by(comment_id=comment_id).first()
            if exists:
                print(f"评论已存在：{comment_id}")
                continue
            # Keep only protocol-relative URLs; ignore entries that are not strings
            picture_urls = [url for url in pic_list if isinstance(url, str) and url.startswith('//')]
            new_comment = TaobaoComment(
                product_id=product_id,
                user_name=user_nick,
                comment_text=feedback,
                comment_id=comment_id,
                picture_urls=json.dumps(picture_urls, ensure_ascii=False),
                comment_date=comment_date
            )
            session.add(new_comment)
            print(f"正在写入评论: {comment_id}")
        session.commit()
    except Exception as e:
        session.rollback()
        print("保存失败:", e)
    finally:
        session.close()
 # =================== Script entry point ===================
 if __name__ == '__main__':
    print("开始抓取评论...")
    # Run the scraper; a non-empty result is passed on to the database writer
    comments = fetch_taobao_comments()
    if comments:
        print(f"成功获取 {len(comments)} 条评论，正在保存到数据库...")
        save_taobao_comments_to_db(comments)
    else:
        print("未获取到任何评论数据。")
    # The browser is not quit here
    print("浏览器保持打开状态，用于调试。")
--- a/jd/test.py
+++ b/jd/test.py
@@ -0,0 +1,144 @@
 import time
 import random
 import json
 from DrissionPage import ChromiumPage, ChromiumOptions
 # Path to the local Chrome executable (adjust to the actual installation)
 chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
 # Configure and start a browser that uses that executable
 options = ChromiumOptions()
 options.set_browser_path(chrome_path)
 page = ChromiumPage(options)
 try:
    # Open the JD product page
    page.get('https://item.jd.com/100104238904.html#crumb-wrap')
    while True:  # keep running rounds until interrupted
        try:
            # Give the page time to load (a captcha can be solved by hand here)
            time.sleep(random.uniform(1, 6))
            # Scroll the main page down a little
            page.scroll.down(150)
            time.sleep(random.uniform(1, 3))
            # Click the first label to open the review area
            element1 = page.ele('xpath=//div[contains(text(), "买家赞不绝口")]')
            if element1:
                print(f"找到元素：{element1.text}")
                element1.click()
                time.sleep(random.uniform(2, 4))
            else:
                print("未找到第一个元素")
            # Restrict reviews to the current product
            element2 = page.ele('xpath=//div[contains(text(), "当前商品")]')
            if element2:
                print(f"找到元素：{element2.text}")
                element2.click()
                time.sleep(random.uniform(2, 4))
            else:
                print("未找到第二个元素")
            # Switch to the picture/video tab
            element3 = page.ele('xpath=//div[contains(text(), "视频")]')
            if element3:
                print(f"找到元素：{element3.text}")
                element3.click()
                time.sleep(random.uniform(2, 4))
            else:
                print("未找到第三个元素")
            # Locate the scrollable popup area
            popup = page.ele('xpath=//*[@id="rateList"]/div/div[3]')
            if popup:
                # Start capturing the API traffic
                page.listen.start('https://api.m.jd.com/client.action')
                # Scroll until one comment batch is parsed or the retry budget is spent
                max_retries = 5
                retry_count = 0
                success = False
                while retry_count < max_retries and not success:
                    scroll_amount = random.randint(1000, 4000)
                    popup.scroll.down(scroll_amount)
                    print(f"弹窗向下滚动了 {scroll_amount} 像素")
                    # Pause after scrolling to look like a real user
                    time.sleep(random.uniform(1, 3))
                    resp = page.listen.wait(timeout=5)
                    # postData may be missing; treat that as "not a comment request"
                    post_data = (resp.request.postData or '') if resp else ''
                    if 'getCommentListPage' not in post_data:
                        print("未捕获到新的评论数据，继续滚动...")
                        retry_count += 1
                        continue
                    print("成功捕获到新的评论数据请求！")
                    try:
                        json_data = resp.response.body
                        if isinstance(json_data, dict) and 'result' in json_data and 'floors' in json_data['result']:
                            floors = json_data['result']['floors']
                            # The comment floor is the third entry (index 2)
                            comment_floor = floors[2] if len(floors) > 2 else {}
                            if 'data' in comment_floor and isinstance(comment_floor['data'], list):
                                comments = comment_floor['data']
                                success = True  # a batch was parsed; end the scroll loop
                                print(f"成功提取到 {len(comments)} 条评论：\n")
                                for idx, comment in enumerate(comments, 1):
                                    comment_info = comment.get('commentInfo', {})
                                    user_name = comment_info.get('userNickName', '匿名用户')
                                    comment_text = comment_info.get('commentData', '无评论内容')
                                    commentId = comment_info.get('commentId', '评价ID')
                                    productId = comment_info.get('productId', '商品ID')
                                    # Collect every picture URL of the comment
                                    picture_list = comment_info.get('pictureInfoList', [])
                                    picture_urls = [pic.get('largePicURL') for pic in picture_list if pic.get('largePicURL')]
                                    # Only comments with pictures are printed
                                    if picture_urls:
                                        print(f"第 {idx} 条评论：")
                                        print(f"用户名：{user_name}")
                                        print(f"评论ID：{commentId}")
                                        print(f"商品ID：{productId}")
                                        print(f"评论内容：{comment_text}")
                                        print(f"图片链接：{picture_urls}\n")
                            else:
                                print("未找到有效的评论数据或数据格式异常。")
                        else:
                            print("返回数据中不包含评论楼层信息。")
                    except Exception as e:
                        print("处理评论数据时出错：", e)
                    if not success:
                        # An unusable response also consumes a retry, so the loop stays bounded
                        retry_count += 1
                if not success:
                    print("多次滚动后仍未获取到有效评论数据，请检查页面结构或网络请求状态。")
            else:
                print("未找到弹窗元素")
            # Random pause between rounds to look like a real user
            wait_time = random.uniform(5, 10)
            print(f"等待 {wait_time:.2f} 秒后继续下一轮操作...")
            time.sleep(wait_time)
        except Exception as inner_e:
            print(f"内部循环发生错误：{inner_e}")
            time.sleep(5)  # short pause before the next round
 except KeyboardInterrupt:
    print("用户中断脚本执行")
 except Exception as outer_e:
    print(f"外部异常：{outer_e}")
 finally:
    # NOTE(review): the quit call is disabled, so the browser stays open despite the message
    print("正在关闭浏览器...")
    # page.quit()