自学内容网 自学内容网

requests案例——爬取微博的一级和二级评论

案例需求:

1.爬取该网页下的一级评论和二级评论

https://m.weibo.cn/detail/4813628149072458

36b59627ef6044ae865f383c66846596.png

2.

3.

分析:

1.找到一级评论请求地址

a18642a4d4194091933c6920aa42e05c.png

url请求地址:

43869e32bfe541d8a477c62fd2407f1e.png

fec3e93cf1fd4c6a9a011715029c2479.png

二级评论url请求地址:

cf1b4ceba6c54c72bbf9dd7a154c92c2.png 

分析翻页参数——可知第一页请求不带 max_id,从第二页开始才需要携带翻页参数 max_id

从而得到:

移动端链接:https://m.weibo.cn/detail/4813628149072458
一级评论接口:https://m.weibo.cn/comments/hotflow?id=4813628149072458&mid=4813628149072458&max_id_type=0
    --参数:
        id: 4813628149072458
        mid: 4813628149072458
        max_id_type: 0
        max_id: 13883307764046392 #翻页参数——从第二页请求开始携带,取值来自上一页一级评论接口返回的 max_id

二级评论接口:https://m.weibo.cn/comments/hotFlowChild?cid=4813628329693567&max_id=0&max_id_type=0
    ---参数:
            cid: 4813628329693567
            max_id: 0 #二级翻页参数(在上一页评论接口中可以找到)
            max_id_type: 0

获取一级评论

import requests
from jsonpath import jsonpath
import re


class Weibo:
    """Print the first-level comments of a single Weibo post.

    The post is identified by the ``id`` / ``mid`` values in ``one_data`` and
    is fetched from the mobile ``hotflow`` comment endpoint.
    """

    # Seconds to wait for the server. requests applies no timeout by default,
    # so without this a stalled connection would hang the crawler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Endpoint that serves first-level comments.
        self.one_url = 'https://m.weibo.cn/comments/hotflow'
        # Query parameters; requests omits parameters whose value is None, so
        # the first page is requested without ``max_id``.
        self.one_data = {
            'id': '4813628149072458',
            'mid': '4813628149072458',
            'max_id_type': '0',
            'max_id': None
        }
        # Mobile browser User-Agent, matching the mobile site being queried.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
        }

    @staticmethod
    def _clean_text(raw):
        """Return *raw* with HTML tags stripped; a missing text becomes ''."""
        return re.sub('<.*?>', '', raw or '')

    def get_one_data(self):
        """Fetch one page of first-level comments and print author and text.

        Returns ``None``. Prints a message and returns early when the response
        body is not valid JSON.
        """
        response = requests.get(
            url=self.one_url,
            headers=self.headers,
            params=self.one_data,
            timeout=self.REQUEST_TIMEOUT,
        )
        try:
            json_data = response.json()
        except ValueError:
            # ValueError is the common base class of the JSON decode errors
            # raised by json, simplejson and requests.
            print("解析 JSON 失败")
            return

        # jsonpath returns False (not an empty list) when nothing matches, so
        # normalise it before iterating. Selecting whole comment objects keeps
        # each name paired with its own text.
        comments = jsonpath(json_data, '$..data[0:18]') or []
        for comment in comments:
            if not isinstance(comment, dict):
                continue
            author = comment.get('user') or {}
            print('-------一级评论-------')
            print(author.get('screen_name', ''))
            print(self._clean_text(comment.get('text')))

    def man(self):
        """Entry point: crawl and print the first page of comments."""
        self.get_one_data()


if __name__ == '__main__':
    # Script entry point: build the crawler and run it.
    Weibo().man()

79ee1f73d2294761b0448f7895c7cade.png

一级翻页

# Read the paging cursor for the next page of first-level comments.
# NOTE(review): the hotflow payload appears to nest the cursor under the outer
# "data" object ({"ok": 1, "data": {"data": [...], "max_id": ...}}), so look
# there first and fall back to the top level -- confirm against a live response.
page_info = json_data.get('data')
max_id = page_info.get('max_id') if isinstance(page_info, dict) else None
max_id = max_id or json_data.get('max_id')
if max_id:
    self.one_data['max_id'] = max_id
    time.sleep(random.uniform(1, 3))  # random pause to reduce the risk of being blocked
    self.get_one_data(self.one_url, self.one_data)
else:
    print('该用户的一级评论已经爬完')

获取二级评论

# 获取二级评论
def get_two_data(self):
    """Fetch one page of second-level (reply) comments and print them.

    Reads ``self.two_url``, ``self.headers`` and ``self.two_data``. For every
    reply it prints the author's screen name and the text with HTML tags
    removed. Returns ``None``; prints a message and returns early when the
    response body is not valid JSON.
    """
    # requests applies no timeout by default; set one so a stalled connection
    # cannot hang the crawler.
    response = requests.get(url=self.two_url, headers=self.headers, params=self.two_data, timeout=10)
    try:
        json_data = response.json()
    except ValueError:
        # ValueError is the common base class of the JSON decode errors raised
        # by json, simplejson and requests, so this works with any backend.
        print("解析 JSON 失败")
        return

    replies = json_data.get('data') if isinstance(json_data, dict) else None
    if not isinstance(replies, list):
        # No reply list in the payload: nothing to print.
        return

    for reply in replies:
        if not isinstance(reply, dict):
            continue
        # A reply without text is treated as empty instead of crashing re.sub.
        content = re.sub('<.*?>', '', reply.get('text') or '')
        print('-------二级评论-------')
        print((reply.get('user') or {}).get('screen_name', ''))
        print(content)

示例代码:

import requests
import json
import re
import sys
import time
import random
from jsonpath import jsonpath


class Weibo:
    """Crawl and print first- and second-level comments of one Weibo post.

    First-level comments come from the mobile ``hotflow`` endpoint and are
    paged with ``max_id``. For each first-level comment one page of replies is
    fetched from ``hotFlowChild`` using the comment's ``rootid`` as ``cid``.
    """

    # Seconds to wait for the server. requests applies no timeout by default,
    # so without this a stalled connection would hang the crawler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.one_url = 'https://m.weibo.cn/comments/hotflow'  # first-level comments
        self.two_url = "https://m.weibo.cn/comments/hotFlowChild"  # second-level comments
        # requests omits parameters whose value is None, so the first page is
        # requested without ``max_id``.
        self.one_data = {
            'id': '4813628149072458',
            'mid': '4813628149072458',
            'max_id_type': '0',
            'max_id': None
        }
        self.two_data = {
            'cid': '4813628329693567',
            'max_id': '0',
            'max_id_type': '0'
        }
        # SECURITY NOTE(review): the Cookie below is a hard-coded login session
        # committed to source. It also contains spaces around "=", "/", "+",
        # "-" and "%", which looks like damage from an auto-formatter. It is
        # kept byte-for-byte here; load it from configuration or the
        # environment instead.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
            'Cookie': '__bid_n = 188b7ef5179e6369a94207;FPTOKEN = crNJdsvZwwGF7dp2IPNpPQfcb8QXfJcGqmdrORkyZDx1InnMS5HmnAi2IuK / +GpaFRGA9SsxzhVt1I6QlRCtoWJsIFKbbc8 // DeUCm0HH9ux6X85QM + Z3WBbGns26hiiQngHN + M5q + ErW1eifOLk + +KasqWhbWrd12AHMF7vC / 3qXfLfRN60SEVuv1ZGCnrBIc3lN1sba2e0UzVGEzYejWmJ / yzCNkUZ1qZHPSNvEzGfJlYhJGxDiyyBInzi / cTWyk1g988msn9UMRE3GBjWIcZXsqOl0HbbsOz5AYS + n1b86VqgY4eVk3EB / Dr9Fgkl2UBcstP5NcEJ9MXcHyZDfRsXbz / rGPbnYsrT7iZxjwGn4gnTqnzQ / HZsyaVvGf1gxf3oEB3SwBHvlflbg7KxXQ == | G4pgWxSL7Ti7lQoJilvXBEq0Vs37iI4r + CsQg3BKI2U = | 10 | 6f811a8a83e72fbe3ec90c6845e372b0;_T_WM = 40779448236;WEIBOCN_FROM = 1110006030;SUBP = 0033WrSXqPxfM725Ws9jqgMF55529P9D9WF_IbR1VnBGsR3IZdB7J.Ey5JpX5K - hUgL.FoMNShMEeK2E1Kq2dJLoIEnLxK - L1hqL1K.LxKMLB.zL1K.LxKnL1hMLB - 2LxK - LBoMLBo27S05N;MLOGIN = 1;SCF = AkYYFk70crAYROKfk6SUopCK_fVD7Tu5nSTQdkf6622fzN_KAP3J3MPopagzOTf3wUrTpRLoS2QmwjIeNQ2nik4.;SUB = _2A25JxhX3DeRhGeFJ71UT8S_OwjqIHXVrSLu_rDV6PUJbktAGLRehkW1Nf8RTnplqgytu5YRphj7 - Op57y6XIcQDE;SSOLoginState = 1690461607;ALF = 1693053607;XSRF - TOKEN = c16fc6;M_WEIBOCN_PARAMS = oid % 3D4813628149072458 % 26luicode % 3D20000061 % 26lfid % 3D4813628149072458 % 26uicode % 3D20000061 % 26fid % 3D4813628149072458;mweibo_short_token = f67e070b3b'
        }

    @staticmethod
    def _clean_text(raw):
        """Return *raw* with HTML tags stripped; a missing text becomes ''."""
        return re.sub('<.*?>', '', raw or '')

    @staticmethod
    def _next_max_id(json_data):
        """Return the paging cursor of a decoded response, or None if absent.

        NOTE(review): the hotflow payload appears to nest the cursor under the
        outer "data" object, while the original code read it from the top
        level. Both places are checked, nested first -- confirm against a live
        response.
        """
        if not isinstance(json_data, dict):
            return None
        inner = json_data.get('data')
        if isinstance(inner, dict) and inner.get('max_id'):
            return inner['max_id']
        return json_data.get('max_id') or None

    def _fetch_json(self, url, params):
        """GET *url* and return the decoded JSON object, or None on failure."""
        response = requests.get(
            url=url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        try:
            json_data = response.json()
        except ValueError:
            # ValueError is the common base class of the JSON decode errors
            # raised by json, simplejson and requests.
            print("解析 JSON 失败")
            return None
        if not isinstance(json_data, dict):
            print("解析 JSON 失败")
            return None
        return json_data

    def get_one_data(self, url, data):
        """Crawl every page of first-level comments, with one page of replies each.

        Args:
            url: first-level comment endpoint.
            data: query parameters; ``data['max_id']`` is updated in place as
                paging advances (``man`` passes ``self.one_data``).

        Returns ``None``. Stops when the response is not valid JSON, when no
        further cursor is returned, or when the cursor stops advancing.
        """
        # Iterate instead of recursing once per page, so long comment threads
        # cannot exhaust the recursion limit.
        while True:
            json_data = self._fetch_json(url, data)
            if json_data is None:
                return

            # jsonpath returns False (not an empty list) when nothing matches.
            # Selecting whole comment objects keeps name, text and rootid
            # aligned, and applies the same [0:18] slice to all three fields.
            comments = jsonpath(json_data, '$..data[0:18]') or []
            for comment in comments:
                if not isinstance(comment, dict):
                    continue
                author = comment.get('user') or {}
                cid = comment.get('rootid')
                print('-------一级评论-------')
                print(author.get('screen_name', ''))
                print(self._clean_text(comment.get('text')))
                print('跟评ID', cid)
                if cid:
                    self.two_data['cid'] = cid
                    self.get_two_data()

            max_id = self._next_max_id(json_data)
            if not max_id or max_id == data.get('max_id'):
                # No cursor, or the server repeated the previous one: finished.
                print('该用户的一级评论已经爬完')
                return
            data['max_id'] = max_id
            time.sleep(random.uniform(1, 3))  # random pause to reduce the risk of being blocked

    def get_two_data(self):
        """Fetch one page of replies for ``self.two_data['cid']`` and print them.

        Returns ``None``; returns early when the response is not valid JSON or
        carries no reply list.
        """
        json_data = self._fetch_json(self.two_url, self.two_data)
        if json_data is None:
            return

        replies = json_data.get('data')
        if not isinstance(replies, list):
            return

        for reply in replies:
            if not isinstance(reply, dict):
                continue
            print('-------二级评论-------')
            print((reply.get('user') or {}).get('screen_name', ''))
            print(self._clean_text(reply.get('text')))

    def man(self):
        """Entry point: crawl the configured post starting from its first page."""
        self.get_one_data(self.one_url, self.one_data)

if __name__ == '__main__':
    # Script entry point: build the crawler and run it.
    Weibo().man()

f814c8e7258f483e93197f56f59b379f.png

 


原文地址:https://blog.csdn.net/qq_53256193/article/details/142714260

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!