自学内容网

Python爬虫入门

豆瓣电影 Top 250 榜单爬取:

直接上代码:

import requests
import re


def top250_crawer(url, sum):
    """Fetch one Douban Top 250 page and print one line per movie.

    Each printed line has the form ``<rank>.<title>,<country>,<year>``.

    Args:
        url: Address of the list page to download.
        sum: Number of entries already printed on earlier pages; ranks on
            this page start at ``sum + 1``. (The name shadows the builtin
            but is kept so existing callers keep working.)

    Returns:
        None. Results are written to stdout. If the request fails, an
        error message is printed and the function returns early.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise on a non-success HTTP status
    except requests.RequestException as e:
        print(f"请求出现问题: {e}")
        return
    html = response.text

    # Title spans containing the '&nbsp;/&nbsp' separator are dropped so that
    # only one title per movie remains (presumably the alternate-language
    # titles carry that prefix -- confirm against the live page).
    titles = [
        t
        for t in re.findall('<span class="title">(.*?)</span>', html, re.S)
        if '&nbsp;/&nbsp' not in t
    ]

    # Each "<br>...</p>" fragment is expected to hold fields separated by
    # '&nbsp;/&nbsp;'; the first two are printed as year and country.
    details = []
    for fragment in re.findall('<br>(.*?)</p>', html, re.S):
        fields = fragment.replace(' ', '').replace('\n', '').split('&nbsp;/&nbsp;')
        if len(fields) < 2:
            # Fragment lacks the separator; skip it instead of raising
            # IndexError on the missing second field.
            continue
        details.append((fields[0], fields[1]))

    # zip() stops at the shorter list, so a count mismatch between titles and
    # detail fragments can no longer raise IndexError.
    for rank, (name, (year, country)) in enumerate(zip(titles, details), start=sum + 1):
        print(f"{rank}.{name},{country},{year}")


url = 'https://movie.douban.com/top250'
sum = 0
# Walk through 10 pages of 25 entries each, 250 results in total.
for _ in range(10):
    # The first page is requested with the bare URL; every later page adds
    # the running offset as the "start" query parameter.
    target = url if sum == 0 else url + '?start=' + str(sum) + '&filter='
    top250_crawer(target, sum)
    sum += 25

原文地址:https://blog.csdn.net/yzx991013/article/details/145212727

免责声明:本站文章内容转载自网络资源,如侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!