自学内容网

Python 爬虫

爬取豆瓣电影前250

import random
import urllib.request
from bs4 import BeautifulSoup
import csv
from time import sleep
import re  # 导入正则表达式模块


# Fallback proxy in "host:port" form. NOTE(review): main() immediately picks a
# random proxy from ips.txt and rebinds a local `ip`, so this module-level
# value is effectively unused — confirm before removing.
ip = '218.95.39.56:11747'

def main(url, headers, writer):
    """Scrape one listing page of the Douban Top-250 and append rows to CSV.

    Args:
        url: listing-page URL (25 movies per page, paginated via ``start=``).
        headers: HTTP header dict; must carry a browser-like User-Agent or
            Douban rejects the request.
        writer: an open ``csv.DictWriter`` whose fieldnames match the row
            dict written below.

    Side effects: installs a global urllib opener routed through a random
    proxy read from ``ips.txt``, issues one listing request plus one detail
    request per movie, writes one CSV row per movie, and prints progress.
    """
    # Third-party deps used only for the per-movie detail pages; imported
    # once per call instead of once per movie as before.
    import requests
    from lxml import etree

    try:
        def _load_proxies(file_path):
            # One "host:port" per line; blank lines are skipped.
            with open(file_path, 'r') as file:
                return [line.strip() for line in file if line.strip()]

        ips = _load_proxies('ips.txt')
        ip = random.choice(ips)
        print(f'Randomly selected proxy: {ip}')

        # Route all urllib requests (the listing page) through the proxy.
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({
                'http': 'https://' + ip,
                'https': 'https://' + ip,
            })
        )
        urllib.request.install_opener(opener)

        # Same proxy for the `requests`-based detail-page fetches.
        proxy = {
            'http': 'http://' + ip,
            'https': 'http://' + ip,
        }

        page = urllib.request.urlopen(urllib.request.Request(url, headers=headers))
        soup = BeautifulSoup(page.read(), "html.parser")

        print('爬取豆瓣电影250: \n')

        for tag in soup.find_all(attrs={"class": "item"}):
            # Reset per-movie defaults so a failed regex can never leak the
            # previous movie's value into this row (the old code used
            # `global` variables, which did exactly that).
            director = actor = country = year = ''
            language = "英语"
            length = "123"
            story = "剧情"

            # Rank number shown on the listing page.
            num = tag.find('em').get_text()
            # First title entry is the Chinese title.
            zwname = tag.find_all(attrs={"class": "title"})[0].get_text()
            # Score.
            rating_num = tag.find(attrs={"class": "rating_num"}).get_text()
            # Rating count, e.g. "123456人评价" — keep the digits only.
            rating_people = tag.find_all(attrs={"class": "star"})[0].find_all('span')[-1].get_text().strip('()')
            rating_people_num = re.findall(r'\d+', rating_people)[0]

            # Short one-line quote; not every movie has one.
            inq = tag.find(attrs={"class": "inq"})
            content = inq.get_text() if inq else ''
            print('[影评]', content)

            # The "bd" div holds director/actor on one line and
            # year / country / genre on the next.
            year_text = tag.find(attrs={"class": "bd"}).get_text()
            lines = year_text.split('\n')
            print(lines[3])

            director_x = re.search(r'导演:\s*(.+?)\s*\s', lines[2])
            if director_x:
                director = director_x.group(1)
            actor_x = re.search(r'主演:\s*(.+?)\s*\s', lines[2])
            if actor_x:
                actor = actor_x.group(1)
            year_x = re.search(r'\d{4}', year_text)
            if year_x:
                year = year_x.group(0)
            country_x = re.search(r'/\s*([^/]+)\s*/', lines[3])
            if country_x:
                country = country_x.group(1)

            # Link to the movie's own detail page.
            url_movie = tag.find(attrs={"class": "hd"}).a.attrs['href']
            print(url_movie)

            req = requests.get(url_movie, headers=headers, proxies=proxy)
            req.encoding = 'utf-8'
            if req.status_code == 200:
                html = req.text
                # Language / runtime / genre are scraped from the raw HTML
                # with regexes because they sit in a plain-text info block.
                language_match = re.search(r'<span class=\"pl\">语言:</span> (.*?)<br/>', html)
                if language_match:
                    language = language_match.group(1)
                    print(f"语言: {language}")
                length_match = re.search(r'<span property=\"v:runtime\" content=\"(\d+)\">', html)
                if length_match:
                    length = length_match.group(1)
                    print(length)
                story_match = re.search(
                    r'<span property=\"v:genre\">剧情</span> / <span property=\"v:genre\">(.*?)<', html)
                if story_match:
                    story = story_match.group(1)
                    print(story)

                # Synopsis via XPath; guard against an empty result — the old
                # code indexed [0] unconditionally, and the resulting
                # IndexError aborted the rest of the page via the catch-all.
                tree = etree.HTML(html)
                texts = tree.xpath('//*[@id="link-report-intra"]/span/text()')
                cleaned_content = [text.strip() for text in texts if text.strip()]
                print(cleaned_content)
                if cleaned_content:
                    content = cleaned_content[0]

            writer.writerow({'序号': num, '电影名': zwname, '导演': director, '主演': actor,
                             '类型': story, '制片国家': country, '语言': language,
                             '片长': length, '评分': rating_num, '评论人数': rating_people_num, '上映年份': year, '简介': content})
            print(f"""序号:{num}, 电影名:{zwname}, 导演:{director}, 主演:{actor}, 简介:{content}, 
            类型:{story}, 制片国家:{country}, 语言:{language}, 片长:{length}, 评分:{rating_num}, 
            评论人数:{rating_people_num}, 上映年份:{year}""")

    except Exception as e:
        # Page-level best-effort: log and move on so one bad page doesn't
        # kill the whole crawl.
        print(f"发生错误: {e}")


if __name__ == '__main__':
    # Browser-like User-Agent so Douban serves the page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }

    # Open the CSV once and stream rows into it as pages are scraped.
    with open('movie1.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['序号', '电影名', '导演', '主演', '类型', '制片国家', '语言', '片长', '评分', '评论人数', '上映年份', '简介']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Pagination: 25 movies per page, so the full Top-250 needs 10 pages.
        # (The original looped range(0, 4) and only fetched 100 movies.)
        for i in range(0, 10):
            print(f'页码 {i}')
            num = i * 25  # URL offset grows by 25 per page
            url = f'https://movie.douban.com/top250?start={num}&filter='
            main(url, headers, writer)
            sleep(5 + random.random())  # random delay to avoid an IP ban

在这里插入图片描述


原文地址:https://blog.csdn.net/qq_52331221/article/details/145109557

免责声明:本站文章内容转载自网络资源,如侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!