Python Web Scraper
Scraping the Douban Top 250 Movies
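The script below loops over the Douban Top 250 list pages, picks a random proxy from a local ips.txt file for each page, parses the list with BeautifulSoup, follows every movie's link to pull language, runtime, genre, and synopsis from its detail page, and writes one combined record per movie to movie1.csv. As written it only fetches the first four pages (100 movies); widen the page loop to cover all 250.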
import random
import urllib.request
from bs4 import BeautifulSoup
import csv
from time import sleep
import re  # regular expressions, used for all the text extraction below
import requests  # moved up from inside the scraping loop
from lxml import etree  # moved up from inside the scraping loop

# Fallback proxy address; replaced by a random pick from ips.txt on each call
ip = '218.95.39.56:11747'
def main(url, headers, writer):
    # Declared global so the last parsed values survive when a pattern fails to match
    global language, director, actor, length
    try:
        # Proxy IPs are expected in a file named ips.txt, one IP:port per line
        def load_proxies_from_file(file_path):
            with open(file_path, 'r') as file:
                ips = [line.strip() for line in file.readlines()]
            return ips

        def get_random_proxy(ips):
            return random.choice(ips)

        # Replace with the path to your own proxy list
        file_path = 'ips.txt'
        ips = load_proxies_from_file(file_path)
        random_ip = get_random_proxy(ips)
        print(f'Randomly selected proxy: {random_ip}')
        ip = random_ip
        # Configure the proxy for urllib (proxy servers are normally addressed
        # with an http:// scheme even when tunnelling HTTPS traffic)
        proxy_url = 'http://' + ip
        # Build an opener with a proxy handler covering both schemes
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({
                'http': proxy_url,
                'https': proxy_url,
            })
        )
        # install_opener makes this the process-wide default for urlopen
        urllib.request.install_opener(opener)
        # The same proxy again, in the dict format that requests expects
        proxies = {
            'http': 'http://' + ip,
            'https': 'http://' + ip,
        }
        request = urllib.request.Request(url, headers=headers)
        page = urllib.request.urlopen(request)
        contents = page.read()

        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(contents, "html.parser")
        print('Scraping Douban Top 250:\n')
        for tag in soup.find_all(attrs={"class": "item"}):
            # Rank on the chart
            num = tag.find('em').get_text()
            # Movie title (the first .title span holds the Chinese title)
            name = tag.find_all(attrs={"class": "title"})
            zwname = name[0].get_text()
            # Rating
            rating_num = tag.find(attrs={"class": "rating_num"}).get_text()
            # Number of raters, a string like "123456人评价"
            rating_people = tag.find_all(attrs={"class": "star"})[0].find_all('span')[-1].get_text().strip('()')
            # Pull out just the digits with a regular expression
            rating_people_num = re.findall(r'\d+', rating_people)[0]
            # One-line quote (some entries have none)
            content = tag.find(attrs={"class": "inq"}).get_text() if tag.find(attrs={"class": "inq"}) else ''
            print('[Quote]', content)

            # The .bd block holds director, cast, year, country, and genres
            year_text = tag.find(attrs={"class": "bd"}).get_text()
            lines = year_text.split('\n')
            print(lines[3])
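            # For a typical entry the split lines look roughly like this
            # (layout assumed from the patterns below; names are illustrative):
            #   lines[2]: 导演: 弗兰克·德拉邦特 Frank Darabont   主演: 蒂姆·罗宾斯 Tim Robbins /...
            #   lines[3]: 1994 / 美国 / 犯罪 剧情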
            # Director: capture the run of non-whitespace after "导演:",
            # i.e. usually just the Chinese name
            director_x = re.search(r'导演:\s*(\S+)', lines[2])
            if director_x:
                director = director_x.group(1)
            # Lead actor, same pattern
            actor_x = re.search(r'主演:\s*(\S+)', lines[2])
            if actor_x:
                actor = actor_x.group(1)
            # Release year: the first four-digit run in the block
            year = re.search(r'\d{4}', year_text)
            if year:
                year = year.group(0)
            # Country/region: the text between the first two slashes
            country = re.search(r'/\s*([^/]+?)\s*/', lines[3])
            if country:
                country = country.group(1)
            # Link to the movie's detail page
            url_movie = tag.find(attrs={"class": "hd"}).a.attrs['href']
            print(url_movie)
            # Fetch the detail page for language, runtime, genres, and synopsis
            # (requests and lxml are imported at the top of the file)
            req = requests.get(url_movie, headers=headers, proxies=proxies)
            req.encoding = 'utf-8'
            # Fallback placeholder values in case a pattern fails to match
            language = "英语"
            length = "123"
            story = "剧情"
            if req.status_code == 200:
                html = req.text
                language_match = re.search(r'<span class="pl">语言:</span> (.*?)<br/>', html)
                if language_match:
                    language = language_match.group(1)
                    print(f"Language: {language}")
                length_match = re.search(r'<span property="v:runtime" content="(\d+)">', html)
                if length_match:
                    length = length_match.group(1)
                    print(length)
                # Genres: collect every v:genre span; the original pattern only
                # matched pages whose first genre happened to be 剧情
                genres = re.findall(r'<span property="v:genre">(.*?)</span>', html)
                if genres:
                    story = ' / '.join(genres)
                    print(story)
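                # The patterns above assume detail-page markup of roughly this
                # shape (reconstructed from the regexes themselves, not verified
                # against the live page):
                #   <span class="pl">语言:</span> 英语<br/>
                #   <span property="v:runtime" content="142">142分钟</span>
                #   <span property="v:genre">剧情</span> / <span property="v:genre">犯罪</span>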
                # Synopsis: grab the intro paragraphs via XPath
                html = etree.HTML(html)
                content = html.xpath('//*[@id="link-report-intra"]/span/text()')
                cleaned_content = [text.strip() for text in content if text.strip()]
                print(cleaned_content)
                # Guard against an empty result instead of indexing blindly
                content = cleaned_content[0] if cleaned_content else ''
            # Write one CSV row per movie
            writer.writerow({'序号': num, '电影名': zwname, '导演': director, '主演': actor,
                             '类型': story, '制片国家': country, '语言': language,
                             '片长': length, '评分': rating_num, '评论人数': rating_people_num,
                             '上映年份': year, '简介': content})
            print(f"""序号:{num}, 电影名:{zwname}, 导演:{director}, 主演:{actor}, 简介:{content},
类型:{story}, 制片国家:{country}, 语言:{language}, 片长:{length}, 评分:{rating_num},
评论人数:{rating_people_num}, 上映年份:{year}""")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == '__main__':
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }
    # Open the CSV file for writing
    with open('movie1.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['序号', '电影名', '导演', '主演', '类型', '制片国家', '语言', '片长', '评分', '评论人数', '上映年份', '简介']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Page through the chart: each page shows 25 movies, so the start
        # parameter advances by 25 (use range(0, 10) to cover all 250 titles)
        for i in range(0, 4):
            print(f'Page {i}')
            num = i * 25
            url = f'https://movie.douban.com/top250?start={num}&filter='
            main(url, headers, writer)
            sleep(5 + random.random())  # random delay so rapid requests don't get the IP banned
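The proxy list is a plain-text file with one IP:port entry per line. A minimal ips.txt might look like the following (the addresses are illustrative placeholders, not working proxies):

218.95.39.56:11747
47.100.12.34:8080

Run the script with python followed by the file name; the results land in movie1.csv in the working directory.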
Original post: https://blog.csdn.net/qq_52331221/article/details/145109557