Python爬虫入门
豆瓣电影 Top 250 榜单爬取:
直接上代码:
"""Crawl the Douban movie Top 250 list and print rank, title, country and year."""

import html
import re

# Separator between the year / country / genre fields and in front of
# alternative titles.
# NOTE(review): the published listing showed a plain ' / ' here, but the code
# stripped every space before splitting on it, so the split could never
# succeed. The page markup presumably uses the "&nbsp;/&nbsp;" entity form
# (lost when the article was rendered) -- confirm against a live response.
# The already-decoded U+00A0 form is accepted as well.
_FIELD_SEP = re.compile(r'(?:&nbsp;|\xa0)\s*/\s*(?:&nbsp;|\xa0)')
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>', re.S)
_INFO_RE = re.compile(r'<br>(.*?)</p>', re.S)

_HEADERS = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
_REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the crawl


def _clean_field(raw):
    """Decode HTML entities in *raw* and drop all whitespace (indentation, newlines)."""
    return "".join(html.unescape(raw).split())


def parse_top250_page(page_html):
    """Extract the movies listed in one Top 250 result page.

    Args:
        page_html: Raw HTML text of a result page.

    Returns:
        A list of ``(title, country, year)`` string tuples in page order.
        A field missing from the page is returned as an empty string.
    """
    # Each movie has several <span class="title"> entries; alternative titles
    # carry a leading separator, so only separator-free ones are main titles.
    titles = [
        html.unescape(title).strip()
        for title in _TITLE_RE.findall(page_html)
        if not _FIELD_SEP.search(title)
    ]

    movies = []
    # zip() stops at the shorter list, so a title/info count mismatch cannot
    # raise IndexError the way positional indexing would.
    for title, info in zip(titles, _INFO_RE.findall(page_html)):
        fields = [_clean_field(part) for part in _FIELD_SEP.split(info)]
        year = fields[0]
        country = fields[1] if len(fields) > 1 else ''
        movies.append((title, country, year))
    return movies


def top250_crawer(url, sum):
    """Fetch one Top 250 result page and print one line per movie.

    Each line has the form ``<rank>.<title>,<country>,<year>``.

    Args:
        url: Address of the result page to download.
        sum: Number of movies already listed before this page; ranks printed
            for this page start at ``sum + 1``. (The name shadows the builtin
            but is kept so existing callers keep working.)

    Returns:
        None. A failed request is reported on stdout and nothing else is
        printed for that page.
    """
    # Imported here rather than at module level so that parse_top250_page can
    # be used without the third-party dependency installed.
    import requests

    try:
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()  # raise if the request was not successful
    except requests.RequestException as e:
        print(f"请求出现问题: {e}")
        return

    movies = parse_top250_page(response.text)
    for rank, (title, country, year) in enumerate(movies, start=sum + 1):
        print(f"{rank}.{title},{country},{year}")


def main():
    """Walk the 10 result pages (25 movies each, 250 entries in total)."""
    base_url = 'https://movie.douban.com/top250'
    for start in range(0, 250, 25):
        # The first page is the bare URL; later pages are selected via ?start=N.
        page_url = base_url if start == 0 else f"{base_url}?start={start}&filter="
        top250_crawer(page_url, start)


if __name__ == "__main__":
    main()
原文地址:https://blog.csdn.net/yzx991013/article/details/145212727
免责声明:本站文章内容转载自网络资源,如侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!