Scraping Novel Content with Python
Using the third-party requests module, file I/O, and regular expressions, the scraper is wrapped in functions to collect the data. Following the CSDN write-up, the code is summarized in three parts:
1. Requirements analysis: how to determine the URLs to scrape and the data to extract
2. Implementation: the code with detailed comments
3. Results: screenshots of the collected data
Requirements Analysis
Catalog URL: the novel's table-of-contents page, titled "我在精神病院学斩神完整版在线免费阅读_我在精神病院学斩神小说_番茄小说官网"
Content URL: a chapter page, titled "我在精神病院学斩神第1章 黑缎缠目在线免费阅读_番茄小说官网"
Also check whether each chapter is split across multiple pages (see the sketch below).
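The original write-up does not show the markup of a continuation link, so the pattern below is an assumption; inspect the chapter HTML in your browser's developer tools and adjust it to match. A minimal sketch:

import re

def has_next_page(html):
    # Assumed pattern: an anchor whose text is 下一页 ("next page");
    # adjust to the real markup observed in the page source.
    return re.search(r'<a[^>]*>\s*下一页\s*</a>', html) is not None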
(1) Scraping the table of contents
"""
采集章节数据
"""
#导入模块
import requests
import json
import re
def fetch_chapter_data(url):
"""
发起请求获取章节数据
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
}
# 发起伪造请求
response = requests.get(url, headers=headers)
# 设置响应编码
response.encoding = "UTF-8"
return response.text
def extract_chapters(content):
"""
从响应内容中提取章节名称和链接
"""
#正则表达式
p = r'href="([^"]+)"[^>]*>(第\d+章.+?)</a>'
#全部匹配的方式提取数据
chs = re.findall(p, content, re.DOTALL)
#拼接链接和章节名称
chapter = dict()
for ch in chs:
chapter[ch[1]] = "https://fanqienovel.com" + ch[0]
print(chapter)
return chapter
def save_chapters_to_file(chapters, filename):
"""
将章节数据保存到文件
"""
with open(filename, mode="wt", encoding="utf-8") as file:
json.dump(chapters, file)
def main():
"""
调用相应函数
"""
url = "https://fanqienovel.com/page/6982529841564224526"
content = fetch_chapter_data(url)
chapters = extract_chapters(content)
save_chapters_to_file(chapters, "chapters.txt")
print("章节数据已成功保存至chapters.txt")
if __name__ == "__main__":
main()
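As a quick sanity check (not part of the original script), the saved file can be loaded back to confirm the catalog round-trips; this assumes chapters.txt sits in the current working directory:

import json

with open("chapters.txt", mode="rt", encoding="utf-8") as file:
    chapters = json.load(file)
print(f"{len(chapters)} chapters saved")   # number of catalog entries
print(next(iter(chapters.items())))        # first (title, url) pair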
Run result (screenshot in the original post)
(2) Scraping chapter content
"""
采集章节
"""
import requests,re
import time,random
import json
#1.加载需要采集的目录
with open("chapters.txt",mode="rt",encoding="UTF-8") as file:
chs = json.load(file)
#print(chs)
#2.循环遍历,发起伪造请求
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"}
for title,url in chs.items():
print(f"准备采集:{title}")
#发起伪造请求
response = requests.get(url,headers=headers)
#设置编码
response.encoding = "UTF-8"
# 分析数据格式
content = response.text
#print("------------------------")
#定义正则,匹配数据
p =r'<div class="muye-reader-content noselect">(.*?)</div>'
content = re.search(p,content,re.DOTALL)
content = content.group(1).strip()
# # #数据筛选
p2 = r'<p.*?>(.*?)</p>'
content = re.findall(p2,content,re.DOTALL)
p3 = r'\\[^\s]*'
content = re.sub(p3, '', ''.join(content))
# print(content)
with open("斩神.txt",mode="at",encoding="UTF-8") as file:
#保存到文件
file.write("\n\n---------------------\n\n")
file.write("\n\n"+title+"\n\n")
file.write(content)
#模拟用户请求,每次请求完成休眠3~5s
time.sleep(random.randint(3, 5))
print(f"{title}章节采集完成")
Run result (screenshot in the original post)
Original article: https://blog.csdn.net/2301_80811863/article/details/143993958