自学内容网

Python 爬虫数据(小说)

"""
确定目标网站:https://www.wxscs.com/book/9422/
内容页:
"""
# Build the chapter index of one novel: download the book's index page,
# extract every chapter link from the HTML and save the resulting
# {title: url} mapping to chapters.txt as JSON.
import json
import re

# Index page of the book.
url = "https://www.wxscs.com/book/9422/"
# Browser-like request headers sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"
}

# Sample anchor found on the index page:
# <a href="/book/9422/1874033.html" title="第九章 壶娱中秋节" target="_blank">第九章 壶娱中秋节</a>
# Group 1 captures the site-relative chapter path, group 2 the chapter title.
# Only paths starting with /book/9422/187 and titles starting with "第" match.
p = r'<a href="(/book/9422/187.*?)"\s+title=".*?"\s+target="_blank">(第.*?)</a>'


def fetch_index_html(index_url: str = url, timeout: float = 15) -> str:
    """Download ``index_url`` and return its body decoded as UTF-8.

    Raises ``requests.HTTPError`` when the server answers with a 4xx/5xx
    status, and a ``requests`` timeout error when the server does not
    respond within ``timeout`` seconds.
    """
    # Imported here so the parsing helpers below can be imported without the
    # third-party HTTP client being installed.
    import requests

    response = requests.get(index_url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty index.
    response.raise_for_status()
    response.encoding = "UTF-8"
    return response.text


def find_chapter_links(html: str) -> list:
    """Return ``(relative_path, title)`` tuples for every chapter anchor in ``html``."""
    return re.findall(p, html)


def build_chapter_map(links, base: str = "https://www.wxscs.com") -> dict:
    """Map each chapter title to its absolute URL.

    ``links`` is an iterable of ``(relative_path, title)`` pairs.  When two
    pairs share a title, the later one wins.
    """
    return {title: base + path for path, title in links}


def save_chapters(chapter: dict, path: str = "chapters.txt") -> None:
    """Write the title -> URL mapping to ``path`` as UTF-8 JSON."""
    with open(path, mode="wt", encoding="UTF-8") as file:
        # ensure_ascii=False keeps the Chinese titles readable in the file
        # instead of escaping every character as \uXXXX.
        json.dump(chapter, file, ensure_ascii=False)


if __name__ == "__main__":
    content = fetch_index_html()
    # Print the raw HTML page.
    print(content)

    chs = find_chapter_links(content)
    print(chs)

    chapter = build_chapter_map(chs)
    # Final title -> link data.
    print(chapter)

    # Produces chapters.txt, which holds the chapter index.
    save_chapters(chapter)

"""
章节内数据
"""

# Download every chapter listed in chapters.txt and append the cleaned text
# of each one to a single output file.
import json
import random
import re
import time

# Browser-like request headers sent with every chapter request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"
}

# The chapter text is whatever follows the first <script> element inside the
# "cont-body" div, up to the next closing </div>.  Compiled once, outside the
# download loop; DOTALL lets "." span the line breaks inside the page.
CONTENT_PATTERN = re.compile(
    r'<div id="cont-body"\s+class="cont-body 187.*?">.*?<script>.*?</script>(.*?)</div>',
    re.DOTALL,
)
# Opening and closing paragraph tags.
PARAGRAPH_TAG = re.compile(r'(<p>|</p>)')


def extract_chapter_text(html: str):
    """Return the chapter text found in ``html``, or ``None`` if none is found.

    The captured content is stripped of surrounding whitespace and every
    ``<p>`` / ``</p>`` tag is replaced with a newline.
    """
    match = CONTENT_PATTERN.search(html)
    if match is None:
        return None
    # Replace ALL paragraph tags.  The previous call passed re.X as the fourth
    # positional argument of re.sub, which is `count`, so only the first 64
    # tags were ever replaced.
    return PARAGRAPH_TAG.sub("\n", match.group(1).strip())


def fetch_html(page_url: str, timeout: float = 15) -> str:
    """Download ``page_url`` and return its body decoded as UTF-8.

    Raises ``requests.HTTPError`` on a 4xx/5xx response and a ``requests``
    timeout error when the server does not answer within ``timeout`` seconds.
    """
    # Imported here so extract_chapter_text can be imported without the
    # third-party HTTP client being installed.
    import requests

    response = requests.get(page_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = "UTF-8"
    return response.text


def download_chapters(index_path: str = "chapters.txt",
                      output_path: str = "杨戬.txt") -> None:
    """Fetch every chapter in the JSON index and append it to ``output_path``.

    ``index_path`` must contain a JSON object mapping chapter title to
    chapter URL.  A chapter whose page has no recognisable content block is
    reported and skipped instead of aborting the run.
    """
    # Load the chapter index.
    with open(index_path, encoding="UTF-8") as file:
        chs = json.load(file)

    # Each entry pairs a chapter title with its page URL.
    for title, url in chs.items():
        print(f"准备采集{title}\n")
        html = fetch_html(url)
        print("---------------------")

        # Locate the wanted content with the regex and clean it up.
        content = extract_chapter_text(html)
        saved = content is not None
        if saved:
            # Append the chapter to the output file.
            with open(output_path, mode="at", encoding="UTF-8") as file:
                file.write("\n\n---------------\n\n")
                file.write("\n\n" + title + "\n\n")
                file.write(content)
        else:
            print(f"{title}未找到正文,已跳过")

        # Random pause between requests so the traffic looks less automated.
        time.sleep(random.randint(5, 10))
        if saved:
            print(f"{title}采集完成")


if __name__ == "__main__":
    download_chapters()


原文地址:https://blog.csdn.net/2301_81140745/article/details/143906324

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!