Penetration Testing Arsenal - Kali Tools (Chapter 4-6): Writing Crawlers
1. Learning a module from the official documentation:
Calling the help() function prints the docstring of a module, class, function, or method:
help(time) very detailed module documentation
help(range) very detailed class documentation
help(time.localtime()) very detailed documentation of the object the function returns
For example, using the __doc__ attribute:
print(time.__doc__) fairly detailed module documentation
print(time.localtime().__doc__) fairly detailed documentation of the returned object
print(range.__doc__) fairly detailed class documentation
Use dir() to see which attributes and methods a module or object provides:
dir(requests) lists the names defined in the requests module
print(dir(time)) brief listing of the time module's functions
print(dir(time.localtime())) brief listing of the returned struct_time object's attributes
print(dir(range)) brief listing of the range class's methods and attributes
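Putting these together, a minimal interactive sketch (run it in any Python 3 shell; requests only needs to be installed for the last line):

import time
import requests

help(time)                       # full, paged module documentation
print(time.__doc__)              # just the module docstring
print(time.localtime().__doc__)  # docstring of the returned struct_time object
print(dir(time))                 # names defined in the time module
print(dir(range))                # methods and attributes of the range class
print(dir(requests))             # names exported by the requests package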
In IPython, append a ? to an object to view its help:
requests.get?
Official Python documentation: https://docs.python.org/zh-cn/3
Viewing quick help and the official Python documentation in PyCharm:
Place the cursor on the object you want to look up and open the View menu: Quick Definition shows the object's definition.
Quick Documentation shows JetBrains' own explanatory documentation for the Python symbol.
The third option, External Documentation, opens the official Python help documentation in the browser.
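With PyCharm's default Windows/Linux keymap these actions are typically also bound to shortcuts: Ctrl+Shift+I for Quick Definition, Ctrl+Q for Quick Documentation, and Shift+F1 for External Documentation.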
2. Hands-on data scraping:
1. Scraping images:
1. Approach:
Based on a keyword we enter, scrape and download images from Baidu image search, and finally print related-search suggestions (a minimal sketch of the core extraction step follows the module list).
Required modules:
import re filters out the content we want
import requests network request module (fetches the page content)
from urllib import error exception handling
from bs4 import BeautifulSoup parses the page content
import os file and directory operations
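The core of the scraper is pulling the "objURL" fields out of the HTML that Baidu's flip-style image search returns. A minimal sketch of just that step, using a made-up sample string in place of a real response:

import re

# A made-up fragment resembling what the flip search page embeds
sample = '"objURL":"http://example.com/a.jpg","fromURL":"x","objURL":"http://example.com/b.jpg",'

# Non-greedy match between "objURL":" and ", one entry per image
pic_urls = re.findall('"objURL":"(.*?)",', sample, re.S)
print(pic_urls)   # ['http://example.com/a.jpg', 'http://example.com/b.jpg']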
2. Code implementation:
import re
import requests
from urllib import error
from bs4 import BeautifulSoup
import os

num = 0          # images downloaded so far
numPicture = 0   # number of images the user wants to download
file = ''        # name of the folder the images are saved to
List = []        # lists of image URLs collected per result page
def Find(url):
    # Count how many images are available, fetching result pages 60 at a time
    global List
    print('Counting the total number of images, please wait...')
    t = 0
    s = 0
    while t < 1000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except BaseException:
            t = t + 60
            continue
        else:
            result = Result.text
            # Baidu's flip-style result pages embed image addresses as "objURL":"...",
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                break
            else:
                List.append(pic_url)
                t = t + 60
    return s
def recommend(url):
    # Scrape the related-search box (div#topRS) and return its link texts
    Re = []
    try:
        html = requests.get(url)
    except (error.HTTPError, requests.exceptions.RequestException):
        return Re  # return an empty list rather than None so the caller can iterate it
    else:
        html.encoding = 'utf-8'
        bsobj = BeautifulSoup(html.text, 'html.parser')
        div = bsobj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re
def dowmloadPicture(html, keyword):
    # Extract every objURL from one result page and save the images to disk
    global num
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('Found images for keyword "' + keyword + '", starting download...')
    for each in pic_url:
        print('Downloading image ' + str(num + 1) + ', URL: ' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except BaseException:
            print('Error: this image could not be downloaded')
            continue
        else:
            string = os.path.join(file, keyword + '_' + str(num) + '.jpg')
            fp = open(string, 'wb')
            fp.write(pic.content)
            fp.close()
            num += 1
            if num >= numPicture:
                return
if __name__ == '__main__':
    word = input('Enter a search keyword (a name, a place, etc.): ')
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
    tot = Find(url)
    Recommend = recommend(url)  # collect the related-search suggestions
    print('Detected a total of %d images for "%s"' % (tot, word))
    numPicture = int(input('How many images do you want to download? '))
    file = input('Enter the name of a folder to store the images: ')
    if os.path.exists(file):
        print('That folder already exists, please enter another name')
        file = input('Enter the name of a folder to store the images: ')
    if not os.path.exists(file):
        os.mkdir(file)
    t = 0
    tmp = url
    while t < numPicture:
        try:
            url = tmp + str(t)
            result = requests.get(url, timeout=10)
            print(url)
        except (error.HTTPError, requests.exceptions.RequestException):
            print('Network error, please check the connection and retry')
            t = t + 60
        else:
            dowmloadPicture(result.text, word)
            t = t + 60
    print('You may also want to look at:')
    for rec in Recommend:  # related-search suggestions
        print(rec, end=' ')
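To try the script, save it under any name (for example baidu_pic.py, chosen arbitrarily here) and run it with python3: it prompts in turn for the keyword, the number of images, and the folder name, downloads the images into that folder, and finally prints the related-search suggestions.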
3. Scraping URLs:
1. Code implementation:
Usage: web_pc.py -p <pages> -t <threads> -o <output file> <keyword>
Explanation:
web_pc.py: the script's file name
keyword: Google Hacking syntax can be used here
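A concrete invocation might look like this (the page and thread counts are arbitrary example values; the keyword matches the argparse help text in the script):

python web_pc.py -p 5 -t 10 -o result.txt "inurl:.asp?id=1"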
import requests, re, threading, time
from bs4 import BeautifulSoup as bs
from queue import Queue
from argparse import ArgumentParser

arg = ArgumentParser(description='baidu_url_collection')
arg.add_argument('keyword', help='e.g. inurl:.asp?id=1')
arg.add_argument('-p', '--page', help='number of result pages to fetch', dest='pagecount', type=int)
arg.add_argument('-t', '--thread', help='number of worker threads', dest='thread_count', type=int, default=10)
arg.add_argument('-o', '--outfile', help='file to save the results to', dest='outfile', default='result.txt')
result = arg.parse_args()

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'}
class Bd_url(threading.Thread):
    # Worker thread: takes Baidu result-page URLs off the queue and collects the target URLs

    def __init__(self, que):
        threading.Thread.__init__(self)
        self._que = que

    def run(self):
        while not self._que.empty():
            URL = self._que.get()
            try:
                self.bd_url_collect(URL)
            except Exception as e:
                print('Exception: ', e)

    def bd_url_collect(self, url):
        r = requests.get(url, headers=headers, timeout=5)
        soup = bs(r.content, 'lxml', from_encoding='utf-8')
        # Result links on a Baidu results page carry a data-click attribute and no class
        bqs = soup.find_all(name='a', attrs={'data-click': re.compile(r'.'), 'class': None})
        for bq in bqs:
            # Follow Baidu's redirect link to obtain the real destination URL
            r = requests.get(bq['href'], headers=headers, timeout=5)
            if r.status_code == 200:
                print(r.url)
                with open(result.outfile, 'a+') as f:
                    f.write(r.url + '\n')
def main():
    thread = []
    thread_count = result.thread_count
    que = Queue()
    # Baidu paginates with pn in steps of 10 results per page
    for i in range(0, result.pagecount):
        que.put('https://www.baidu.com/s?wd=' + result.keyword + '&pn=' + str(i * 10))
    for i in range(thread_count):
        thread.append(Bd_url(que))
    for i in thread:
        i.start()
    for i in thread:
        i.join()

if __name__ == '__main__':
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print('Finished in %.2f seconds' % (end - start))
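All worker threads share a single Queue, which is thread-safe, so the page URLs are distributed across the threads without any explicit locking; each thread simply loops until the queue is empty. The output file is opened in append mode, so results from repeated runs accumulate in the same file.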
Original article: https://blog.csdn.net/ztc131450/article/details/144408993