0x00:使用xpath進(jìn)行網(wǎng)頁解析
#coding: utf-8
import requests
import os
import re
from lxml import etree
import time
def get_title(title):
    """Return the storage directory for a gallery title, creating it if needed.

    Args:
        title: sanitized gallery title, used as the folder name.

    Returns:
        The relative path "./Pic/<title>".
    """
    path = r"./Pic/" + title
    # exist_ok=True replaces the original check-then-create (os.path.exists
    # followed by os.makedirs), which was race-prone between the two calls.
    os.makedirs(path, exist_ok=True)
    return path
def pic_get(info):
    """Download every image of one gallery.

    Args:
        info: dict with keys "url" (gallery base URL), "path" (target
            directory created beforehand) and "all" (number of pages,
            one image per page).
    """
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        # The site checks the Referer to block hot-linking.
        "Referer": url
    }
    store_path = info['path']
    total = info['all']  # renamed: the original shadowed the builtin `all`
    print("獲取" + store_path + "******************")
    for i in range(1, total + 1):
        i_str = str(i)
        final_url = url + "/" + i_str
        try:
            response = requests.get(final_url, headers=header)
            data = response.content.decode('utf-8')
            html = etree.HTML(data)
            # NOTE: the original text was mangled to "http://div[...]";
            # an XPath starting at the document root must begin with "//".
            img_url = html.xpath('//div[@class="main-image"]//img')[0].xpath("./@src")[0]
            response = requests.get(img_url, headers=header)
            if response.status_code == 200:
                # `with` closes the file even on error; no explicit close needed.
                with open(store_path + "/" + i_str + '.jpg', "wb") as fp:
                    fp.write(response.content)
            print(img_url)
            time.sleep(0.5)  # throttle so the server does not drop us
        except (requests.RequestException, IndexError, UnicodeDecodeError) as err:
            # Best-effort like the original, but say why a page was skipped
            # instead of a bare `except: pass` that hides every bug.
            print("skip page " + i_str + ": " + repr(err))
    return
def url_create(url_path, type):
    """Build crawl URLs / metadata, dispatching on *type*.

    Args:
        url_path: a local HTML file path when type='main_url', otherwise a
            remote URL.
        type: 'main_url', 'pic_url' or 'title_url'.  (Shadows the builtin;
            name kept so existing callers keep working.)

    Returns:
        'main_url'  -> list of listing-page URLs,
        'pic_url'   -> list of {"path", "url"} dicts, one per gallery,
        'title_url' -> int, number of images in one gallery.
    """
    # NOTE: all three XPath literals below were mangled to "http://..." in
    # the original paste; a root XPath must start with "//".
    if type == 'main_url':
        # Parse the saved front page and read the last page number out of
        # the pagination bar (4th <a> of div.nav-links).
        print("正在獲取全部可訪問頁面....")
        parser = etree.HTMLParser(encoding="utf-8")
        html = etree.parse(url_path, parser)
        num = html.xpath('//div[@class="nav-links"]/a[4]')[0].xpath('text()')[0]
        main_url = []
        for i in range(1, int(num) - 10):
            main_url.append("https://www.xxxx.com/tag/xxx/page/" + str(i))
        return main_url
    if type == 'pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        lis = html.xpath('//ul[@id="pins"]/li/span/a')
        pic_info = []
        for li in lis:
            tmp_url = li.xpath("./@href")[0]
            title = li.xpath("text()")[0]
            # Replace characters that are illegal in Windows folder names.
            title = re.sub(r"[:,.<>'\":]", '-', title)
            path = get_title(title)  # create the storage folder
            pic_info.append({"path": path, "url": tmp_url})
        return pic_info
    if type == 'title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        # The 5th pagination span holds the gallery's last page number.
        total = html.xpath('//div[@class="pagenavi"]/a/span')[4].xpath("text()")[0]
        return int(total)
def main():
    """Crawl the tag front page, enumerate all galleries, then download them."""
    # Fetch the front page first to discover how many listing pages exist.
    url = "https://www.xxxxxx.com/tag/xxxxxx/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode("utf-8")
    # Save it locally so url_create('main_url') can parse it as a file.
    # (`with` closes the file; the original's extra fp.close() was redundant.)
    with open(r"./1.html", "w+", encoding="utf-8") as fp:
        fp.write(data)
    url_path = r"./1.html"
    main_url = url_create(url_path, 'main_url')  # all listing pages
    time.sleep(1)
    # Visit each listing page and collect its gallery links.
    pic_url = []
    for page_url in main_url:
        pic_url.append(url_create(page_url, 'pic_url'))
        time.sleep(1)  # throttle: the server rejects rapid-fire requests
    # Ask every gallery how many images it has.
    for page in pic_url:
        for gallery in page:
            gallery['all'] = url_create(gallery['url'], "title_url")
            time.sleep(0.5)
    print("全部信息獲取完畢,開始下載圖片?。。?!\n")
    print(pic_url)
    for page in pic_url:
        for gallery in page:
            pic_get(gallery)
            time.sleep(0.5)


if __name__ == '__main__':
    main()
0x01:使用正則表達(dá)式進(jìn)行網(wǎng)頁數(shù)據(jù)獲?。?
#-*-coding:utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os
def get_Pic(url):
    """Download all images of one gallery (regex-parsing variant).

    Args:
        url: gallery page URL; also sent as Referer to defeat hot-link checks.
    """
    print(url)
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # NOTE(review): the three patterns below were destroyed when this code
    # was published as HTML; they are reconstructed from the surviving
    # fragments ('<img\ssrc="...', the pagination ellipsis '…') and from how
    # the captures are used below — confirm against the live page markup.
    title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data)
    Pic_url = re.findall(r'<img\ssrc="(.*?)"', data, re.DOTALL)
    # Last page number: the <span> following the "…" ellipsis in the pager.
    max_page = re.findall(r'…</span>.*?<span>(.*?)</span>', data, re.DOTALL)
    # Create the storage folder.
    path = "./Pic/" + title[0]
    if os.path.exists(path):
        print("圖片存儲位置:" + path)
    else:
        print("成功創(chuàng)建存儲文件夾" + path)
        os.makedirs(path)
    # Download images 1..max; server file names are zero-padded to 2 digits.
    for i in range(1, int(max_page[0]) + 1):
        i_str = str(i).zfill(2)  # same result as the original if/else padding
        # All image URLs differ only in the trailing "NN.jpg" (6 chars).
        pic_url = Pic_url[0][:-6] + i_str + ".jpg"
        print("開始下載" + pic_url)
        try:
            response = requests.get(pic_url, headers=header)
            store_path = path + "/" + i_str + ".jpg"
            # `with` closes the file even on error; no explicit close needed.
            with open(store_path, "wb") as fp:
                fp.write(response.content)
            time.sleep(0.5)
        except requests.RequestException:
            # Narrowed from the original bare `except`: only a failed
            # download should be skipped, not e.g. a typo in this code.
            print(pic_url + "下載失敗,下載下一張")
    return
def get_Url(url):
    """Collect the distinct gallery URLs linked from one listing page."""
    headers = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    }
    page = requests.get(url, headers=headers).content.decode()
    # Gallery pages are addressed by a 4-6 digit numeric id.
    matches = re.findall(r"https://www.xxxxxx.com/\d{4,6}", page)
    # A listing links each gallery several times; deduplicate before returning.
    return list(set(matches))
def get_wight():
print("頁數(shù)區(qū)間實(shí)例:4-10,爬行第四頁到第十頁。")
in_ = input("請輸入想爬行的頁數(shù)區(qū)間(頁數(shù)過多可能導(dǎo)致服務(wù)停止,最大10頁):")
wight = re.findall(r".*(\d{1,2}).(\d{1,2}).*", in_, re.DOTALL)
if wight == []:
print("爬行區(qū)間輸入有誤!")
exit(0)
else:
(start, end) = wight[0]
start = int(start)
end = int(end)
if start <= 0 or start > end:
print("請重新輸入爬行區(qū)間。")
exit(0)
elif end > 230:
print("末區(qū)間超過最大頁數(shù)。")
exit(0)
elif end - start > 10:
print("區(qū)間間隔過大,請重新輸入。")
exit(0)
return (start,end)
def main():
    """Collect gallery URLs for the chosen pages, then download in parallel."""
    (start, end) = get_wight()
    urls = []
    for page_no in range(start, end + 1):
        page_url = "https://www.xxxxx.com/page/%s/" % page_no
        urls.append(get_Url(page_url))
        time.sleep(1)  # throttle listing requests
    pool = Pool(15)  # process pool: downloads are network-bound
    for url_list in urls:
        for url in url_list:
            pool.apply_async(get_Pic, args=(url,))
            time.sleep(0.5)  # stagger submissions to spread the load
    # BUG FIX: the original kept only the LAST AsyncResult and .wait()ed on
    # it, which does not wait for the other tasks.  close() + join() below
    # waits for every queued task to finish.
    print("等待全部子進(jìn)程結(jié)束")
    pool.close()
    pool.join()
    print("圖片下載完成")


if __name__ == '__main__':
    main()
end:
之前看到有一堆人都爬過,剛好學(xué)了爬蟲,來試試手,中間遇到了一些坑,還是很有成長的,繼續(xù)加油!
更多文章、技術(shù)交流、商務(wù)合作、聯(lián)系博主
微信掃碼或搜索:z360901061
微信掃一掃加我為好友
QQ號聯(lián)系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點(diǎn)擊下面給點(diǎn)支持吧,站長非常感激您!手機(jī)微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點(diǎn)擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元

