0x00: Parsing the pages with XPath
#coding: utf-8
import requests
import os
import re
from lxml import etree
import time


def get_title(title):  # take the gallery title and create its storage folder
    path = r"./Pic/" + title
    if os.path.exists(path):  # the folder already exists, just return it
        return path
    else:
        os.makedirs(path)  # create an empty folder
        return path
def pic_get(info):  # download the images of one gallery
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": url
    }
    store_path = info['path']
    all = info['all']
    print("Fetching " + store_path + " ******************")
    for i in range(1, all + 1):
        i_str = str(i)
        finall_url = url + "/" + i_str
        response = requests.get(finall_url, headers=header)
        data = response.content.decode('utf-8')
        try:
            html = etree.HTML(data)
            img_url = html.xpath("//div[@class=\"main-image\"]//img")[0].xpath("./@src")[0]
            response = requests.get(img_url, headers=header)
            if response.status_code == 200:
                data = response.content
                with open(store_path + "/" + i_str + '.jpg', "wb+") as fp:
                    fp.write(data)
                print(img_url)
            time.sleep(0.5)
        except:
            pass
    return
def url_create(url_path, type):
    # generate the listing-page urls
    if type == 'main_url':
        print("Collecting all reachable pages....")
        parser = etree.HTMLParser(encoding="utf-8")
        html = etree.parse(url_path, parser)
        num = html.xpath("//div[@class=\"nav-links\"]/a[4]")[0].xpath('text()')[0]
        main_url = []
        for i in range(1, int(num) - 10):
            tmp_url = "https://www.xxxx.com/tag/xxx/page/" + str(i)
            main_url.append(tmp_url)
        return main_url
    # collect the gallery urls on one listing page
    if type == 'pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        lis = html.xpath("//ul[@id=\"pins\"]/li/span/a")
        pic_info = []
        for li in lis:
            tmp_url = li.xpath("./@href")[0]
            title = li.xpath("text()")[0]
            pre_rul = r"[:,.<>'\":]"
            title = re.sub(pre_rul, '-', title)  # replace characters that are awkward in folder names
            path = get_title(title)  # create the folder
            info = {
                "path": path,
                "url": tmp_url
            }
            pic_info.append(info)
        return pic_info
    # check how many images each gallery holds
    if type == 'title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        all = html.xpath("//div[@class=\"pagenavi\"]/a/span")[4].xpath("text()")[0]
        return int(all)
def main():
    # fetch the front page first to get the basic parameters
    url = "https://www.xxxxxx.com/tag/xxxxxx/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode("utf-8")
    with open(r"./1.html", "w+", encoding="utf-8") as fp:
        fp.write(data)
    # call the url generator to build the list of pages that can be visited
    url_path = r"./1.html"
    main_url = url_create(url_path, 'main_url')  # all reachable listing pages
    time.sleep(1)
    # visit every listing page and collect all gallery links on it
    pic_url = []
    for page_url in main_url:
        tmp_url = url_create(page_url, 'pic_url')
        pic_url.append(tmp_url)
        # print(pic_url)
        time.sleep(1)  # slow down so the server keeps responding
    # work through the collected information
    for first in pic_url:
        for seconde in first:
            all = url_create(seconde['url'], "title_url")
            seconde['all'] = all
            time.sleep(0.5)
    print("All information collected, starting the downloads!!!!\n")
    print(pic_url)
    for first in pic_url:
        for seconde in first:
            pic_get(seconde)
            time.sleep(0.5)


if __name__ == '__main__':
    main()
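The part worth dwelling on in this first version is the xpath handling: etree.HTML() parses the downloaded markup into an element tree, a path such as //div[@class="main-image"]//img finds the image node however deeply it is nested, and ./@src then pulls the attribute out of it. A minimal, self-contained sketch of that two-step lookup (the HTML fragment below is invented for illustration, it is not taken from the real site):

#coding: utf-8
from lxml import etree

# invented fragment standing in for the downloaded page
page = '''
<html><body>
  <div class="main-image">
    <p><a href="#"><img src="https://example.com/img/01.jpg" alt="demo"/></a></p>
  </div>
</body></html>
'''

html = etree.HTML(page)                                  # parse the string into an element tree
img = html.xpath('//div[@class="main-image"]//img')[0]   # // searches at any depth
print(img.xpath('./@src')[0])                            # -> https://example.com/img/01.jpg

The two steps could also be collapsed into a single expression like //div[@class="main-image"]//img/@src; pic_get() keeps them separate, which is handy when the element itself is needed as well.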
0x01: Fetching the page data with regular expressions:
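Before the full script, the core idea in one place: everything is pulled out of the raw HTML with re.findall, and the re.DOTALL flag makes . match newlines so a single pattern can span several lines of markup. A small sketch of that idea, with an invented fragment and invented tag names standing in for the real page:

# -*- coding: utf-8 -*-
import re

# invented fragment standing in for response.content.decode()
data = '''
<h2 class="main-title">
  demo gallery
</h2>
<img src="https://example.com/img/01.jpg" alt="demo">
'''

title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data, re.DOTALL)  # (.*?) spans the line break
pic = re.findall(r'<img\ssrc="(.*?)".*?>', data)                           # findall returns the captured groups
print(title[0].strip(), pic[0])

Because re.findall returns only the captured groups, the script below always indexes the results with [0].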
#-*-coding:utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os
def get_Pic(url):
    print(url)
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # the three patterns below are tied to the target page's markup; adjust them to the actual HTML
    title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data)
    Pic_url = re.findall(r'<img\ssrc="(.*?)".*?>', data, re.DOTALL)
    max = re.findall(r'<span>…</span>.*?<span>(.*?)</span>', data, re.DOTALL)  # last page number sits after the "…" item
    # create the storage folder
    path = "./Pic/" + title[0]
    if os.path.exists(path):
        print("Images will be stored in: " + path)
        pass
    else:
        print("Created storage folder " + path)
        os.makedirs(path)
    #############
    # start downloading the images
    for i in range(1, int(max[0]) + 1):
        if i < 10:
            i_str = "0" + str(i)
        else:
            i_str = str(i)
        pic_url = Pic_url[0][:-6] + i_str + ".jpg"  # the file names only differ in the two-digit index
        print("Downloading " + pic_url)
        try:
            response = requests.get(pic_url, headers=header)
            store_path = path + "/" + i_str + ".jpg"
            with open(store_path, "wb+") as fp:
                fp.write(response.content)
            time.sleep(0.5)
        except:
            print(pic_url + " failed, moving on to the next image")
            pass
    return
def get_Url(url):
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    all_url = re.findall(r"https://www.xxxxxx.com/\d{4,6}", data)
    return list(set(all_url))  # deduplicate before returning
def get_wight():
    print("Range example: 4-10 crawls page four through page ten.")
    in_ = input("Enter the page range to crawl (too many pages may stall the service, 10 pages at most): ")
    wight = re.findall(r".*(\d{1,2}).(\d{1,2}).*", in_, re.DOTALL)
    if wight == []:
        print("Invalid page range!")
        exit(0)
    else:
        (start, end) = wight[0]
        start = int(start)
        end = int(end)
        if start <= 0 or start > end:
            print("Please enter the range again.")
            exit(0)
        elif end > 230:
            print("The end of the range is beyond the last page.")
            exit(0)
        elif end - start > 10:
            print("The range is too wide, please enter it again.")
            exit(0)
    return (start, end)
def main():
    (start, end) = get_wight()
    urls = []
    for i in range(start, end + 1):
        i_str = str(i)
        url = "https://www.xxxxx.com/page/%s/" % i_str
        # print(url)
        url_list = get_Url(url)
        time.sleep(1)  # sleep one second so the server is not hit too fast
        urls.append(url_list)
    pool = Pool(15)  # create the process pool
    for url_list in urls:
        for url in url_list:
            next_one = pool.apply_async(get_Pic, args=(url,))
            time.sleep(0.5)
    next_one.wait()
    print("Waiting for all child processes to finish")
    pool.close()
    pool.join()
    print("All images downloaded")


if __name__ == '__main__':
    main()
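What speeds this version up is the multiprocessing.Pool: apply_async hands each gallery URL to one of the 15 worker processes and returns immediately, so waiting on the last AsyncResult alone is not enough; it is pool.close() followed by pool.join() that actually blocks until every download has finished. A stripped-down sketch of that submit/close/join pattern (work() is a stand-in for get_Pic, not the real function):

from multiprocessing import Pool
import time

def work(url):  # stand-in for get_Pic
    time.sleep(0.2)
    print("done:", url)

if __name__ == '__main__':
    pool = Pool(4)                  # four worker processes
    for i in range(10):
        pool.apply_async(work, args=("https://example.com/%d" % i,))
    pool.close()                    # no more tasks will be submitted
    pool.join()                     # block until every worker has finished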
end:
I had seen plenty of people crawl this site before, and having just learned some crawling I gave it a try myself. I hit a few pitfalls along the way and learned a lot from them. Onward!