0x00: Parsing the pages with XPath
#coding: utf-8
import requests
import os
import re
from lxml import etree
import time


def get_title(title):  # take the gallery title and create its storage folder
    path = r"./Pic/" + title
    if os.path.exists(path):  # the folder already exists, just return it
        return path
    else:
        os.makedirs(path)  # create an empty folder
        return path
def pic_get(info):  # download the images of one gallery
    url = info['url']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Referer": url
    }
    store_path = info['path']
    all = info['all']
    print("Fetching " + store_path + " ******************")
    for i in range(1, all + 1):
        i_str = str(i)
        finall_url = url + "/" + i_str
        response = requests.get(finall_url, headers=header)
        data = response.content.decode('utf-8')
        try:
            html = etree.HTML(data)
            img_url = html.xpath("//div[@class=\"main-image\"]//img")[0].xpath("./@src")[0]
            response = requests.get(img_url, headers=header)
            if response.status_code == 200:
                data = response.content
                with open(store_path + "/" + i_str + '.jpg', "wb+") as fp:
                    fp.write(data)
                print(img_url)
            time.sleep(0.5)
        except:
            pass
    return
def url_create(url_path, type):
    # generate the listing-page urls
    if type == 'main_url':
        print("Collecting all reachable pages....")
        parser = etree.HTMLParser(encoding="utf-8")
        html = etree.parse(url_path, parser)
        num = html.xpath("//div[@class=\"nav-links\"]/a[4]")[0].xpath('text()')[0]
        main_url = []
        for i in range(1, int(num) - 10):
            tmp_url = "https://www.xxxx.com/tag/xxx/page/" + str(i)
            main_url.append(tmp_url)
        return main_url
    # collect the gallery urls on one listing page
    if type == 'pic_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        lis = html.xpath("//ul[@id=\"pins\"]/li/span/a")
        pic_info = []
        for li in lis:
            tmp_url = li.xpath("./@href")[0]
            title = li.xpath("text()")[0]
            pre_rul = r"[:,.<>'\":]"
            title = re.sub(pre_rul, '-', title)  # replace characters that are awkward in folder names
            path = get_title(title)  # create the folder
            info = {
                "path": path,
                "url": tmp_url
            }
            pic_info.append(info)
        return pic_info
    # check how many images each gallery holds
    if type == 'title_url':
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        }
        response = requests.get(url_path + "/1", headers=header)
        data = response.content.decode("utf-8")
        html = etree.HTML(data)
        all = html.xpath("//div[@class=\"pagenavi\"]/a/span")[4].xpath("text()")[0]
        return int(all)
def main():
    # fetch the front page first to get the basic parameters
    url = "https://www.xxxxxx.com/tag/xxxxxx/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode("utf-8")
    with open(r"./1.html", "w+", encoding="utf-8") as fp:
        fp.write(data)
    # call the url generator to build the list of pages that can be visited
    url_path = r"./1.html"
    main_url = url_create(url_path, 'main_url')  # all reachable listing pages
    time.sleep(1)
    # visit every listing page and collect all gallery links on it
    pic_url = []
    for page_url in main_url:
        tmp_url = url_create(page_url, 'pic_url')
        pic_url.append(tmp_url)
        # print(pic_url)
        time.sleep(1)  # slow down so the server keeps responding
    # work through the collected information
    for first in pic_url:
        for seconde in first:
            all = url_create(seconde['url'], "title_url")
            seconde['all'] = all
            time.sleep(0.5)
    print("All information collected, starting the downloads!!!!\n")
    print(pic_url)
    for first in pic_url:
        for seconde in first:
            pic_get(seconde)
            time.sleep(0.5)


if __name__ == '__main__':
    main()
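The part worth dwelling on in this first version is the xpath handling: etree.HTML() parses the downloaded markup into an element tree, a path such as //div[@class="main-image"]//img finds the image node however deeply it is nested, and ./@src then pulls the attribute out of it. A minimal, self-contained sketch of that two-step lookup (the HTML fragment below is invented for illustration, it is not taken from the real site):

#coding: utf-8
from lxml import etree

# invented fragment standing in for the downloaded page
page = '''
<html><body>
  <div class="main-image">
    <p><a href="#"><img src="https://example.com/img/01.jpg" alt="demo"/></a></p>
  </div>
</body></html>
'''

html = etree.HTML(page)                                  # parse the string into an element tree
img = html.xpath('//div[@class="main-image"]//img')[0]   # // searches at any depth
print(img.xpath('./@src')[0])                            # -> https://example.com/img/01.jpg

The two steps could also be collapsed into a single expression like //div[@class="main-image"]//img/@src; pic_get() keeps them separate, which is handy when the element itself is needed as well.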
0x01: Fetching the page data with regular expressions:
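Before the full script, the core idea in one place: everything is pulled out of the raw HTML with re.findall, and the re.DOTALL flag makes . match newlines so a single pattern can span several lines of markup. A small sketch of that idea, with an invented fragment and invented tag names standing in for the real page:

# -*- coding: utf-8 -*-
import re

# invented fragment standing in for response.content.decode()
data = '''
<h2 class="main-title">
  demo gallery
</h2>
<img src="https://example.com/img/01.jpg" alt="demo">
'''

title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data, re.DOTALL)  # (.*?) spans the line break
pic = re.findall(r'<img\ssrc="(.*?)".*?>', data)                           # findall returns the captured groups
print(title[0].strip(), pic[0])

Because re.findall returns only the captured groups, the script below always indexes the results with [0].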
#-*-coding:utf-8 -*-
import re
import requests
from multiprocessing import Pool
import time
import os
def get_Pic(url):
    print(url)
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    # the three patterns below are tied to the target page's markup; adjust them to the actual HTML
    title = re.findall(r'<h2 class="main-title">(.*?)</h2>', data)
    Pic_url = re.findall(r'<img\ssrc="(.*?)".*?>', data, re.DOTALL)
    max = re.findall(r'<span>…</span>.*?<span>(.*?)</span>', data, re.DOTALL)  # last page number sits after the "…" item
    # create the storage folder
    path = "./Pic/" + title[0]
    if os.path.exists(path):
        print("Images will be stored in: " + path)
        pass
    else:
        print("Created storage folder " + path)
        os.makedirs(path)
    #############
    # start downloading the images
    for i in range(1, int(max[0]) + 1):
        if i < 10:
            i_str = "0" + str(i)
        else:
            i_str = str(i)
        pic_url = Pic_url[0][:-6] + i_str + ".jpg"  # the file names only differ in the two-digit index
        print("Downloading " + pic_url)
        try:
            response = requests.get(pic_url, headers=header)
            store_path = path + "/" + i_str + ".jpg"
            with open(store_path, "wb+") as fp:
                fp.write(response.content)
            time.sleep(0.5)
        except:
            print(pic_url + " failed, moving on to the next image")
            pass
    return
def get_Url(url):
    header = {
        "Referer": url,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    data = response.content.decode()
    all_url = re.findall(r"https://www.xxxxxx.com/\d{4,6}", data)
    return list(set(all_url))  # deduplicate before returning
def get_wight():
    print("Range example: 4-10 crawls page four through page ten.")
    in_ = input("Enter the page range to crawl (too many pages may stall the service, 10 pages at most): ")
    wight = re.findall(r".*(\d{1,2}).(\d{1,2}).*", in_, re.DOTALL)
    if wight == []:
        print("Invalid page range!")
        exit(0)
    else:
        (start, end) = wight[0]
        start = int(start)
        end = int(end)
        if start <= 0 or start > end:
            print("Please enter the range again.")
            exit(0)
        elif end > 230:
            print("The end of the range is beyond the last page.")
            exit(0)
        elif end - start > 10:
            print("The range is too wide, please enter it again.")
            exit(0)
    return (start, end)
def main():
    (start, end) = get_wight()
    urls = []
    for i in range(start, end + 1):
        i_str = str(i)
        url = "https://www.xxxxx.com/page/%s/" % i_str
        # print(url)
        url_list = get_Url(url)
        time.sleep(1)  # sleep one second so the server is not hit too fast
        urls.append(url_list)
    pool = Pool(15)  # create the process pool
    for url_list in urls:
        for url in url_list:
            next_one = pool.apply_async(get_Pic, args=(url,))
            time.sleep(0.5)
    next_one.wait()
    print("Waiting for all child processes to finish")
    pool.close()
    pool.join()
    print("All images downloaded")


if __name__ == '__main__':
    main()
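What speeds this version up is the multiprocessing.Pool: apply_async hands each gallery URL to one of the 15 worker processes and returns immediately, so waiting on the last AsyncResult alone is not enough; it is pool.close() followed by pool.join() that actually blocks until every download has finished. A stripped-down sketch of that submit/close/join pattern (work() is a stand-in for get_Pic, not the real function):

from multiprocessing import Pool
import time

def work(url):  # stand-in for get_Pic
    time.sleep(0.2)
    print("done:", url)

if __name__ == '__main__':
    pool = Pool(4)                  # four worker processes
    for i in range(10):
        pool.apply_async(work, args=("https://example.com/%d" % i,))
    pool.close()                    # no more tasks will be submitted
    pool.join()                     # block until every worker has finished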
end:
I had seen plenty of people crawl this site before, and having just learned some crawling I gave it a try myself. I hit a few pitfalls along the way and learned a lot from them. Onward!