欧美三区_成人在线免费观看视频_欧美极品少妇xxxxⅹ免费视频_a级毛片免费播放_鲁一鲁中文字幕久久_亚洲一级特黄

Python爬取讀者并制作成PDF

系統 1668 0

學了下BeautifulSoup后,做了個網絡爬蟲,爬取《讀者》雜志并用reportlab制作成PDF。

crawler.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
??? Author:???????? Anemone
??? Filename:?????? getmain.py
??? Last modified:? 2015-02-19 16:47
??? E-mail:???????? anemone@82flex.com
"""
# Python 2 only: urllib2 was merged into urllib.request/urllib.error in Py3.
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# HACK: force the interpreter-wide default encoding to UTF-8 so implicit
# str<->unicode conversions of the Chinese page text don't raise
# UnicodeDecodeError. reload() is required because site.py deletes
# sys.setdefaultencoding at startup. This is a well-known Python 2
# anti-pattern; explicit .encode()/.decode() at I/O boundaries is preferred.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Fetch one article page from 52duzhe.com and extract its fields.

    Parameters:
        url: absolute URL of a single article page.

    Returns a dict with keys:
        "title"   -- article headline (text of the page's first <h1>)
        "writer"  -- text of the element with id "pub_date", stripped
        "from"    -- text of the element with id "media_name", stripped
        "context" -- article body: the page text that sits between the
                     first and second inline "BAIDU_CLB...;" script blobs

    Raises urllib2.URLError on network failure, and AttributeError or
    IndexError when the page does not have the expected layout.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Bug fix: the original never closed the response, leaking the
        # HTTP connection on every article fetched.
        response.close()
    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()
    # The visible article body is sandwiched between two "BAIDU_CLB..."
    # share-widget scripts; splitting on them isolates it as main[1].
    text = soup.get_text()
    main = re.split("BAIDU_CLB.*;", text)
    return {"title": title, "writer": writer,
            "from": _from, "context": main[1]}
def getCatalog(issue):
    """Scrape the table of contents for one issue of Duzhe magazine.

    Parameters:
        issue: issue id string such as "201424" (YYYY + 2-digit number).

    Returns a dict mapping column heading -> list of article dicts
    (see getEachArticle for the article dict layout). Prints each
    heading and article URL as it goes, for progress feedback.
    """
    # Bug fix: the original URL literal began with a stray space
    # (" http://..."), which urllib2 rejects as an unknown URL type.
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    # (The original also assigned firstUrl twice; the first value was
    # dead code and has been removed.)
    firstUrl = url + "index.html"
    duzhe = dict()
    response = urllib2.urlopen(firstUrl)
    html = response.read()
    soup = BeautifulSoup(html)
    # The index page's first table link points at the real contents page.
    firstUrl = url + soup.table.a.get("href")
    response = urllib2.urlopen(firstUrl)
    html = response.read()
    soup = BeautifulSoup(html)
    for heading in soup.find_all("h2"):
        print(heading.string)
        duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)
            # Bug fix: the original retried forever on a bare except,
            # hanging on any permanently broken page and swallowing even
            # KeyboardInterrupt. Retry a bounded number of times and
            # skip the article if it never succeeds.
            article = None
            for _ in range(5):
                try:
                    article = getEachArticle(href)
                    break
                except Exception:
                    continue
            if article is not None:
                duzhe[heading.string].append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in the scraped issue, column by column."""
    for articles in duzhe.values():
        for article in articles:
            print(article["title"])
if __name__ == '__main__':
    # Interactive variant, currently disabled:
    # issue = raw_input("issue(201501):")
    readDuZhe(getCatalog("201424"))

getpdf.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
??? Author:???????? Anemone
??? Filename:?????? writetopdf.py
??? Last modified:? 2015-02-20 19:19
??? E-mail:???????? anemone@82flex.com
"""
#coding=utf-8
# reportlab supplies the PDF machinery: font registration
# (pdfmetrics/TTFont), bold/italic variant mapping (fonts.addMapping),
# and the platypus document builder (Paragraph/SimpleDocTemplate/flowables).
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
# Local scraper module (crawler.py, shown earlier in this article).
import crawler
def _makeStyle(fontSize, leading, firstLineIndent=0):
    """Build a 'song'-font paragraph style cloned from the sample Normal style."""
    style = copy.deepcopy(getSampleStyleSheet()['Normal'])
    style.fontName = 'song'
    style.fontSize = fontSize
    style.leading = leading
    style.firstLineIndent = firstLineIndent
    return style

def writePDF(issue, duzhe):
    """Render one scraped issue of Duzhe as "duzhe<issue>.pdf".

    Parameters:
        issue: issue id string such as "201501"; used on the title page
               and in the output file name.
        duzhe: mapping of column heading -> list of article dicts with
               keys "title", "writer", "from", "context" (see crawler).

    Side effects: registers the SimSun / Microsoft YaHei TrueType fonts
    (simsun.ttc and msyh.ttc must be in the working directory) and
    writes the PDF to the current directory.
    """
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    # Map bold variants of 'song' onto the 'hei' face so <b> markup
    # renders in a visually distinct font.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')
    # The original built each of these four styles with an identical
    # copy-paste block; the helper removes the duplication.
    normalStyle = _makeStyle(11, 11, firstLineIndent=20)
    titleStyle = _makeStyle(15, 20)
    firstTitleStyle = _makeStyle(20, 20, firstLineIndent=50)
    smallStyle = _makeStyle(8, 8)
    story = []
    # Title page, then a table of contents grouped by column.
    story.append(Paragraph(" 讀者{0}期 ".format(issue), firstTitleStyle))
    for eachColumn in duzhe:
        story.append(Paragraph('__'*28, titleStyle))
        story.append(Paragraph(' {0} '.format(eachColumn), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(eachArticle["title"], normalStyle))
    story.append(flowables.PageBreak())
    # One section per article, each ending with a page break.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(" {0} ".format(eachArticle["title"]), titleStyle))
            story.append(Paragraph(" {0}? {1}".format(eachArticle["writer"], eachArticle["from"]), smallStyle))
            para = eachArticle["context"].split("  ")
            for eachPara in para:
                story.append(Paragraph(eachPara, normalStyle))
            story.append(flowables.PageBreak())
    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print("Writing PDF...")
    doc.build(story)
def main(issue):
    """Crawl the given issue and write it out as a PDF."""
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    main(raw_input("Enter issue(201501):"))

以上就是本文的全部內容了,希望大家能夠喜歡。


更多文章、技術交流、商務合作、聯系博主

微信掃碼或搜索:z360901061

微信掃一掃加我為好友

QQ號聯系: 360901061

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。

【本文對您有幫助就好】

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描上面二維碼支持博主2元、5元、10元、自定義金額等您想捐的金額吧,站長會非常 感謝您的哦!!!

發表我的評論
最新評論 總共0條評論
主站蜘蛛池模板: 日本视频网址 | 在线不卡视频 | 国内精品视频区在线2021 | 91福利在线观看 | 日韩视频观看 | 奇米第四色网站 | 日韩伦理免费在线观看 | 色亚洲色图 | 久久亚洲精品中文字幕二区 | 国产精品久久久久久52AVAV | 色999精品| 成在线人免费视频 | 日本青草视频 | 91精品欧美久久久久久动漫 | 小视频你懂得 | 伊人2222 | 黄色影院在线看 | 日韩欧美在线播放 | 欧美人成在线视频 | 国产日韩亚洲不卡高清在线观看 | 不卡在线一区 | 一区二区三区免费在线观看 | 国产在线观看www鲁啊鲁免费 | 亚洲欧美日韩中文字幕在线不卡 | 人人夜| 欧美激情无码成人A片 | 亚洲欧美色欧另类欧 | 丁香激情五月 | 欧美另类视频一区二区三区 | 国产精品一区久久久 | 久久九九国产精品 | 国产成人在线视频 | 操白浆 | 欧美亚洲理伦电影毛片在线播放 | 99久久久久久| 久久精品小视频 | 久久国产成人 | 天堂在线www网亚洲 欧美 日韩 | 欧美成人h版在线观看 | 色聚网久久综合 | 亚洲国产综合精品中文第一区 |