Python爬取讀者并制作成PDF

Python爬取讀者并制作成PDF

系統 1668 0

學了下beautifulsoup后,做了個網絡爬蟲,爬取讀者雜志并用reportlab制作成pdf。

crawler.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
    Crawl articles of "Duzhe" (Reader) magazine from www.52duzhe.com.

    Author:         Anemone
    Filename:       getmain.py
    Last modified:  2015-02-19 16:47
    E-mail:         anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# NOTE(review): reload()+setdefaultencoding() is a Python 2 hack that forces
# implicit str<->unicode conversions to use UTF-8 instead of ASCII. It hides
# encoding bugs rather than fixing them and does not exist in Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Fetch one article page and extract its metadata and body text.

    Args:
        url: absolute URL of a single article page.

    Returns:
        dict with keys "title", "writer", "from" and "context" (body text).

    Raises:
        ValueError: if the page does not contain the expected BAIDU_CLB
            script markers that delimit the article body.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers; close explicitly.
        response.close()
    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()
    # The article body sits between two "BAIDU_CLB...;" ad-script snippets
    # in the page text; splitting on them isolates the middle part.
    parts = re.split("BAIDU_CLB.*;", soup.get_text())
    if len(parts) < 2:
        raise ValueError("unexpected page layout (no BAIDU_CLB markers): " + url)
    return {"title": title, "writer": writer, "from": _from,
            "context": parts[1]}
def getCatalog(issue):
??? url=" http://www.52duzhe.com/"+issue[:4]+"_"+issue[-2:]+"/"
??? firstUrl=url+"duzh"+issue+"01.html"
??? firstUrl=url+"index.html"
??? duzhe=dict()
??? response = urllib2.urlopen(firstUrl)
??? html = response.read()
??? soup=BeautifulSoup(html)
??? firstUrl=url+soup.table.a.get("href")
??? response = urllib2.urlopen(firstUrl)
??? html = response.read()
??? soup = BeautifulSoup(html)
??? all=soup.find_all("h2")
??? for i in all:
??????? print i.string
??????? duzhe[i.string]=list()
??????? for link in i.parent.find_all("a"):
??????????? href=url+link.get("href")
??????????? print href
??????????? while 1:
??????????????? try:
??????????????????? article=getEachArticle(href)
??????????????????? break
??????????????? except:
??????????????????? continue
??????????? duzhe[i.string].append(article)
??? return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in the catalog, column by column."""
    for articles in duzhe.values():
        for article in articles:
            print(article["title"])
if __name__ == '__main__':
    # Issue id is hard-coded; uncomment to prompt interactively instead:
    # issue = raw_input("issue(201501):")
    readDuZhe(getCatalog("201424"))

getpdf.py

復制代碼 代碼如下:

#!/usr/bin/env python
#coding=utf-8
"""
    Render the crawled "Duzhe" catalog into a PDF using reportlab.

    Author:         Anemone
    Filename:       writetopdf.py
    Last modified:  2015-02-20 19:19
    E-mail:         anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def writePDF(issue,duzhe):
??? reportlab.rl_config.warnOnMissingFontGlyphs = 0
??? pdfmetrics.registerFont(TTFont('song',"simsun.ttc"))
??? pdfmetrics.registerFont(TTFont('hei',"msyh.ttc"))
??? fonts.addMapping('song', 0, 0, 'song')
??? fonts.addMapping('song', 0, 1, 'song')
??? fonts.addMapping('song', 1, 0, 'hei')
??? fonts.addMapping('song', 1, 1, 'hei')
??? stylesheet=getSampleStyleSheet()
??? normalStyle = copy.deepcopy(stylesheet['Normal'])
??? normalStyle.fontName ='song'
??? normalStyle.fontSize = 11
??? normalStyle.leading = 11
??? normalStyle.firstLineIndent = 20
??? titleStyle = copy.deepcopy(stylesheet['Normal'])
??? titleStyle.fontName ='song'
??? titleStyle.fontSize = 15
??? titleStyle.leading = 20
??? firstTitleStyle = copy.deepcopy(stylesheet['Normal'])
??? firstTitleStyle.fontName ='song'
??? firstTitleStyle.fontSize = 20
??? firstTitleStyle.leading = 20
??? firstTitleStyle.firstLineIndent = 50
??? smallStyle = copy.deepcopy(stylesheet['Normal'])
??? smallStyle.fontName ='song'
??? smallStyle.fontSize = 8
??? smallStyle.leading = 8
??? story = []
??? story.append(Paragraph(" 讀者{0}期 ".format(issue), firstTitleStyle))
??? for eachColumn in duzhe:
??????? story.append(Paragraph('__'*28, titleStyle))
??????? story.append(Paragraph(' {0} '.format(eachColumn), titleStyle))
??????? for eachArticle in duzhe[eachColumn]:
??????????? story.append(Paragraph(eachArticle["title"],normalStyle))
??? story.append(flowables.PageBreak())
??? for eachColumn in duzhe:
??????? for eachArticle in duzhe[eachColumn]:
??????????? story.append(Paragraph(" {0} ".format(eachArticle["title"]),titleStyle))
??????????? story.append(Paragraph(" {0}? {1}".format(eachArticle["writer"],eachArticle["from"]),smallStyle))
??????????? para=eachArticle["context"].split("  ")
??????????? for eachPara in para:
??????????????? story.append(Paragraph(eachPara,normalStyle))
??????????? story.append(flowables.PageBreak())
??? #story.append(Paragraph("context",normalStyle))
??? doc = SimpleDocTemplate("duzhe"+issue+".pdf")
??? print "Writing PDF..."
??? doc.build(story)
def main(issue):
    """Crawl the given issue and render it as a PDF."""
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    # Prompt for an issue id such as "201501" (year + issue number).
    main(raw_input("Enter issue(201501):"))

以上就是本文的全部內容了,希望大家能夠喜歡。


更多文章、技術交流、商務合作、聯系博主

微信掃碼或搜索:z360901061

微信掃一掃加我為好友

QQ號聯系: 360901061

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。

【本文對您有幫助就好】

您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描上面二維碼支持博主2元、5元、10元、自定義金額等您想捐的金額吧,站長會非常 感謝您的哦!!!

發表我的評論
最新評論 總共0條評論
主站蜘蛛池模板: 久久视频精品 | ak福利视频| 欧美一区二区在线观看 | 中文字幕第一页在线 | 久久精品久久久久久 | 欧美日韩性生活 | www.国产视频 | 午夜网 | 成人午夜在线观看 | 国产高清在线精品一区二区三区 | 我要看欧美一级毛片 | 日日干狠狠干 | 黑粗硬大欧美 | 91网站在线观看视频 | 国产欧美日韩精品一区 | 久久精品久久精品国产大片 | 古装三级在线观看 | 亚洲欧美成人中文在线网站 | 国产一区二区丁香婷婷 | 青娱乐在线免费观看视频 | 国产精品一区二区三 | 91精品久久久久久久久网影视 | 亚洲免费网站 | 欧美三极 | 国产精品99久久久久 | 精品日韩欧美国产一区二区 | 奇米影视色 | 亚洲 无码 自拍 欧美 小说 | 香蕉视频在线观看免费 | aaaaaaa片毛片免费观看 | 久久久久国产成人精品亚洲午夜 | 国产一区免费 | 青娱分类视频精品免费2 | 久久久99精品免费观看 | 久久久久国 | 狠狠色丁香婷婷综合久久片 | 欧美一区黄 | 国产精品香蕉一区二区三区 | 六月丁香婷婷天天在线 | 免费看一区二区三区 | 日韩一级视频 |