欧美激情一区二区,亚洲精品电影在线观看,a网站免费

我每個月都要讀一本書、寫一個書評，平時寫書評用的插圖都是在網上找的圖，前段時間覺得這樣不夠炫酷，要做一點炫酷的東西。最開始的想法是提取小說中的高頻詞做成詞云，實踐下來發現效果并不理想，主要是有吸引力的詞匯太少，并不能突出這本書的特點；于是想到用爬蟲爬取評論來提取關鍵詞，試驗下來發現效果不錯。
有了思路接下來要看怎么實現了，由于我本人是寫java語言的，而java語言上并沒有很好的詞云工具，于是自然想到了python。python我并不是很熟悉，完成這個小程序也遇到了不少坑，這里把代碼貼一下，如果大家遇到類似的問題可以借鑒一下這個思路。
效果如下:

技術點

python基本語法
網絡爬蟲
線程池
生成圖片
wordcloud詞云
中文分詞

代碼

            
              #import的包大家需要自己安裝一下，安裝方式非常簡單,pip install xxx就可以了
import json
import re
from urllib import parse

import jieba
import matplotlib
import numpy as np
import requests
from bs4 import BeautifulSoup as bf
from wordcloud import WordCloud,STOPWORDS
from PIL import Image, ImageDraw, ImageFont
import os
import concurrent.futures


matplotlib.use('agg')
import matplotlib.pyplot as plt



# --- What to crawl ---------------------------------------------------------
# Title of the book whose reviews are crawled.
bookName = "悲劇人偶"
# Author filter used when matching search results; the empty string is a
# substring of every author name, so it matches any author.
author = ""

# --- Douban URL templates ---------------------------------------------------
# Title-suggestion endpoint; %s receives the URL-quoted book title.
bookSearchUrl = "https://book.douban.com/j/subject_suggest?q=%s"
# One page of the review listing; filled with (book URL, start offset).
commentUrl = "%s/reviews?start=%s"
# Full body of a single review; %s receives the review id.
FULLCOMMENTURL = "https://book.douban.com/j/review/%s/full"

# --- Fonts ------------------------------------------------------------------
# Font used for the words inside the cloud.  It must contain CJK glyphs,
# otherwise Chinese text cannot be displayed.
commentFont = "/System/Library/fonts/PingFang.ttc"
# Font used to draw the book title that gives the cloud its outline.
profileFont = "/Users/daiwenkai/Library/Fonts/RuiZiYunZiKuPangTouYuTiGBK-1.ttf"

# --- Shared thread pool -----------------------------------------------------
# NOTE(review): nothing in the visible script submits work to this pool; the
# crawl steps create their own executors.
requestPool = concurrent.futures.ThreadPoolExecutor(max_workers=5)


def crawlCommentDetail(commentId):
    """Fetch the full text of one review and return it without HTML tags.

    Args:
        commentId: Review id substituted into ``FULLCOMMENTURL``.

    Returns:
        The ``html`` field of the JSON response with tags stripped by
        ``striphtml``, or an empty string when anything in the fetch/parse
        chain fails.
    """
    try:
        fullComment = requests.get(FULLCOMMENTURL % commentId,cookies=cookies,headers=headers)
        fullCommentJson = json.loads(fullComment.text)
        return striphtml(fullCommentJson['html'])
    except Exception as exc:
        # Best-effort worker: one unreadable review must not abort the crawl.
        # Returning '' here (instead of falling through to a variable that was
        # never assigned) lets callers concatenate the result unconditionally.
        print('crawlCommentDetail there is something worong {}'.format(exc))
        return ''


def crawlCommentInfo(i):
    """Collect the review ids found on one page of the review listing.

    Args:
        i: Value placed in the ``start`` query parameter of ``commentUrl``.

    Returns:
        A list holding the ``data-cid`` attribute of every ``div`` on the
        page that carries that attribute.
    """
    pageUrl = commentUrl % (bookUrl, i)
    response = requests.get(pageUrl,cookies=cookies,headers=headers)
    soup = bf(response.text, "html.parser")
    # Only divs that actually have a data-cid attribute are selected.
    return [node["data-cid"] for node in soup.find_all("div", {"data-cid": True})]




def striphtml(data):
    """Return *data* with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not match newlines, so a span
    only counts as a tag when ``<`` and the next ``>`` are on the same line.
    """
    return re.sub(r'<.*?>', '', data)






# Fill the search URL template with the URL-quoted book title.
bookSearchUrl = bookSearchUrl % parse.quote(bookName)

# Douban may challenge anonymous requests; put your own browser cookies and
# headers here to get past that check.  Both names have to exist because every
# request below, and both crawl helpers, pass them to requests.get.
cookies = {}
headers = {}

# Search by title and remember the detail-page URL of the first result whose
# title and author both match.
bookNameList = requests.get(bookSearchUrl,cookies=cookies,headers=headers)
bookNameListJson = json.loads(bookNameList.text)
bookUrl = ''
for bookInfo in bookNameListJson:
    if bookName.lower() in bookInfo['title'].lower() and author.lower() in bookInfo['author_name'].lower():
        # Take the URL from the entry that matched, not from result 0, so the
        # URL and the title stored below always belong to the same book.
        bookUrl = bookInfo['url']
        bookName = bookInfo['title'].lower()
        break

if bookUrl:
    print('獲取書籍相關信息成功!')
else:
    print('未搜索到相關書籍')
    # Non-zero status signals the failure; SystemExit also lets the
    # interpreter run its normal cleanup, unlike os._exit.
    raise SystemExit(1)

# Open the book's detail page, mainly to find the link to its reviews.
bookinfoHtmlContent = requests.get(bookUrl,cookies=cookies,headers=headers)
bookinfoHtmlContentBf = bf(bookinfoHtmlContent.text, "html.parser")
commentUrlSuffix = bookinfoHtmlContentBf.find_all("p", class_="pl")
# The review link is the href of the <a> inside the first <p class="pl">.
try:
    commentUrlSuffix = commentUrlSuffix[0].a.get('href')
except (IndexError, AttributeError):
    # No such paragraph, or the paragraph contains no <a>.
    commentUrlSuffix = None
if not commentUrlSuffix:
    print('評論模塊出現故障!')
    raise SystemExit(1)

commentContent = requests.get(bookUrl + "/" + commentUrlSuffix,cookies=cookies,headers=headers)
# A requests.Response is truthy only for a successful status code.
if commentContent:
    print('獲取書籍評論成功!')


commentContentBf = bf(commentContent.text, "html.parser")
# Read the total number of reviews from <span class="count">.
totalTag = commentContentBf.find("span",{"class":"count"})
totalDigits = re.findall(r"\d+", totalTag.get_text()) if totalTag is not None else []
if not totalDigits:
    # Without a count there is no way to know how many pages to request.
    print('評論模塊出現故障!')
    raise SystemExit(1)
total = int(totalDigits[0])
commentDetailIds = []

# Fetch the listing pages concurrently; each page is requested with a start
# offset that advances in steps of 20.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map every future to the start offset it was submitted with.
    commentId = {executor.submit(crawlCommentInfo, i): i for i in range(0, total, 20)}
    for future in concurrent.futures.as_completed(commentId):
        commentDetailIds.extend(future.result())



print('獲取書籍評論詳情完成!')


# Download every review body concurrently.  Pieces are gathered in a list and
# joined once instead of growing a string with += inside the loop.
contentParts = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map every future to the review id it was submitted with.
    fullCommentContent = {executor.submit(crawlCommentDetail, commentId): commentId for commentId in commentDetailIds}
    for future in concurrent.futures.as_completed(fullCommentContent):
        try:
            contentParts.append(future.result())
        except Exception as exc:
            # Report the actual error; a failed review is skipped.
            print('there is something worong {}'.format(exc))
allContent = ''.join(contentParts)


# Draw the book title in black on a white canvas; this picture becomes the
# outline (mask) of the word cloud.
# Canvas width is 400 px per character of the title, height is 600 px.
img = Image.new('RGB', (400*len(bookName), 600), color=(255,255,255))
# Outline font at size 400, taken from the profileFont setting rather than a
# second hard-coded copy of the same path.
fnt = ImageFont.truetype(profileFont, 400)
d = ImageDraw.Draw(img)
# Text is drawn starting at x=0, y=100.
d.text((0, 100), bookName, font=fnt, fill=(0, 0, 0))


# Segment the review text with jieba and separate the tokens with spaces so
# that WordCloud can split them.
str_list = jieba.cut(allContent, HMM=True)
outstr = ' '.join(str_list)
mask = np.array(img)

# Optional extra stop words, one per line in stopwords.txt.  They filter out
# uninformative particles and conjunctions; the cloud still works without the
# file, so a missing file simply means no extra stop words.
try:
    with open('stopwords.txt', 'r', encoding='utf-8') as stopwordsFile:
        myStopWords = [line.strip() for line in stopwordsFile]
except FileNotFoundError:
    myStopWords = []
stopwords = set(STOPWORDS)
stopwords.add("nbsp")
stopwords |= set(myStopWords)

# NOTE(review): the wordcloud docs say width/height are ignored when a mask is
# supplied - confirm before relying on the 1000x800 values.
wordcloud = WordCloud(stopwords=stopwords,font_path=commentFont,background_color="white",max_words=len(bookName) * 120,mask=mask,contour_width=1, contour_color='green',height=800,width=1000).generate_from_text(outstr)
# interpolation="bilinear" makes the rendered image look smoother.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# The PNG is named after the (lower-cased) matched book title.
wordcloud.to_file(bookName + ".png")

后續我還會持續對這個小程序進行改進，最終目標是把這個應用做成一個小程序來讓大家使用

更多文章、技術交流、商務合作、聯系博主

微信掃碼或搜索：z360901061

微信掃一掃加我為好友

QQ號聯系： 360901061

您的支持是博主寫作最大的動力，如果您喜歡我的文章，感覺我的文章對您有幫助，請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧，狠狠點擊下面給點支持吧，站長非常感激您！手機微信長按不能支付解決辦法：請將微信支付二維碼保存到相冊，切換到微信，然后點擊微信右上角掃一掃功能，選擇支付二維碼完成支付。

【本文對您有幫助就好】元

2元

5元

10元

20元

自定義