豆瓣爬虫1

豆瓣热门电影评论爬虫

环境

  1. Python 3.6
  2. Chrome
  3. 分词库:jieba 词云:wordcloud 解析DOM:beautifulsoup4 绘图:matplotlib 数据处理:pandas, numpy
  4. 以上模块均可通过 pip install ** 安装
  5. 学会Google, Baidu, 看博客

    开始吧^_^

    网页分析

    分析网页

直接上代码,注释很详细

# coding:utf-8
__author__ = 'buting'
from urllib import request
from http import cookiejar
from urllib import parse
import requests
from bs4 import BeautifulSoup as bs
import re
import jieba
import pandas as pd
import numpy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
import time
def getMovieIdList():
    """Fetch the movies currently playing in Wuhan from Douban.

    Returns:
        list[dict]: one dict per movie with keys
        'id', 'score', 'director', 'actors', 'duration', 'name'.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
    }
    reqs = request.urlopen(request.Request('https://movie.douban.com/nowplaying/wuhan/', headers=head))
    html_data = reqs.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')  # parse the HTML with BeautifulSoup
    nowplaying = soup.find_all('div', id='nowplaying')  # the <div> that holds the movie list
    nowplaying = nowplaying[0].find_all('li', class_='list-item')  # one <li> per movie
    nowplaying_movie_id_name = []
    for movie in nowplaying:
        # Each <li> carries the movie metadata in data-* attributes;
        # build the dict directly instead of dict(zip(...)) over parallel tuples.
        movie_id_name = {
            'id': movie['data-subject'],
            'score': movie['data-score'],
            'director': movie['data-director'],
            'actors': movie['data-actors'],
            'duration': movie['data-duration'],
        }
        movie_id_name['name'] = movie.find('img')['alt']  # title comes from the poster's alt text
        nowplaying_movie_id_name.append(movie_id_name)
    return nowplaying_movie_id_name
def getComments(id, pageNum):
    """Fetch one page (20 items) of short comments for a Douban movie.

    Args:
        id: Douban subject id of the movie (string).
        pageNum: 1-based page number; any non-int or non-positive value
            falls back to the first page.

    Returns:
        str: all comment texts on that page concatenated together.
    """
    start = 0
    # isinstance() instead of the type(pageNum) == type(7) anti-pattern.
    if isinstance(pageNum, int) and pageNum > 0:
        start = (pageNum - 1) * 20
    url = 'https://movie.douban.com/subject/' + id + '/comments' + '?start=' + str(start) + '&limit=20'
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    }
    reqs = request.urlopen(request.Request(url, headers=head))
    html_data = reqs.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comments = ''
    for div in soup.find_all('div', class_='comment'):
        # Hoist the <p> lookup so it is done once per comment, not twice;
        # .string is None when the <p> has nested markup, so skip those.
        text = div.find('p').string
        if text is not None:
            comments += text.strip()
    return comments
def spider():
    """End-to-end pipeline: scrape comments for the first now-playing movie,
    keep only Chinese text, segment it, drop stop words, and render/save a
    word-cloud image named after the movie.
    """
    # Collect up to 10 pages (200 comments) for the first movie in the list.
    movie_info_dict_list = getMovieIdList()
    comments = ''
    for i in range(10):
        if i % 8 == 0:
            time.sleep(0.2)  # brief pause so we do not hammer the server
        comments += getComments(movie_info_dict_list[0]['id'], i + 1)
    # Keep only Chinese characters (CJK Unified Ideographs, U+4E00..U+9FA5).
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    characters_list = re.findall(pattern, comments)
    ch_comments = ''.join(characters_list)
    # Chinese word segmentation with jieba.
    ch_comments_list = jieba.lcut(ch_comments)
    # One token per row, so pandas can filter and count.
    ch_df = pd.DataFrame({'segment': ch_comments_list})
    # Stop-word table: one word per line, tab-separated, UTF-8, no quoting.
    stop_ch = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopch'], encoding='utf-8')
    # Remove stop words.
    ch_df = ch_df[~ch_df.segment.isin(stop_ch.stopch)]
    # Word-frequency table. groupby(...).size() replaces the original
    # .agg({"计数": numpy.size}) dict-renaming form, which was deprecated in
    # pandas 0.20 and removed in pandas 1.0; the output column name stays "计数".
    ch_fre_df = ch_df.groupby('segment').size().reset_index(name="计数")
    ch_fre_df = ch_fre_df.sort_values(by=["计数"], ascending=False)
    # Build the word cloud from the 1000 most frequent words.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="#ffffff",
                          width=1200, height=800, min_font_size=20, max_words=2000, max_font_size=100)
    ch_frequence = {x[0]: x[1] for x in ch_fre_df.head(1000).values}
    wordcloud = wordcloud.fit_words(ch_frequence)
    # Render, save next to the script (named after the movie), then display.
    plt.imshow(wordcloud)
    plt.savefig('./' + movie_info_dict_list[0]['name'] + '_douban_wordcloud.png', dpi=800)
    plt.show()
# Script entry point: print the author's blog URL, then run the crawler.
if __name__ == '__main__':
    print('https://butingshiguang.github.io/')
    spider()

结果展示

时光不停 TeslaChan @buting
转载请注明出处!