系统环境:Python 3.5.2,WIN10 操作系统,使用 PyCharm 编辑器。使用方法介绍:目录下有两个文件,一个是 doubanMovie.py 文件,一个是 stopwords.txt 中文停用词文件;需要自行下载 simhei.ttf 字体文件,放到同一目录下。运行 doubanMovie.py 会生成电影《古墓丽影:源起之战》的词云图。爬取的过程分为三步:1. 抓取网页数据;2. 清理数据;3. 用词云进行展示。
代码下载地址见后面地址。
# Silence library deprecation noise so it does not clutter console output.
import warnings
warnings.filterwarnings('ignore')

# Standard library
import codecs  # codecs.open allows specifying the file encoding explicitly
import re      # regular expressions, used to keep only Chinese characters
from urllib import request

# Third-party
import jieba   # Chinese word segmentation
import numpy   # numerical helpers
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud  # word-cloud rendering

# %matplotlib inline  (Jupyter-only magic, kept as a comment for scripts)

# Default matplotlib figure size: (width, height) in inches.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# Page-scraping function
def getNowPlayingMovie_list():
    """Fetch the movies currently playing in Wuhan from douban.com.

    Returns:
        list[dict]: one dict per movie, with keys ``'id'`` (the douban
        subject id from the ``data-subject`` attribute) and ``'name'``
        (the title, taken from the poster image's ``alt`` text).
    """
    # Download and parse the "now playing" page.
    resp = request.urlopen('https://movie.douban.com/cinema/nowplaying/wuhan/')
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')

    # The listing lives in <div id="nowplaying"> as <li class="list-item"> items.
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

    nowplaying_list = []
    for item in nowplaying_movie_list:
        # Holds this movie's id and name.
        nowplaying_dict = {'id': item['data-subject']}
        # The title is stored in the alt attribute of the poster <img>.
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
40
41
# Comment-scraping function
def getCommentsByld(movield, pageNum):
    """Fetch one page of short comments for a douban movie.

    Args:
        movield: douban subject id, as a string.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        list[str]: the comment texts found on that page. For a
        non-positive ``pageNum`` an empty list is returned (the original
        returned ``False``, which is equally falsy but broke callers
        that expect a list).
    """
    eachCommentList = []
    if pageNum <= 0:
        # Invalid page number: keep the return type consistent (list).
        return eachCommentList
    start = (pageNum - 1) * 20

    requrl = 'https://movie.douban.com/subject/' + movield + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)

    resp = request.urlopen(requrl)
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')

    comment_div_list = soup.find_all('div', class_='comment')
    for item in comment_div_list:
        # Hoisted: the original called find_all('p') twice per item, and
        # would raise IndexError on a comment div with no <p> at all.
        paragraphs = item.find_all('p')
        if paragraphs and paragraphs[0].string is not None:
            eachCommentList.append(paragraphs[0].string)
    return eachCommentList
60
61
def main():
    """Scrape comments for the first now-playing movie and show a word cloud.

    Pipeline: download 10 pages of comments -> strip everything but Chinese
    characters -> segment with jieba -> drop stop words -> count word
    frequencies -> render the top words with WordCloud/matplotlib.
    """
    # Collect the first 10 pages of comments for the first listed movie.
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsByld(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # Join the per-page lists into one big string.
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # Keep only Chinese characters (CJK range U+4E00..U+9FA5).
    # BUG FIX: the original pattern r'[\\u4e00-\\u9fa5]+' had doubled
    # backslashes, so it matched literal backslash/u sequences instead of
    # Chinese characters and produced an empty result.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # Chinese word segmentation with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stop words. quoting=3 is csv.QUOTE_NONE: lines are taken verbatim.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word frequencies.
    # BUG FIX: agg({"计数": numpy.size}) renamed a column via a dict, which
    # raises SpecificationError on pandas >= 1.0; groupby().size() with
    # reset_index(name=...) is the supported equivalent.
    words_stat = words_df.groupby('segment').size().reset_index(name='计数')
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)

    # Render the most frequent 1000 words as a word cloud.
    # simhei.ttf is required so Chinese glyphs render instead of boxes.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
114
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
下载地址:https://github.com/YuSong759770423/DouBanMovies.git
閱讀更多 Python樂園 的文章