A Picture Is Worth a Thousand Words: Scrape Douban Movie Reviews and Generate a Word Cloud (Complete Project Source)

System environment: Python 3.5.2 on Windows 10, using the PyCharm editor.

Usage: the project directory contains two files, doubanMovie.py and stopwords.txt (a Chinese stopword list). You also need the simhei.ttf font file (searchable via Baidu); put it in the same directory, then run doubanMovie.py to generate a word cloud for the film Tomb Raider (古墓丽影:源起之战).

The scraping pipeline has three steps: 1. fetch the page data; 2. clean the data; 3. render the result as a word cloud.


The code download link is at the end of this post.


import warnings
warnings.filterwarnings('ignore')
import jieba                       # Chinese word segmentation
import numpy                       # numerical computing
import re                          # regular expressions
import pandas as pd
import matplotlib.pyplot as plt
from urllib import request
from bs4 import BeautifulSoup as bs
# %matplotlib inline  (only needed inside a Jupyter notebook)

import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud    # word-cloud rendering

# Parse the now-playing page and collect each movie's id and name
def getNowPlayingMovie_list():
    resp = request.urlopen('https://movie.douban.com/cinema/nowplaying/wuhan/')  # fetch Douban's now-playing page
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}                           # holds one movie's id and name
        nowplaying_dict['id'] = item['data-subject']   # the movie id sits in the data-subject attribute
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']  # the title is the poster's alt text
            nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
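# For reference, the returned structure looks like this (the id and title here
# are illustrative only -- actual values depend on what is showing when you run it):
#   [{'id': '26586599', 'name': '古墓丽影:源起之战'},
#    {'id': '26661191', 'name': '...'},
#    ...]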

# Fetch one page of short comments for a given movie id
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20   # Douban serves comments 20 per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'

    print(requrl)
    resp = request.urlopen(requrl)
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_list = soup.find_all('div', class_='comment')
    for item in comment_div_list:
        if item.find_all('p')[0].string is not None:   # skip comments whose <p> holds nested markup
            eachCommentList.append(item.find_all('p')[0].string)
    return eachCommentList
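# Paging is driven entirely by the start parameter; for example, page 3 gives
#   start = (3 - 1) * 20 = 40  ->  .../comments?start=40&limit=20
# Note: Douban sometimes rejects bare urllib requests or redirects to a login
# page; if that happens, sending a browser User-Agent usually helps, e.g.:
#   req = request.Request(requrl, headers={'User-Agent': 'Mozilla/5.0'})
#   resp = request.urlopen(req)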

def main():
    # Fetch the first 10 pages of comments for the first now-playing movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # Collapse the nested list of comments into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()
    # print(comments)
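    # str() on a list also embeds brackets, quotes and commas into the string;
    # that is harmless here only because the regex below keeps Chinese characters
    # exclusively. A cleaner flatten (an alternative, not the original code) would be:
    #   from itertools import chain
    #   comments = ''.join(chain.from_iterable(commentList))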

    # Use a regular expression to keep only Chinese characters (this also drops punctuation)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    # print(cleaned_comments)

    # Segment the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stopwords; quoting=3 (QUOTE_NONE) means quote characters are read literally
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # print(words_df.head())
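    # stopwords.txt is just a UTF-8 text file with one stopword per line
    # (e.g. 的 / 了 / 是); the exact contents depend on which list you download.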

    # Count word frequencies
    words_stat = words_df.groupby('segment')['segment'].agg(numpy.size).to_frame('计数')
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # Render the word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    # (older versions of the wordcloud package expected a list of (word, count)
    # tuples here instead of a dict)

    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()


# Entry point
if __name__ == '__main__':
    main()
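To keep the image on disk instead of only popping up a matplotlib window, the wordcloud package's to_file method can be called after fit_words. A minimal standalone sketch (the frequencies below are made up purely for illustration; in the real script the dict comes from the pandas counts built in main()):

from wordcloud import WordCloud

# Illustrative frequencies only -- the real script derives these from the scraped comments
word_frequence = {'剧情': 120, '特效': 95, '古墓丽影': 80, '女主': 60}

wc = WordCloud(font_path='simhei.ttf', background_color='white', max_font_size=80)
wc.fit_words(word_frequence)          # same call the script uses
wc.to_file('douban_wordcloud.png')    # writes a PNG next to the script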


Download: https://github.com/YuSong759770423/DouBanMovies.git




