import time

import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud


def get_text(url, page):
    """Fetch one chapter page and append its paragraph text to 红楼梦.txt."""
    r = requests.get(url)
    r.encoding = 'utf-8'
    bs = BeautifulSoup(r.text, 'html.parser')
    with open('红楼梦.txt', 'at', encoding='utf-8') as f:
        f.write('第' + str(page) + '章')  # chapter header written into the corpus
        for para in bs.find_all('p'):
            # Drop the surrounding <p> tags, keeping only the chapter text.
            ls = str(para).replace('<p>', '').replace('</p>', '')
            f.write(ls)
            f.write('\n')


def do_crawl():
    """Walk the 120 chapter pages of the novel and save them locally."""
    server = 'http://www.gudianmingzhu.com/guji/hongloumeng/'
    page = 1
    for index in range(11368, 11488):
        url = server + str(index) + '.html'
        print('Crawling ' + url)
        get_text(url, page)
        time.sleep(3)  # pause between requests to avoid hammering the server
        page += 1


def generate_wordcloud(text):
    """Render a word cloud from a space-separated string of words."""
    w = WordCloud(font_path='simhei.ttf', width=1000, height=500,
                  background_color='white')
    w.generate(text)
    w.to_file('wordcloud.jpg')


def split_words(filename):
    """Segment the novel with jieba, merge each character's aliases into one
    canonical name, drop common stopwords, and return the top-20 names."""
    excludes = {'什么', '一个', '我们', '你们', '如今', '说道', '知道', '起来',
                '这里', '奶奶', '姑娘', '出来', '众人', '那里', '自己', '他们',
                '一面', '只见', '怎么', '老太太', '两个', '没有', '不是', '不知',
                '这个', '听见', '这样', '进来', '咱们', '太太', '告诉', '就是',
                '东西', '回来', '只是', '大家', '只得', '丫头', '姐姐', '不用',
                '过来', '心里', '如此', '今日', '这些', '不敢', '出去', '所以',
                '不过', '的话', '不好', '一时', '不能', '银子', '几个', '答应',
                '二人', '还有', '只管', '这么', '说话', '一回', '那边', '这话',
                '外头', '打发', '自然', '今儿', '罢了', '屋里', '那些', '听说',
                '如何', '问道', '看见', '二爷', '小丫头', '人家', '妹妹', '老爷',
                '原来', '一声'}
    with open(filename, 'r', encoding='utf-8') as f:
        txt = f.read()
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        if len(word) == 1:
            continue  # single characters are mostly function words
        elif word in ('宝玉', '宝二爷'):
            right_char = '贾宝玉'
        elif word in ('凤姐', '凤辣子', '凤姐儿', '琏二奶奶', '凤丫头', '凤哥儿'):
            right_char = '王熙凤'
        elif word in ('老祖宗', '老太君'):
            right_char = '贾母'
        elif word in ('颦颦', '林姑娘', '黛玉', '林妹妹', '潇湘妃子', '林丫头'):
            right_char = '林黛玉'
        elif word in ('宝姑娘', '宝丫头', '蘅芜君', '宝姐姐', '宝钗'):
            right_char = '薛宝钗'
        elif word == '湘云':
            right_char = '史湘云'
        elif word == '存周':
            right_char = '贾政'
        elif word in ('花珍珠', '花大姑娘'):
            right_char = '袭人'
        else:
            right_char = word
        counts[right_char] = counts.get(right_char, 0) + 1
    for word in excludes:
        counts.pop(word, None)  # pop() avoids a KeyError when a stopword never appeared
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    stats = ''
    for i in range(20):
        character, count = items[i]
        # chr(12288) is the full-width space, so the columns align for CJK text.
        print('{0:{2}<5}{1:{2}>5}'.format(character, count, chr(12288)))
        stats = stats + ' ' + character
    return stats


if __name__ == '__main__':
    do_crawl()
    words_stat = split_words('红楼梦.txt')
    generate_wordcloud(words_stat)
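

# A bare requests.get() carries no browser-like User-Agent and no timeout, so
# some sites reject it and a stalled connection can hang the crawl forever.
# A minimal hardened fetch, assuming nothing beyond the requests API itself
# (the fetch() name, header value, and 10-second timeout are illustrative):
def fetch(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    r.encoding = 'utf-8'
    return r.text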
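

# jieba's default dictionary may split unfamiliar multi-character names (e.g.
# 潇湘妃子) into fragments that the alias-merging in split_words() then misses.
# jieba.add_word() registers extra entries before segmentation; a small
# optional sketch, where the name list is illustrative rather than exhaustive:
def register_character_names():
    for name in ('贾宝玉', '王熙凤', '林黛玉', '薛宝钗', '潇湘妃子', '琏二奶奶'):
        jieba.add_word(name)  # keep the full name as a single token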
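

# generate_wordcloud() receives the top-20 names joined by spaces, so each name
# occurs exactly once and every word renders at the same size. If the cloud
# should reflect how often each name actually appears, WordCloud also accepts a
# frequency dict via generate_from_frequencies(). A sketch under the assumption
# that the counts dict from split_words() is made available (the function name
# and output filename here are hypothetical):
def generate_wordcloud_weighted(counts):
    w = WordCloud(font_path='simhei.ttf', width=1000, height=500,
                  background_color='white')
    w.generate_from_frequencies(counts)  # word size proportional to count
    w.to_file('wordcloud_weighted.jpg')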