# Python executable file (92 lines, 3.5 KiB)
import time
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import jieba
|
|
from wordcloud import WordCloud
|
|
|
|
|
|
def get_text(url, page):
    """Download one chapter page and append its paragraphs to ``红楼梦.txt``.

    A "第N章" heading is written first, followed by the content of every
    ``<p>`` element on the page, one paragraph per line.

    Args:
        url: Address of the chapter's HTML page.
        page: Chapter number used in the heading written before the text.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one unresponsive request from stalling the whole crawl.
    r = requests.get(url, timeout=30)
    # Force UTF-8 decoding of the body instead of relying on header detection.
    r.encoding = 'utf-8'
    bs = BeautifulSoup(r.text, 'html.parser')
    paragraphs = bs.find_all('p')
    # Write UTF-8 explicitly: split_words() reads this file back as UTF-8,
    # and the platform default encoding is not guaranteed to match.
    with open('红楼梦.txt', 'at', encoding='utf-8') as f:
        f.write("第" + str(page) + "章")
        for para in paragraphs:
            # Only the bare <p>/</p> wrapper is stripped; any markup nested
            # inside the paragraph is written out verbatim.
            ls = str(para).replace('<p>', '').replace('</p>', '')
            f.write(ls)
            f.write('\n')
|
|
|
|
|
|
def do_crawl():
    """Fetch every chapter page in sequence and hand each one to get_text.

    Page ids run from 11368 up to (but not including) 11488; each page is
    paired with a chapter number that starts at 1.
    """
    base_url = 'http://www.gudianmingzhu.com/guji/hongloumeng/'
    for chapter_no, page_id in enumerate(range(11368, 11488), start=1):
        chapter_url = base_url + str(page_id) + '.html'
        print('正在爬取' + chapter_url)
        get_text(chapter_url, chapter_no)
        # Wait three seconds before issuing the next request.
        time.sleep(3)
|
|
|
|
|
|
def generate_wordcloud(text):
    """Render *text* as a word cloud image saved to ``wordcloud.jpg``.

    Args:
        text: The string passed to ``WordCloud.generate``.
    """
    cloud = WordCloud(
        font_path="simhei.ttf",
        width=1000,
        height=500,
        background_color="white",
    )
    cloud.generate(text)
    cloud.to_file("wordcloud.jpg")
|
|
|
|
|
|
def split_words(filename, top_n=20):
    """Count word frequencies in a text file and report the most frequent.

    The text is segmented with ``jieba.lcut``. Single-character tokens are
    skipped, known aliases are counted under one canonical name, and a fixed
    set of excluded words is dropped before ranking. The top entries are
    printed as an aligned table.

    Args:
        filename: Path of the UTF-8 encoded text file to analyse.
        top_n: Maximum number of entries to print and return (default 20).
            Fewer are reported when the text has fewer distinct words.

    Returns:
        The top words in descending order of frequency, concatenated into
        one string with a single space before each word.
    """
    excludes = {
        '什么', '一个', '我们', '你们', '如今', '说道', '知道', '起来', '这里', '奶奶',
        '姑娘', '出来', '众人', '那里', '自己', '他们', '一面', '只见', '怎么', '老太太',
        '两个', '没有', '不是', '不知', '这个', '听见', '这样', '进来', '咱们', '太太',
        '告诉', '就是', '东西', '回来', '只是', '大家', '只得', '丫头', '姐姐', '不用',
        '过来', '心里', '如此', '今日', '这些', '不敢', '出去', '所以', '不过', '的话',
        '不好', '一时', '不能', '银子', '几个', '答应', '二人', '还有', '只管', '这么',
        '说话', '一回', '那边', '这话', '外头', '打发', '自然', '今儿', '罢了', '屋里',
        '那些', '听说', '如何', '问道', '看见', '二爷', '小丫头', '人家', '妹妹', '老爷',
        '原来', '一声',
    }
    # Alias -> canonical name; every alias is tallied under its canonical key.
    aliases = {
        "宝玉": "贾宝玉", "宝二爷": "贾宝玉",
        "凤姐": "王熙凤", "凤辣子": "王熙凤", "凤姐儿": "王熙凤",
        "琏二奶奶": "王熙凤", "凤丫头": "王熙凤", "凤哥儿": "王熙凤",
        "老祖宗": "贾母", "老太君": "贾母",
        "颦颦": "林黛玉", "林姑娘": "林黛玉", "黛玉": "林黛玉",
        "林妹妹": "林黛玉", "潇湘妃子": "林黛玉", "林丫头": "林黛玉",
        "宝姑娘": "薛宝钗", "宝丫头": "薛宝钗", "蘅芜君": "薛宝钗",
        "宝姐姐": "薛宝钗", "宝钗": "薛宝钗",
        "湘云": "史湘云",
        "存周": "贾政",
        "花珍珠": "袭人", "花大姑娘": "袭人",
    }

    # The context manager closes the file even if reading fails.
    with open(filename, "r", encoding='utf-8') as f:
        txt = f.read()

    counts = {}
    for word in jieba.lcut(txt):
        if len(word) == 1:
            continue
        right_char = aliases.get(word, word)
        counts[right_char] = counts.get(right_char, 0) + 1

    for word in excludes:
        # pop() with a default: an excluded word may never occur in the text,
        # in which case a plain ``del`` would raise KeyError.
        counts.pop(word, None)

    # Stable sort: words with equal counts keep their first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing tolerates texts with fewer than top_n distinct words.
    top = items[:max(top_n, 0)]
    for character, count in top:
        # Pad with the full-width space (U+3000) so CJK columns line up.
        print("{0:{2}<5}{1:{2}>5}".format(character, count, chr(12288)))
    return "".join(' ' + character for character, _ in top)
|
|
|
|
|
|
if __name__ == '__main__':
    # Crawl all chapters into 红楼梦.txt, rank its words, then draw the
    # word cloud from the ranked result.
    do_crawl()
    generate_wordcloud(split_words("红楼梦.txt"))
|