import pandas as pd
import jieba
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
from wordcloud import WordCloud

# Use a font with CJK glyphs (macOS)
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS']
# On Windows, use SimHei instead:
# matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

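# Assumption (not from the original script): on Linux, neither font above is
# typically installed; a Noto CJK face is a common substitute if available:
# matplotlib.rcParams['font.sans-serif'] = ['Noto Sans CJK SC']
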
# Read the Excel file
df = pd.read_excel('智慧社区文本.xlsx')

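# Minimal sanity check (a sketch; it assumes, as the code below does, that the
# sheet has a '文本内容' column):
if '文本内容' not in df.columns:
    raise KeyError("expected a '文本内容' column in 智慧社区文本.xlsx")
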
# Load the stop word list, one word per line
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}

# Merge all text cells into a single string
# (astype(str) guards against non-string cells, which would break the join)
all_text = ' '.join(df['文本内容'].dropna().astype(str))

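# Optional: register multi-character domain terms so jieba keeps them whole
# (hypothetical examples, not from the original script; tune to your corpus):
# jieba.add_word('智慧社区')
# jieba.add_word('物业管理')
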
# Segment the Chinese text with jieba (cut() returns a one-shot generator)
words = jieba.cut(all_text)

# Drop stop words and single-character tokens, then count word frequencies
filtered_words = [word for word in words if word not in stop_words and len(word) > 1]
word_counts = Counter(filtered_words)

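# Optional quick look at the result before plotting:
# print(word_counts.most_common(10))
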
# Plot the 20 most common words
most_common_words = word_counts.most_common(20)
top_words, top_counts = zip(*most_common_words)  # new names avoid shadowing `words`

plt.figure(figsize=(10, 8))
plt.barh(top_words, top_counts)
plt.xlabel('词频')
plt.title('词频分析')
plt.gca().invert_yaxis()  # flip the y-axis so the most frequent word is on top
plt.savefig('词频分析.png')
plt.show()

# Build the word cloud from the frequency counts
# (font_path must point to a font file with CJK glyphs, reachable from the
# working directory; '苹方-简.ttf' is PingFang Simplified)
wordcloud = WordCloud(font_path='苹方-简.ttf', width=800, height=600,
                      background_color='white').generate_from_frequencies(word_counts)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes
plt.title('词云图')
plt.savefig('词云图.png')
plt.show()
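
# Alternative (not in the original): WordCloud can write its bitmap directly
# at the native 800x600 resolution, bypassing matplotlib:
# wordcloud.to_file('词云图.png')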