# python-archieve-projects/政策文本爬虫/3.analyze.py
# Word-frequency analysis and word cloud for scraped policy texts.
import pandas as pd
import jieba
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
from wordcloud import WordCloud
# --- Font configuration ---
# macOS: 'Arial Unicode MS' ships with the OS and covers CJK glyphs.
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS']
# Windows alternative (uncomment on Windows):
# matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with CJK fonts

# Load the scraped policy texts from the Excel workbook.
df = pd.read_excel('智慧社区文本.xlsx')

# Load the stop-word list, one word per line; a set gives O(1) membership tests.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}

# Concatenate every non-empty document into one string for segmentation.
all_text = ' '.join(df['文本内容'].dropna())

# Segment with jieba, drop stop words and single-character tokens
# (single characters are rarely meaningful terms in Chinese), then
# count term frequencies.
tokens = jieba.cut(all_text)
filtered_words = [w for w in tokens if w not in stop_words and len(w) > 1]
word_counts = Counter(filtered_words)

# --- Horizontal bar chart of the 20 most frequent terms ---
most_common_words = word_counts.most_common(20)
# NOTE: distinct names here — the original rebound `words` (the jieba
# generator) to this tuple, shadowing two unrelated values under one name.
top_words, top_counts = zip(*most_common_words)
plt.figure(figsize=(10, 8))
plt.barh(top_words, top_counts)
plt.xlabel('词频')
plt.title('词频分析')
plt.gca().invert_yaxis()  # put the highest-frequency bar at the top
plt.savefig('词频分析.png')
plt.show()

# --- Word cloud from the same frequency table ---
# font_path must point to a font file with CJK coverage or glyphs render as boxes.
wordcloud = WordCloud(font_path='苹方-简.ttf', width=800, height=600,
                      background_color='white').generate_from_frequencies(word_counts)
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # no axes on an image plot
plt.title('词云图')
plt.savefig('词云图.png')
plt.show()