194 lines
17 KiB
Plaintext
194 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from IPython.core.interactiveshell import InteractiveShell #执行该代码可以使得当前nb支持多输出\n",
|
||
"InteractiveShell.ast_node_interactivity = \"all\" \n",
|
||
"import numpy as np \n",
|
||
"import pandas as pd \n",
|
||
"import re"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"**作业**"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'dict_values'>\n",
|
||
"《唐诗三百首》总共有 313 首诗,其中 李白 有 29 首诗,占比为 0.0926517571884984 排名第 2\n",
|
||
"[[1, 2, 7, [3, 4, 5, 6]], [17, 22, 28, [23, 24, 25, 26, 27]], [1692, 3054, 3059, [3055, 3056, 3057, 3058]]]\n",
|
||
"双字词中,词频大于1次的包括: [('苍苍', 2), ('幽径', 2), ('美酒', 2), ('举杯', 2), ('明月', 4), ('行乐', 2), ('断肠', 3), ('春风', 4), ('长风', 3), ('万里', 5), ('不见', 5), ('高楼', 2), ('长安', 3), ('秋风', 2), ('门前', 2), ('低头', 2), ('不可', 4), ('猿声', 2), ('天上', 2), ('一生', 2), ('不能', 2), ('红颜', 2), ('黄鹤', 3), ('五岳', 2), ('名山', 2), ('庐山', 3), ('倒挂', 2), ('不到', 2), ('天长', 2), ('我心', 2), ('彩云', 2), ('天姥', 2), ('四万', 2), ('对此', 2), ('迷花', 2), ('日月', 2), ('纷纷', 2), ('如麻', 2), ('古来', 2), ('流水', 3), ('何时', 2), ('不得', 2), ('请君', 2), ('可以', 2), ('青天', 7), ('人生', 2), ('明朝', 2), ('蜀道', 3), ('之难', 3), ('难于', 3), ('上青', 3), ('茫然', 2), ('峨嵋', 2), ('下有', 2), ('长叹', 2), ('相思', 2), ('凤凰', 3), ('明镜', 2), ('十千', 2), ('黄河', 2), ('行路', 2), ('千金', 2), ('夫子', 2), ('故乡', 2), ('浮云', 2), ('挥手', 2), ('秋月', 2)]\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "OSError",
|
||
"evalue": "cannot open resource",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
|
||
"\u001B[0;31mOSError\u001B[0m Traceback (most recent call last)",
|
||
"Input \u001B[0;32mIn [62]\u001B[0m, in \u001B[0;36m<cell line: 80>\u001B[0;34m()\u001B[0m\n\u001B[1;32m 77\u001B[0m \u001B[38;5;28mprint\u001B[39m( \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m双字词中,词频大于1次的包括:\u001B[39m\u001B[38;5;124m'\u001B[39m,doubleword(jb))\n\u001B[1;32m 79\u001B[0m w \u001B[38;5;241m=\u001B[39m WordCloud(font_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mC:\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mWindows\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mFonts\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mSTFANGSO.ttf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m---> 80\u001B[0m my_wordcloud\u001B[38;5;241m=\u001B[39m\u001B[43mw\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m \u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mjoin\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjb\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m#生成词云\u001B[39;00m\n\u001B[1;32m 81\u001B[0m plt\u001B[38;5;241m.\u001B[39mimshow(my_wordcloud)\n\u001B[1;32m 82\u001B[0m plt\u001B[38;5;241m.\u001B[39maxis(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124moff\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
|
||
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:639\u001B[0m, in \u001B[0;36mWordCloud.generate\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 624\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgenerate\u001B[39m(\u001B[38;5;28mself\u001B[39m, text):\n\u001B[1;32m 625\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 626\u001B[0m \n\u001B[1;32m 627\u001B[0m \u001B[38;5;124;03m The input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 637\u001B[0m \u001B[38;5;124;03m self\u001B[39;00m\n\u001B[1;32m 638\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m--> 639\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_text\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\n",
|
||
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:621\u001B[0m, in \u001B[0;36mWordCloud.generate_from_text\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 604\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 605\u001B[0m \n\u001B[1;32m 606\u001B[0m \u001B[38;5;124;03mThe input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 618\u001B[0m \u001B[38;5;124;03mself\u001B[39;00m\n\u001B[1;32m 619\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 620\u001B[0m words \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mprocess_text(text)\n\u001B[0;32m--> 621\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 622\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\n",
|
||
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:453\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 451\u001B[0m font_size \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mheight\n\u001B[1;32m 452\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m--> 453\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mdict\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mfrequencies\u001B[49m\u001B[43m[\u001B[49m\u001B[43m:\u001B[49m\u001B[38;5;241;43m2\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 454\u001B[0m \u001B[43m \u001B[49m\u001B[43mmax_font_size\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mheight\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 455\u001B[0m \u001B[38;5;66;03m# find font sizes\u001B[39;00m\n\u001B[1;32m 456\u001B[0m sizes \u001B[38;5;241m=\u001B[39m [x[\u001B[38;5;241m1\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlayout_]\n",
|
||
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:503\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 500\u001B[0m tried_other_orientation \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m\n\u001B[1;32m 501\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[1;32m 502\u001B[0m \u001B[38;5;66;03m# try to find a position\u001B[39;00m\n\u001B[0;32m--> 503\u001B[0m font \u001B[38;5;241m=\u001B[39m \u001B[43mImageFont\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtruetype\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfont_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfont_size\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 504\u001B[0m \u001B[38;5;66;03m# transpose font optionally\u001B[39;00m\n\u001B[1;32m 505\u001B[0m transposed_font \u001B[38;5;241m=\u001B[39m ImageFont\u001B[38;5;241m.\u001B[39mTransposedFont(\n\u001B[1;32m 506\u001B[0m font, orientation\u001B[38;5;241m=\u001B[39morientation)\n",
|
||
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:844\u001B[0m, in \u001B[0;36mtruetype\u001B[0;34m(font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m FreeTypeFont(font, size, index, encoding, layout_engine)\n\u001B[1;32m 843\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 844\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfreetype\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 845\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m:\n\u001B[1;32m 846\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m isPath(font):\n",
|
||
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:841\u001B[0m, in \u001B[0;36mtruetype.<locals>.freetype\u001B[0;34m(font)\u001B[0m\n\u001B[1;32m 840\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfreetype\u001B[39m(font):\n\u001B[0;32m--> 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mFreeTypeFont\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[43m)\u001B[49m\n",
|
||
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:193\u001B[0m, in \u001B[0;36mFreeTypeFont.__init__\u001B[0;34m(self, font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 191\u001B[0m load_from_bytes(f)\n\u001B[1;32m 192\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[0;32m--> 193\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfont \u001B[38;5;241m=\u001B[39m \u001B[43mcore\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetfont\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlayout_engine\u001B[49m\n\u001B[1;32m 195\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 196\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 197\u001B[0m load_from_bytes(font)\n",
|
||
"\u001B[0;31mOSError\u001B[0m: cannot open resource"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import re\n",
|
||
"import jieba\n",
|
||
"from collections import defaultdict\n",
|
||
"from collections import Counter\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from wordcloud import WordCloud\n",
|
||
"\"\"\"\n",
|
||
"这段代码实现了对《唐诗三百首》中某位作者的诗词进行分析,并生成了该作者诗词中出现频率大于1的双字词的词云。\n",
|
||
"代码中的data_pre函数将原始数据按照作者、标题、注解等信息进行分类整理,并返回一个字典,其中包含每个作者的诗词行号、每首诗的标题行号、每首诗的注解行号等信息。\n",
|
||
"doubleword函数用于遍历词列表并找到所有双字词,并将频次写入字典,再从字典中找出符合要求的词。\n",
|
||
"poem函数用于将每个作者的诗词行号转换为诗词内容,并将所有诗词拼接成一个字符串列表。\n",
|
||
"最后,使用了jieba分词库对诗词进行分词,并使用WordCloud库生成词云图。\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"def data_pre(data):\n",
|
||
" authorAll = {}\n",
|
||
" titleAll = {}\n",
|
||
" noteAll = {}\n",
|
||
" lineNum = 0 # 循环读取行号\n",
|
||
" for line in data:\n",
|
||
" if(line != \"\\n\"):\n",
|
||
" if('作者:' in line):\n",
|
||
" # 利用正则查找冒号后的字\n",
|
||
" author = re.search(r'作者:(.+)', line).group(1)\n",
|
||
" authorAll[lineNum] = author\n",
|
||
" elif '《' in line and '》' in line:\n",
|
||
" # 用正则查找书名号中的字\n",
|
||
" title = re.search(r'《(.+)》', line).group(1)\n",
|
||
" titleAll[lineNum] = title\n",
|
||
" elif '【注解】:' in line:\n",
|
||
" # 用正则找注解后的几行\n",
|
||
" note = re.findall(r'【注解】:\\n(.+?)\\n', line, re.DOTALL)\n",
|
||
" noteAll[lineNum] = note\n",
|
||
" lineNum += 1\n",
|
||
" print(type(authorAll.values()))\n",
|
||
" return authorAll, titleAll, noteAll\n",
|
||
"\n",
|
||
"def doubleword (words_lst):\n",
|
||
" \"\"\"\n",
|
||
" 遍历列表以找到所有双词,并将频次写入字典,再从字典中找出符合要求的词\n",
|
||
" \"\"\"\n",
|
||
" result_lst = []\n",
|
||
" doublewords = {}\n",
|
||
" for word in words_lst:\n",
|
||
" # 用len过滤掉非双词\n",
|
||
" if len(word) == 2:\n",
|
||
" if word in doublewords:\n",
|
||
" doublewords[word] += 1\n",
|
||
" else:\n",
|
||
" doublewords[word] = 1\n",
|
||
" # 遍历所有词与频次\n",
|
||
" for word, freq in doublewords.items():\n",
|
||
" if freq > 1:\n",
|
||
" # 用元组打包一对值\n",
|
||
" result_lst.append((word, freq))\n",
|
||
" return result_lst\n",
|
||
"\n",
|
||
"def poem (tanc_author, data):\n",
|
||
" poems_lst = []\n",
|
||
" for poems in tanc_author:\n",
|
||
" content = ''\n",
|
||
" # 取每一行拼成全部,也可使用切片操作\n",
|
||
" lines = poems[-1]\n",
|
||
" for line in lines:\n",
|
||
" content += data[line]\n",
|
||
" poems_lst.append(content.replace('\\n', ''))\n",
|
||
" # 加入列表并去除换行标记\n",
|
||
" return poems_lst\n",
|
||
"\n",
|
||
"data = open('唐诗三百首完整版.txt','rt',encoding = 'utf-8').readlines()\n",
|
||
"authorAll,titleAll,noteAll = data_pre(data) #所有作者的行号、所有题目的行号、所有注解的行号\n",
|
||
"\n",
|
||
"# author = input(\"输入作者名:\")\n",
|
||
"author='李白'\n",
|
||
"authorDict = Counter(authorAll.values())\n",
|
||
"authorNum= authorDict[author]\n",
|
||
"allNum =len(authorAll) #sum(authorDict.values())\n",
|
||
"print('《唐诗三百首》总共有',allNum,'首诗,其中', author ,'有',authorNum,'首诗,占比为',authorNum/allNum,'排名第',sorted(authorDict.values(),reverse=True).index(authorNum)+1) #李白数量\n",
|
||
"\n",
|
||
"#求tanc结构 ,即title、author、note、content\n",
|
||
"tanc = defaultdict(list)\n",
|
||
"for key, t,a,n in zip(authorAll.values(),titleAll.keys(),authorAll.keys(),noteAll.keys()): #每个作者的字典值包括标题、作者、注解、诗句所在的行数\n",
|
||
" tanc[key].append([t,a,n,list(range(a+1,n))])\n",
|
||
"print(tanc['张九龄']) #用于测试结构\n",
|
||
"\n",
|
||
"data_lst=poem(tanc[author], data) #返回该作者全部诗句列表\n",
|
||
"#print(data_lst) #用于测试获得改作者全部诗词\n",
|
||
"jb=jieba.lcut_for_search(''.join(data_lst)) #生成jieba分词\n",
|
||
"print( '双字词中,词频大于1次的包括:',doubleword(jb))\n",
|
||
"\n",
|
||
"w = WordCloud(font_path=\"C:\\\\Windows\\\\Fonts\\\\STFANGSO.ttf\")\n",
|
||
"my_wordcloud=w.generate(' '.join(jb)) #生成词云\n",
|
||
"plt.imshow(my_wordcloud)\n",
|
||
"plt.axis(\"off\")\n",
|
||
"plt.show()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"hide_input": false,
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.5"
|
||
},
|
||
"toc": {
|
||
"base_numbering": 1,
|
||
"nav_menu": {},
|
||
"number_sections": true,
|
||
"sideBar": true,
|
||
"skip_h1_title": false,
|
||
"title_cell": "Table of Contents",
|
||
"title_sidebar": "Contents",
|
||
"toc_cell": false,
|
||
"toc_position": {},
|
||
"toc_section_display": true,
|
||
"toc_window_display": false
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|