python-archieve-projects/M301/第七章/第七章作业.ipynb

194 lines
17 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.interactiveshell import InteractiveShell #执行该代码可以使得当前nb支持多输出\n",
"InteractiveShell.ast_node_interactivity = \"all\" \n",
"import numpy as np \n",
"import pandas as pd \n",
"import re"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**作业**"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'dict_values'>\n",
"《唐诗三百首》总共有 313 首诗,其中 李白 有 29 首诗,占比为 0.0926517571884984 排名第 2\n",
"[[1, 2, 7, [3, 4, 5, 6]], [17, 22, 28, [23, 24, 25, 26, 27]], [1692, 3054, 3059, [3055, 3056, 3057, 3058]]]\n",
"双字词中词频大于1次的包括 [('苍苍', 2), ('幽径', 2), ('美酒', 2), ('举杯', 2), ('明月', 4), ('行乐', 2), ('断肠', 3), ('春风', 4), ('长风', 3), ('万里', 5), ('不见', 5), ('高楼', 2), ('长安', 3), ('秋风', 2), ('门前', 2), ('低头', 2), ('不可', 4), ('猿声', 2), ('天上', 2), ('一生', 2), ('不能', 2), ('红颜', 2), ('黄鹤', 3), ('五岳', 2), ('名山', 2), ('庐山', 3), ('倒挂', 2), ('不到', 2), ('天长', 2), ('我心', 2), ('彩云', 2), ('天姥', 2), ('四万', 2), ('对此', 2), ('迷花', 2), ('日月', 2), ('纷纷', 2), ('如麻', 2), ('古来', 2), ('流水', 3), ('何时', 2), ('不得', 2), ('请君', 2), ('可以', 2), ('青天', 7), ('人生', 2), ('明朝', 2), ('蜀道', 3), ('之难', 3), ('难于', 3), ('上青', 3), ('茫然', 2), ('峨嵋', 2), ('下有', 2), ('长叹', 2), ('相思', 2), ('凤凰', 3), ('明镜', 2), ('十千', 2), ('黄河', 2), ('行路', 2), ('千金', 2), ('夫子', 2), ('故乡', 2), ('浮云', 2), ('挥手', 2), ('秋月', 2)]\n"
]
},
{
"ename": "OSError",
"evalue": "cannot open resource",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mOSError\u001B[0m Traceback (most recent call last)",
"Input \u001B[0;32mIn [62]\u001B[0m, in \u001B[0;36m<cell line: 80>\u001B[0;34m()\u001B[0m\n\u001B[1;32m 77\u001B[0m \u001B[38;5;28mprint\u001B[39m( \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m双字词中词频大于1次的包括\u001B[39m\u001B[38;5;124m'\u001B[39m,doubleword(jb))\n\u001B[1;32m 79\u001B[0m w \u001B[38;5;241m=\u001B[39m WordCloud(font_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mC:\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mWindows\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mFonts\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mSTFANGSO.ttf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m---> 80\u001B[0m my_wordcloud\u001B[38;5;241m=\u001B[39m\u001B[43mw\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m \u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mjoin\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjb\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m#生成词云\u001B[39;00m\n\u001B[1;32m 81\u001B[0m plt\u001B[38;5;241m.\u001B[39mimshow(my_wordcloud)\n\u001B[1;32m 82\u001B[0m plt\u001B[38;5;241m.\u001B[39maxis(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124moff\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:639\u001B[0m, in \u001B[0;36mWordCloud.generate\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 624\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgenerate\u001B[39m(\u001B[38;5;28mself\u001B[39m, text):\n\u001B[1;32m 625\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 626\u001B[0m \n\u001B[1;32m 627\u001B[0m \u001B[38;5;124;03m The input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 637\u001B[0m \u001B[38;5;124;03m self\u001B[39;00m\n\u001B[1;32m 638\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m--> 639\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_text\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:621\u001B[0m, in \u001B[0;36mWordCloud.generate_from_text\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 604\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 605\u001B[0m \n\u001B[1;32m 606\u001B[0m \u001B[38;5;124;03mThe input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 618\u001B[0m \u001B[38;5;124;03mself\u001B[39;00m\n\u001B[1;32m 619\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 620\u001B[0m words \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mprocess_text(text)\n\u001B[0;32m--> 621\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 622\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\n",
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:453\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 451\u001B[0m font_size \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mheight\n\u001B[1;32m 452\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m--> 453\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mdict\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mfrequencies\u001B[49m\u001B[43m[\u001B[49m\u001B[43m:\u001B[49m\u001B[38;5;241;43m2\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 454\u001B[0m \u001B[43m \u001B[49m\u001B[43mmax_font_size\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mheight\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 455\u001B[0m \u001B[38;5;66;03m# find font sizes\u001B[39;00m\n\u001B[1;32m 456\u001B[0m sizes \u001B[38;5;241m=\u001B[39m [x[\u001B[38;5;241m1\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlayout_]\n",
"File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:503\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 500\u001B[0m tried_other_orientation \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m\n\u001B[1;32m 501\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[1;32m 502\u001B[0m \u001B[38;5;66;03m# try to find a position\u001B[39;00m\n\u001B[0;32m--> 503\u001B[0m font \u001B[38;5;241m=\u001B[39m \u001B[43mImageFont\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtruetype\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfont_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfont_size\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 504\u001B[0m \u001B[38;5;66;03m# transpose font optionally\u001B[39;00m\n\u001B[1;32m 505\u001B[0m transposed_font \u001B[38;5;241m=\u001B[39m ImageFont\u001B[38;5;241m.\u001B[39mTransposedFont(\n\u001B[1;32m 506\u001B[0m font, orientation\u001B[38;5;241m=\u001B[39morientation)\n",
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:844\u001B[0m, in \u001B[0;36mtruetype\u001B[0;34m(font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m FreeTypeFont(font, size, index, encoding, layout_engine)\n\u001B[1;32m 843\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 844\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfreetype\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 845\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m:\n\u001B[1;32m 846\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m isPath(font):\n",
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:841\u001B[0m, in \u001B[0;36mtruetype.<locals>.freetype\u001B[0;34m(font)\u001B[0m\n\u001B[1;32m 840\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfreetype\u001B[39m(font):\n\u001B[0;32m--> 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mFreeTypeFont\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:193\u001B[0m, in \u001B[0;36mFreeTypeFont.__init__\u001B[0;34m(self, font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 191\u001B[0m load_from_bytes(f)\n\u001B[1;32m 192\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[0;32m--> 193\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfont \u001B[38;5;241m=\u001B[39m \u001B[43mcore\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetfont\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlayout_engine\u001B[49m\n\u001B[1;32m 195\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 196\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 197\u001B[0m load_from_bytes(font)\n",
"\u001B[0;31mOSError\u001B[0m: cannot open resource"
]
}
],
"source": [
"import os\n",
"import re\n",
"from bisect import bisect_left, bisect_right\n",
"from collections import defaultdict\n",
"from collections import Counter\n",
"\n",
"import jieba\n",
"import matplotlib.pyplot as plt\n",
"from wordcloud import WordCloud\n",
"\"\"\"\n",
"这段代码对《唐诗三百首》中某位作者的诗词进行分析:统计词频大于1的双字词,并用该作者全部诗句的分词结果生成词云。\n",
"data_pre函数逐行扫描原始数据,返回三个以行号为键的字典,分别记录作者行、标题行和注解行。\n",
"doubleword函数遍历词列表,统计所有双字词的频次,并返回频次大于1的(词, 频次)列表。\n",
"poem函数根据某位作者每首诗的正文行号取出对应诗句,每首诗拼接成一个字符串,返回字符串列表。\n",
"最后使用jieba分词库对诗词进行分词,并使用WordCloud库生成词云图。\n",
"\"\"\"\n",
"\n",
"def data_pre(data):\n",
"    \"\"\"Index the author, title and note lines of the corpus by line number.\n",
"\n",
"    Args:\n",
"        data: Sequence of text lines (e.g. the result of ``readlines()``).\n",
"\n",
"    Returns:\n",
"        A tuple ``(authorAll, titleAll, noteAll)`` of dicts keyed by the\n",
"        0-based position of the line in ``data``. ``authorAll`` maps to the\n",
"        text after '作者:', ``titleAll`` to the text between '《' and '》',\n",
"        and ``noteAll`` to the ``re.findall`` result for the note pattern.\n",
"    \"\"\"\n",
"    authorAll = {}\n",
"    titleAll = {}\n",
"    noteAll = {}\n",
"    # enumerate counts every line, blank ones included, so each key can be\n",
"    # used directly as an index into `data`.\n",
"    for lineNum, line in enumerate(data):\n",
"        if line == \"\\n\":\n",
"            continue\n",
"        if '作者:' in line:\n",
"            # The author name is whatever follows the marker on the same line.\n",
"            found = re.search(r'作者:(.+)', line)\n",
"            if found:  # a marker with nothing after it carries no name\n",
"                authorAll[lineNum] = found.group(1)\n",
"        elif '《' in line and '》' in line:\n",
"            # The title is the text enclosed in the book-title marks.\n",
"            found = re.search(r'《(.+)》', line)\n",
"            if found:  # marks that are empty or in the wrong order are skipped\n",
"                titleAll[lineNum] = found.group(1)\n",
"        elif '【注解】:' in line:\n",
"            # The pattern wants text on the line *after* the marker, so for a\n",
"            # single line it returns []; the key (the marker's line number)\n",
"            # is still recorded.\n",
"            noteAll[lineNum] = re.findall(r'【注解】:\\n(.+?)\\n', line, re.DOTALL)\n",
"    return authorAll, titleAll, noteAll\n",
"\n",
"def doubleword (words_lst):\n",
"    \"\"\"Return the two-character words that occur more than once.\n",
"\n",
"    The result is a list of ``(word, count)`` tuples, ordered by each word's\n",
"    first appearance in ``words_lst``.\n",
"    \"\"\"\n",
"    # Counter preserves first-seen order, just like a hand-built dict.\n",
"    pair_counts = Counter(token for token in words_lst if len(token) == 2)\n",
"    return [(token, count) for token, count in pair_counts.items() if count > 1]\n",
"\n",
"def poem (tanc_author, data):\n",
"    \"\"\"Build the text of every poem listed in ``tanc_author``.\n",
"\n",
"    The last element of each entry is the list of line numbers holding the\n",
"    poem body. Those lines of ``data`` are concatenated and the newline\n",
"    characters removed. Returns one string per entry, in order.\n",
"    \"\"\"\n",
"    return [\n",
"        ''.join(data[idx] for idx in entry[-1]).replace('\\n', '')\n",
"        for entry in tanc_author\n",
"    ]\n",
"\n",
"# Load the corpus once; positions in `data` are the line numbers used below.\n",
"# The context manager closes the file handle as soon as the lines are read.\n",
"with open('唐诗三百首完整版.txt', 'rt', encoding='utf-8') as corpus_file:\n",
"    data = corpus_file.readlines()\n",
"authorAll, titleAll, noteAll = data_pre(data)  # line number -> author / title / note\n",
"\n",
"# Poet to analyse. Replace with `input(...)` to choose one interactively.\n",
"author = '李白'\n",
"authorDict = Counter(authorAll.values())  # poems per author\n",
"authorNum = authorDict[author]\n",
"allNum = len(authorAll)  # one author line per poem\n",
"if authorNum == 0:\n",
"    # Counter returns 0 for unknown keys; fail with a clear message instead of\n",
"    # the opaque ValueError that list.index would raise further down.\n",
"    raise ValueError('《唐诗三百首》中没有找到作者:' + author)\n",
"# Rank = 1-based position of this count among all counts, largest first\n",
"# (authors with equal counts share the best rank).\n",
"rank = sorted(authorDict.values(), reverse=True).index(authorNum) + 1\n",
"print('《唐诗三百首》总共有',allNum,'首诗,其中', author ,'有',authorNum,'首诗,占比为',authorNum/allNum,'排名第',rank)\n",
"\n",
"# Build the tanc structure: author name -> list of\n",
"# [title line, author line, note line, body line numbers].\n",
"# Each author line is matched with the nearest title line before it and the\n",
"# first note line after it. Pairing the three line lists positionally drifts\n",
"# as soon as some other line contains '《...》' and is counted as a title.\n",
"title_lines = sorted(titleAll)\n",
"note_lines = sorted(noteAll)\n",
"author_lines = sorted(authorAll)\n",
"tanc = defaultdict(list)\n",
"for pos, a in enumerate(author_lines):\n",
"    # The body of a poem cannot extend past the next author line.\n",
"    next_author = author_lines[pos + 1] if pos + 1 < len(author_lines) else len(data)\n",
"    note_idx = bisect_right(note_lines, a)  # first note line after the author line\n",
"    if note_idx == len(note_lines) or note_lines[note_idx] >= next_author:\n",
"        continue  # no note marker closes this poem, so its body cannot be delimited\n",
"    n = note_lines[note_idx]\n",
"    title_idx = bisect_left(title_lines, a) - 1  # last title line before the author line\n",
"    t = title_lines[title_idx] if title_idx >= 0 else None\n",
"    tanc[authorAll[a]].append([t, a, n, list(range(a + 1, n))])\n",
"print(tanc['张九龄'])  # sanity check of the structure\n",
"\n",
"data_lst = poem(tanc[author], data)  # one string per poem by this author\n",
"# Search-mode segmentation also emits the shorter words inside long ones.\n",
"jb = jieba.lcut_for_search(''.join(data_lst))\n",
"print( '双字词中词频大于1次的包括',doubleword(jb))\n",
"\n",
"# A CJK-capable font is required to draw Chinese text. Probe common locations\n",
"# on Windows, macOS and Linux rather than assuming one Windows-only path,\n",
"# which raises OSError ('cannot open resource') on any other system.\n",
"FONT_CANDIDATES = [\n",
"    r'C:\\Windows\\Fonts\\STFANGSO.ttf',\n",
"    r'C:\\Windows\\Fonts\\msyh.ttc',\n",
"    '/System/Library/Fonts/PingFang.ttc',\n",
"    '/System/Library/Fonts/STHeiti Medium.ttc',\n",
"    '/System/Library/Fonts/Supplemental/Songti.ttc',\n",
"    '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',\n",
"    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',\n",
"]\n",
"font_path = next((p for p in FONT_CANDIDATES if os.path.isfile(p)), None)\n",
"if font_path is None:\n",
"    # WordCloud falls back to its bundled font, which has no Chinese glyphs.\n",
"    print('未找到中文字体,词云中的汉字可能无法正常显示')\n",
"\n",
"w = WordCloud(font_path=font_path)\n",
"my_wordcloud = w.generate(' '.join(jb))  # build the word cloud from all tokens\n",
"plt.imshow(my_wordcloud)\n",
"plt.axis(\"off\")\n",
"plt.show()"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}