{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell #执行该代码可以使得当前nb支持多输出\n", "InteractiveShell.ast_node_interactivity = \"all\" \n", "import numpy as np \n", "import pandas as pd \n", "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**作业**" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "《唐诗三百首》总共有 313 首诗,其中 李白 有 29 首诗,占比为 0.0926517571884984 排名第 2\n", "[[1, 2, 7, [3, 4, 5, 6]], [17, 22, 28, [23, 24, 25, 26, 27]], [1692, 3054, 3059, [3055, 3056, 3057, 3058]]]\n", "双字词中,词频大于1次的包括: [('苍苍', 2), ('幽径', 2), ('美酒', 2), ('举杯', 2), ('明月', 4), ('行乐', 2), ('断肠', 3), ('春风', 4), ('长风', 3), ('万里', 5), ('不见', 5), ('高楼', 2), ('长安', 3), ('秋风', 2), ('门前', 2), ('低头', 2), ('不可', 4), ('猿声', 2), ('天上', 2), ('一生', 2), ('不能', 2), ('红颜', 2), ('黄鹤', 3), ('五岳', 2), ('名山', 2), ('庐山', 3), ('倒挂', 2), ('不到', 2), ('天长', 2), ('我心', 2), ('彩云', 2), ('天姥', 2), ('四万', 2), ('对此', 2), ('迷花', 2), ('日月', 2), ('纷纷', 2), ('如麻', 2), ('古来', 2), ('流水', 3), ('何时', 2), ('不得', 2), ('请君', 2), ('可以', 2), ('青天', 7), ('人生', 2), ('明朝', 2), ('蜀道', 3), ('之难', 3), ('难于', 3), ('上青', 3), ('茫然', 2), ('峨嵋', 2), ('下有', 2), ('长叹', 2), ('相思', 2), ('凤凰', 3), ('明镜', 2), ('十千', 2), ('黄河', 2), ('行路', 2), ('千金', 2), ('夫子', 2), ('故乡', 2), ('浮云', 2), ('挥手', 2), ('秋月', 2)]\n" ] }, { "ename": "OSError", "evalue": "cannot open resource", "output_type": "error", "traceback": [ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[0;31mOSError\u001B[0m Traceback (most recent call last)", "Input \u001B[0;32mIn [62]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 77\u001B[0m \u001B[38;5;28mprint\u001B[39m( \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m双字词中,词频大于1次的包括:\u001B[39m\u001B[38;5;124m'\u001B[39m,doubleword(jb))\n\u001B[1;32m 79\u001B[0m w \u001B[38;5;241m=\u001B[39m WordCloud(font_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mC:\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mWindows\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mFonts\u001B[39m\u001B[38;5;130;01m\\\\\u001B[39;00m\u001B[38;5;124mSTFANGSO.ttf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m---> 80\u001B[0m my_wordcloud\u001B[38;5;241m=\u001B[39m\u001B[43mw\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43m \u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mjoin\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjb\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m#生成词云\u001B[39;00m\n\u001B[1;32m 81\u001B[0m plt\u001B[38;5;241m.\u001B[39mimshow(my_wordcloud)\n\u001B[1;32m 82\u001B[0m plt\u001B[38;5;241m.\u001B[39maxis(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124moff\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", "File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:639\u001B[0m, in \u001B[0;36mWordCloud.generate\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 624\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgenerate\u001B[39m(\u001B[38;5;28mself\u001B[39m, text):\n\u001B[1;32m 625\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 626\u001B[0m \n\u001B[1;32m 627\u001B[0m \u001B[38;5;124;03m The input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 637\u001B[0m \u001B[38;5;124;03m self\u001B[39;00m\n\u001B[1;32m 638\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m--> 639\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_text\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtext\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:621\u001B[0m, in \u001B[0;36mWordCloud.generate_from_text\u001B[0;34m(self, text)\u001B[0m\n\u001B[1;32m 604\u001B[0m \u001B[38;5;124;03m\"\"\"Generate wordcloud from text.\u001B[39;00m\n\u001B[1;32m 605\u001B[0m \n\u001B[1;32m 606\u001B[0m \u001B[38;5;124;03mThe input \"text\" is expected to be a natural text. If you pass a sorted\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 618\u001B[0m \u001B[38;5;124;03mself\u001B[39;00m\n\u001B[1;32m 619\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 620\u001B[0m words \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mprocess_text(text)\n\u001B[0;32m--> 621\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[43mwords\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 622\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\n", "File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:453\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 451\u001B[0m font_size \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mheight\n\u001B[1;32m 452\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m--> 453\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgenerate_from_frequencies\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mdict\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mfrequencies\u001B[49m\u001B[43m[\u001B[49m\u001B[43m:\u001B[49m\u001B[38;5;241;43m2\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 454\u001B[0m \u001B[43m \u001B[49m\u001B[43mmax_font_size\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mheight\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 455\u001B[0m \u001B[38;5;66;03m# find font sizes\u001B[39;00m\n\u001B[1;32m 456\u001B[0m sizes \u001B[38;5;241m=\u001B[39m [x[\u001B[38;5;241m1\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlayout_]\n", "File \u001B[0;32m~/Library/Python/3.10/lib/python/site-packages/wordcloud/wordcloud.py:503\u001B[0m, in \u001B[0;36mWordCloud.generate_from_frequencies\u001B[0;34m(self, frequencies, max_font_size)\u001B[0m\n\u001B[1;32m 500\u001B[0m tried_other_orientation \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mFalse\u001B[39;00m\n\u001B[1;32m 501\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[1;32m 502\u001B[0m \u001B[38;5;66;03m# try to find a position\u001B[39;00m\n\u001B[0;32m--> 503\u001B[0m font \u001B[38;5;241m=\u001B[39m \u001B[43mImageFont\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtruetype\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfont_path\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfont_size\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 504\u001B[0m \u001B[38;5;66;03m# transpose font optionally\u001B[39;00m\n\u001B[1;32m 505\u001B[0m transposed_font \u001B[38;5;241m=\u001B[39m ImageFont\u001B[38;5;241m.\u001B[39mTransposedFont(\n\u001B[1;32m 506\u001B[0m font, orientation\u001B[38;5;241m=\u001B[39morientation)\n", "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:844\u001B[0m, in \u001B[0;36mtruetype\u001B[0;34m(font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m FreeTypeFont(font, size, index, encoding, layout_engine)\n\u001B[1;32m 843\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 844\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfreetype\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 845\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m:\n\u001B[1;32m 846\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m isPath(font):\n", "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:841\u001B[0m, in \u001B[0;36mtruetype..freetype\u001B[0;34m(font)\u001B[0m\n\u001B[1;32m 840\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfreetype\u001B[39m(font):\n\u001B[0;32m--> 841\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mFreeTypeFont\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/PIL/ImageFont.py:193\u001B[0m, in \u001B[0;36mFreeTypeFont.__init__\u001B[0;34m(self, font, size, index, encoding, layout_engine)\u001B[0m\n\u001B[1;32m 191\u001B[0m load_from_bytes(f)\n\u001B[1;32m 192\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[0;32m--> 193\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfont \u001B[38;5;241m=\u001B[39m \u001B[43mcore\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetfont\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 194\u001B[0m \u001B[43m \u001B[49m\u001B[43mfont\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msize\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mindex\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mlayout_engine\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlayout_engine\u001B[49m\n\u001B[1;32m 195\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 196\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 197\u001B[0m load_from_bytes(font)\n", "\u001B[0;31mOSError\u001B[0m: cannot open resource" ] } ], "source": [ "import re\n", "import jieba\n", "from collections import defaultdict\n", "from collections import Counter\n", "import matplotlib.pyplot as plt\n", "from wordcloud import WordCloud\n", "\"\"\"\n", "这段代码实现了对《唐诗三百首》中某位作者的诗词进行分析,并生成了该作者诗词中出现频率大于1的双字词的词云。\n", "代码中的data_pre函数将原始数据按照作者、标题、注解等信息进行分类整理,并返回一个字典,其中包含每个作者的诗词行号、每首诗的标题行号、每首诗的注解行号等信息。\n", "doubleword函数用于遍历词列表并找到所有双字词,并将频次写入字典,再从字典中找出符合要求的词。\n", "poem函数用于将每个作者的诗词行号转换为诗词内容,并将所有诗词拼接成一个字符串列表。\n", "最后,使用了jieba分词库对诗词进行分词,并使用WordCloud库生成词云图。\n", "\"\"\"\n", "\n", "def data_pre(data):\n", " authorAll = {}\n", " titleAll = {}\n", " noteAll = {}\n", " lineNum = 0 # 循环读取行号\n", " for line in data:\n", " if(line != \"\\n\"):\n", " if('作者:' in line):\n", " # 利用正则查找冒号后的字\n", " author = re.search(r'作者:(.+)', line).group(1)\n", " authorAll[lineNum] = author\n", " elif '《' in line and '》' in line:\n", " # 用正则查找书名号中的字\n", " title = re.search(r'《(.+)》', line).group(1)\n", " titleAll[lineNum] = title\n", " elif '【注解】:' in line:\n", " # 用正则找注解后的几行\n", " note = re.findall(r'【注解】:\\n(.+?)\\n', line, re.DOTALL)\n", " noteAll[lineNum] = note\n", " lineNum += 1\n", " print(type(authorAll.values()))\n", " return authorAll, titleAll, noteAll\n", "\n", "def doubleword (words_lst):\n", " \"\"\"\n", " 遍历列表以找到所有双词,并将频次写入字典,再从字典中找出符合要求的词\n", " \"\"\"\n", " result_lst = []\n", " doublewords = {}\n", " for word in words_lst:\n", " # 用len过滤掉非双词\n", " if len(word) == 2:\n", " if word in doublewords:\n", " doublewords[word] += 1\n", " else:\n", " doublewords[word] = 1\n", " # 遍历所有词与频次\n", " for word, freq in doublewords.items():\n", " if freq > 1:\n", " # 用元组打包一对值\n", " result_lst.append((word, freq))\n", " return result_lst\n", "\n", "def poem (tanc_author, data):\n", " poems_lst = []\n", " for poems in tanc_author:\n", " content = ''\n", " # 取每一行拼成全部,也可使用切片操作\n", " lines = poems[-1]\n", " for line in lines:\n", " content += data[line]\n", " poems_lst.append(content.replace('\\n', ''))\n", " # 加入列表并去除换行标记\n", " return poems_lst\n", "\n", "data = open('唐诗三百首完整版.txt','rt',encoding = 'utf-8').readlines()\n", "authorAll,titleAll,noteAll = data_pre(data) #所有作者的行号、所有题目的行号、所有注解的行号\n", "\n", "# author = input(\"输入作者名:\")\n", "author='李白'\n", "authorDict = Counter(authorAll.values())\n", "authorNum= authorDict[author]\n", "allNum =len(authorAll) #sum(authorDict.values())\n", "print('《唐诗三百首》总共有',allNum,'首诗,其中', author ,'有',authorNum,'首诗,占比为',authorNum/allNum,'排名第',sorted(authorDict.values(),reverse=True).index(authorNum)+1) #李白数量\n", "\n", "#求tanc结构 ,即title、author、note、content\n", "tanc = defaultdict(list)\n", "for key, t,a,n in zip(authorAll.values(),titleAll.keys(),authorAll.keys(),noteAll.keys()): #每个作者的字典值包括标题、作者、注解、诗句所在的行数\n", " tanc[key].append([t,a,n,list(range(a+1,n))])\n", "print(tanc['张九龄']) #用于测试结构\n", "\n", "data_lst=poem(tanc[author], data) #返回该作者全部诗句列表\n", "#print(data_lst) #用于测试获得改作者全部诗词\n", "jb=jieba.lcut_for_search(''.join(data_lst)) #生成jieba分词\n", "print( '双字词中,词频大于1次的包括:',doubleword(jb))\n", "\n", "w = WordCloud(font_path=\"C:\\\\Windows\\\\Fonts\\\\STFANGSO.ttf\")\n", "my_wordcloud=w.generate(' '.join(jb)) #生成词云\n", "plt.imshow(my_wordcloud)\n", "plt.axis(\"off\")\n", "plt.show()" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }