python-archieve-projects/政策文本爬虫/2.crawler.py
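# Scrape the full text of smart-community policy documents.
#
# Reads 智慧社区文本.xlsx (columns: 站点, 链接), loads each 链接 in a
# Selenium-driven Chrome, extracts the article body with a per-site
# BeautifulSoup selector, writes the result back into the 文本内容 column,
# and drops rows whose extracted text duplicates an earlier row.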

import sys
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

# Path to a locally installed chromedriver binary
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')
def crawl_website(site: str, url: str) -> str:
    """Load a policy page in Chrome and return its article text, or "" if none is found."""
    print(f"Crawling {site}: {url}")
    # .doc links are direct downloads with no HTML body to parse
    if ".doc" in url:
        return ""
    try:
        driver.get(url)
    except TimeoutException:
        driver.quit()
        sys.exit(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Each government portal wraps the article body in a different container
    if site == '重庆市人民政府':
        div = soup.find('div', class_='view TRS_UEDITOR trs_paper_default trs_word')
    elif site == '中华人民共和国国家发展和改革委员会':
        div = soup.find('div', class_='TRS_Editor')
    elif site == '中华人民共和国住房和城乡建设部':
        div = soup.find('div', class_='editor-content')
    elif site == '中华人民共和国中央人民政府':
        div = soup.find('div', id='UCAP-CONTENT')
    else:
        div = None
    return div.get_text(strip=True) if div else ""
if __name__ == '__main__':
    df = pd.read_excel('智慧社区文本.xlsx', engine='openpyxl')
    df['文本内容'] = ""
    for index, row in df.iterrows():
        # Fetch the article text for this site/link pair
        text_content = crawl_website(row['站点'], row['链接'])
        # Write the extracted text into the '文本内容' column
        df.at[index, '文本内容'] = text_content
        # Pause 2 seconds between requests to avoid overloading the servers
        time.sleep(2)
    # Drop rows whose extracted text duplicates an earlier row
    df = df.drop_duplicates(subset=['文本内容'])
    df.to_excel('智慧社区文本.xlsx', index=False)
    driver.quit()