# 58 lines, 1.9 KiB, Python
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# Module-level Chrome driver shared by every crawl_website() call; quit in __main__.
# NOTE(review): the executable_path keyword was deprecated in Selenium 4 and
# removed in 4.10+ (pass a Service(...) object instead) — confirm the pinned
# selenium version before upgrading.
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')
|
|
|
|
|
|
# Locator (BeautifulSoup.find kwargs) of the main-content element on each
# known government site.
_SITE_CONTENT_LOCATORS = {
    '重庆市人民政府': {'name': 'div', 'class_': 'view TRS_UEDITOR trs_paper_default trs_word'},
    '中华人民共和国国家发展和改革委员会': {'name': 'div', 'class_': 'TRS_Editor'},
    '中华人民共和国住房和城乡建设部': {'name': 'div', 'class_': 'editor-content'},
    '中华人民共和国中央人民政府': {'name': 'div', 'id': 'UCAP-CONTENT'},
}


def crawl_website(site: str, url: str) -> str:
    """Load *url* with the module-level Selenium driver and extract its body text.

    Parameters
    ----------
    site : display name of the government site, used to select the right
        content container (see ``_SITE_CONTENT_LOCATORS``).
    url : page URL to fetch.

    Returns
    -------
    str
        Whitespace-stripped text of the content element, or ``""`` when the
        URL is a ``.doc`` download, the site/element is unknown, or loading
        the page fails.
    """
    print(f"正在爬取 {site}: {url}")

    # .doc links are binary downloads — there is no HTML body to parse.
    if ".doc" in url:
        return ""

    locator = _SITE_CONTENT_LOCATORS.get(site)
    if locator is None:
        # Unknown site: the original implicitly returned None here; return
        # "" so the DataFrame column stays a homogeneous str column.
        return ""

    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    except WebDriverException:
        # Best-effort crawl: one failed page must not abort the whole run.
        # (The original `except TimeoutError: exit(1)` wrapped only the
        # BeautifulSoup branching, which never raises TimeoutError, and
        # would have killed the entire process had it ever fired.)
        return ""

    element = soup.find(**locator)
    return element.get_text(strip=True) if element else ""
if __name__ == '__main__':
    # Source spreadsheet: one row per page, with columns '站点' (site name)
    # and '链接' (URL).  Crawled text is written back into the same file.
    df = pd.read_excel('智慧社区文本.xlsx', engine='openpyxl')
    df['文本内容'] = ""

    try:
        for index, row in df.iterrows():
            # 调用crawl_website函数获取文本内容, 将文本内容填入'文本内容'列
            df.at[index, '文本内容'] = crawl_website(row['站点'], row['链接'])
            # Be polite to the servers: pause 2s between requests.
            time.sleep(2)

        # NOTE(review): every row whose crawl yielded "" shares the same
        # value and gets collapsed to a single row here — confirm that
        # dropping failed-crawl rows is intended.
        df = df.drop_duplicates(subset=['文本内容'])
        df.to_excel('智慧社区文本.xlsx', index=False)
    finally:
        # Always release the browser, even when a crawl raises — the
        # original only quit the driver on a fully successful run.
        driver.quit()