python-archieve-projects/政策文本爬虫/1.get_links.py

import time

import selenium.common.exceptions
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Create a Chrome browser instance (Selenium 3 style, pointing at a local chromedriver)
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')
# One row per search result: source site, link title, link URL
df = pd.DataFrame(columns=['站点', '标题', '链接'])
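
# The driver setup above and the find_element_by_* calls below use the
# Selenium 3 API. A minimal equivalent sketch under Selenium 4, which removed
# executable_path in favor of a Service object (assuming Selenium >= 4 is
# installed; the chromedriver path is a placeholder):
#
#     from selenium.webdriver.chrome.service import Service
#     from selenium.webdriver.common.by import By
#     driver = webdriver.Chrome(service=Service(r'/path/to/chromedriver'))
#     next_page_btn = driver.find_element(By.CLASS_NAME, 'layui-laypage-next')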
# Chongqing Municipal People's Government
def get_all_links_cq():
    url = 'https://www.cq.gov.cn/cqgovsearch/search.html?searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&tenantId=7&configTenantId=&dataTypeId=7&sign=d46b7b1d-937f-4c52-e9af-0dbac183bf18&areaCode='
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"Scraping Chongqing Municipal People's Government results, page {times}")
            next_page_btn = driver.find_element_by_class_name('layui-laypage-next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='basic_result_content')
            links = lst.find_all('a')
            for link in links:
                df.loc[len(df)] = ['重庆市人民政府', link.text.strip(), link['href']]
            print(links)
            time.sleep(1)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        # A tuple is required here: the original `except A or B` evaluated
        # `A or B` first and therefore never caught TimeoutException
        except (selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.TimeoutException):
            break
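
# The fixed time.sleep calls above work but are fragile on slow connections.
# A minimal alternative sketch using an explicit wait (WebDriverWait,
# expected_conditions, and By all ship with Selenium); wait_for_clickable is
# a hypothetical helper and is not wired into the scrapers in this script.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_clickable(class_name, timeout=10):
    # Blocks until the element is present and clickable, else raises TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, class_name)))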
# National Development and Reform Commission
def get_all_links_ndrc():
    url = 'https://so.ndrc.gov.cn/s?qt=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&siteCode=bm04000007&tab=all&toolsStatus=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 7:
        print(f"Scraping National Development and Reform Commission results, page {times}")
        # Several elements carry class 'next' on this page; the pager button is the last one
        next_page_btn = driver.find_elements_by_class_name('next')[-1]
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        lst = soup.find(class_='content fl')
        links = lst.find_all('a')
        for link in links:
            try:
                # Skip javascript: pseudo-links; KeyError covers <a> tags without an href
                if 'javascript' not in link['href']:
                    df.loc[len(df)] = ['中华人民共和国国家发展和改革委员会', link.text.strip(), link['href']]
            except KeyError:
                pass
        print(links)
        next_page_btn.click()
        time.sleep(1)
        times += 1
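
# Some result pages emit relative hrefs, which would be stored as-is above.
# A small normalization sketch using the standard library; resolve_href is a
# hypothetical helper, not called by the scrapers here. Usage would be, e.g.,
# resolve_href(driver.current_url, link['href']).
from urllib.parse import urljoin

def resolve_href(base_url, href):
    # Absolute URLs pass through unchanged; relative ones resolve against base_url
    return urljoin(base_url, href)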
# Ministry of Housing and Urban-Rural Development
def get_all_links_mohurd():
    url = 'https://www.mohurd.gov.cn/ess/?ty=a&query=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukl=&uka=&ukf=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukt=&sl=&ts=&te=&upg=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"Scraping Ministry of Housing and Urban-Rural Development results, page {times}")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='result-list')
            links = lst.find_all('a')
            for link in links:
                try:
                    if 'javascript' not in link['href']:
                        df.loc[len(df)] = ['中华人民共和国住房和城乡建设部', link.text.strip(), link['href']]
                except KeyError:
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break
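
# The KeyError guard above exists because some <a> tags carry no href. A
# leaner equivalent sketch lets BeautifulSoup filter them up front via
# find_all('a', href=True); collect_links is a hypothetical helper, shown
# for illustration only:
def collect_links(container):
    # Yields (title, href) pairs only for anchors that actually have an href
    for link in container.find_all('a', href=True):
        if 'javascript' not in link['href']:
            yield link.text.strip(), link['href']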
# Central People's Government of the People's Republic of China
def get_all_links_gov():
    url = 'https://sousuo.www.gov.cn/sousuo/search.shtml?code=17da70961a7&dataTypeId=107&searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 3:
        try:
            print(f"Scraping Central People's Government results, page {times}")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='left js_basic_result_left')
            links = lst.find_all('a')
            for link in links:
                try:
                    # Also skip the literal href value 'url' (presumably a template placeholder)
                    if 'javascript' not in link['href'] and link['href'] != 'url':
                        df.loc[len(df)] = ['中华人民共和国中央人民政府', link.text.strip(), link['href']]
                except KeyError:
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break
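
# All four scrapers share one shape: load a search URL, then repeatedly parse
# a result container and click a "next" button until it disappears. A minimal
# consolidation sketch (scrape_paged is hypothetical and not called by the
# main block below; the real functions differ in small ways, e.g. the NDRC
# page needs the *last* element with class 'next'):
def scrape_paged(site_name, url, next_class, container_class, max_pages):
    driver.get(url)
    time.sleep(2)
    for page in range(1, max_pages + 1):
        try:
            next_page_btn = driver.find_element_by_class_name(next_class)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            container = soup.find(class_=container_class)
            for link in container.find_all('a', href=True):
                if 'javascript' not in link['href']:
                    df.loc[len(df)] = [site_name, link.text.strip(), link['href']]
            next_page_btn.click()
            time.sleep(1)
        except selenium.common.exceptions.NoSuchElementException:
            break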
if __name__ == '__main__':
    get_all_links_cq()
    get_all_links_ndrc()
    get_all_links_mohurd()
    get_all_links_gov()
    driver.quit()  # shut the browser down cleanly
    df.to_excel('智慧社区文本.xlsx', index=False)
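
# Optional: paginated search results can repeat entries across pages, so a
# dedup pass on the URL column before exporting might help, e.g.:
#     df.drop_duplicates(subset=['链接'], inplace=True)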