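"""Collect search-result links for the keyword 智慧社区 (smart community) from four
Chinese government websites (Chongqing Municipal People's Government, the NDRC,
MOHURD, and www.gov.cn) using Selenium, and save the site name, title, and URL of
each result to an Excel file."""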
import time

import selenium.common.exceptions
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Create a Chrome browser instance (uses the Selenium 3-style executable_path argument)
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')

# Result table: one row per scraped link, with columns 站点 (site), 标题 (title), 链接 (link)
df = pd.DataFrame(columns=['站点', '标题', '链接'])

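# Each get_all_links_* function below follows the same pattern: open the site's
# search page for the keyword 智慧社区, then on each results page parse the HTML
# with BeautifulSoup, append every result link to df, and click the "next page"
# button until the page limit is reached or the button can no longer be found.
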
# Chongqing Municipal People's Government (重庆市人民政府)
def get_all_links_cq():
    url = 'https://www.cq.gov.cn/cqgovsearch/search.html?searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&tenantId=7&configTenantId=&dataTypeId=7&sign=d46b7b1d-937f-4c52-e9af-0dbac183bf18&areaCode='
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"正在爬取重庆市人民政府相关文本 第{times}页")  # scraping page {times}
            next_page_btn = driver.find_element_by_class_name('layui-laypage-next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='basic_result_content')
            links = lst.find_all('a')
            for link in links:
                df.loc[len(df)] = ['重庆市人民政府', link.text.strip(), link['href']]
            print(links)
            time.sleep(1)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        # a tuple is needed to catch both exceptions; `except A or B` only ever catches A
        except (selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.TimeoutException):
            break


# National Development and Reform Commission (国家发展和改革委员会)
def get_all_links_ndrc():
    url = 'https://so.ndrc.gov.cn/s?qt=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&siteCode=bm04000007&tab=all&toolsStatus=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 7:
        print(f"正在爬取中华人民共和国国家发展和改革委员会相关文本 第{times}页")  # scraping page {times}
        next_page_btn = driver.find_elements_by_class_name('next')[-1]
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        lst = soup.find(class_='content fl')
        links = lst.find_all('a')
        for link in links:
            try:
                if 'javascript' not in link['href']:
                    df.loc[len(df)] = ['中华人民共和国国家发展和改革委员会', link.text.strip(), link['href']]
            except KeyError:
                # some <a> tags have no href attribute; skip them
                pass
        print(links)
        next_page_btn.click()
        time.sleep(1)
        times += 1


# Ministry of Housing and Urban-Rural Development (住房和城乡建设部)
def get_all_links_mohurd():
    url = 'https://www.mohurd.gov.cn/ess/?ty=a&query=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukl=&uka=&ukf=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukt=&sl=&ts=&te=&upg=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"正在爬取中华人民共和国住房和城乡建设部相关文本 第{times}页")  # scraping page {times}
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='result-list')
            links = lst.find_all('a')
            for link in links:
                try:
                    if 'javascript' not in link['href']:
                        df.loc[len(df)] = ['中华人民共和国住房和城乡建设部', link.text.strip(), link['href']]
                except KeyError:
                    # some <a> tags have no href attribute; skip them
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break


# Central People's Government of the PRC (中华人民共和国中央人民政府)
def get_all_links_gov():
    url = 'https://sousuo.www.gov.cn/sousuo/search.shtml?code=17da70961a7&dataTypeId=107&searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 3:
        try:
            print(f"正在爬取中华人民共和国中央人民政府相关文本 第{times}页")  # scraping page {times}
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='left js_basic_result_left')
            links = lst.find_all('a')
            for link in links:
                try:
                    # skip javascript links and placeholder 'url' hrefs
                    if 'javascript' not in link['href'] and link['href'] != 'url':
                        df.loc[len(df)] = ['中华人民共和国中央人民政府', link.text.strip(), link['href']]
                except KeyError:
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break


if __name__ == '__main__':
    get_all_links_cq()
    get_all_links_ndrc()
    get_all_links_mohurd()
    get_all_links_gov()
    driver.quit()  # shut the browser down cleanly
    df.to_excel('智慧社区文本.xlsx', index=False)  # write all collected links to an Excel file