import time
import selenium.common.exceptions
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Shared Chrome browser instance used by every scraper below.
# NOTE(review): `executable_path` was deprecated in Selenium 4 and removed in
# 4.10+; confirm the installed selenium version still accepts it, otherwise
# switch to `service=Service(...)`.
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')

# Accumulator for all scraped rows: (site name, link title, link URL).
df = pd.DataFrame(columns=['站点', '标题', '链接'])


# Chongqing Municipal People's Government
def get_all_links_cq():
    """Scrape search-result links for '智慧社区' (smart community) from the
    Chongqing Municipal People's Government search portal.

    Walks up to 15 result pages by clicking the "next page" button, appending
    every anchor in the result container to the module-level ``df``.
    Stops early when the next-page button disappears or the page times out.
    """
    url = 'https://www.cq.gov.cn/cqgovsearch/search.html?searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&tenantId=7&configTenantId=&dataTypeId=7&sign=d46b7b1d-937f-4c52-e9af-0dbac183bf18&areaCode='
    driver.get(url)
    time.sleep(2)  # let the search results render before scraping
    times = 1
    while times <= 15:
        try:
            print(f"正在爬取重庆市人民政府相关文本 第{times}页")
            # Locate the pager button first so a missing button aborts the
            # loop before we scrape a (possibly stale) page.
            next_page_btn = driver.find_element_by_class_name('layui-laypage-next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='basic_result_content')
            links = lst.find_all('a')
            for link in links:
                df.loc[len(df)] = ['重庆市人民政府', link.text.strip(), link['href']]
            print(links)
            time.sleep(1)
            next_page_btn.click()
            time.sleep(1)  # give the next page time to load
            times += 1
        # BUG FIX: the original wrote ``except ExcA or ExcB:`` which evaluates
        # to just ExcA, so TimeoutException was never caught. Exceptions must
        # be grouped in a tuple.
        except (selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.TimeoutException):
            break  # no more pages (or the page timed out)


# National Development and Reform Commission
def get_all_links_ndrc():
    """Scrape search-result links for '智慧社区' from the NDRC search portal.

    Walks up to 7 result pages, recording every anchor in the result
    container (skipping ``javascript:`` pseudo-links and anchors without an
    ``href``) into the module-level ``df``.
    """
    url = 'https://so.ndrc.gov.cn/s?qt=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&siteCode=bm04000007&tab=all&toolsStatus=1'
    driver.get(url)
    time.sleep(2)  # let the search results render before scraping
    times = 1
    while times <= 7:
        # Consistency/robustness fix: the original had no exception handling
        # here, unlike its three sibling scrapers, so a missing "next" button
        # (empty find_elements list -> IndexError) or a vanished element
        # crashed the whole run. Break out cleanly instead.
        try:
            print(f"正在爬取中华人民共和国国家发展和改革委员会相关文本 第{times}页")
            # Several elements share the 'next' class; the last one is the pager.
            next_page_btn = driver.find_elements_by_class_name('next')[-1]
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='content fl')
            links = lst.find_all('a')
            for link in links:
                try:
                    if 'javascript' not in link['href']:
                        df.loc[len(df)] = ['中华人民共和国国家发展和改革委员会', link.text.strip(), link['href']]
                except KeyError:
                    pass  # anchor without an href attribute — skip it
            print(links)
            next_page_btn.click()
            time.sleep(1)  # give the next page time to load
            times += 1
        except (selenium.common.exceptions.NoSuchElementException, IndexError):
            break  # no more pages


# Ministry of Housing and Urban-Rural Development
def get_all_links_mohurd():
    """Scrape search-result links for '智慧社区' from the MOHURD search portal.

    Walks up to 15 result pages, recording every anchor in the result list
    (skipping ``javascript:`` pseudo-links and anchors without an ``href``)
    into the module-level ``df``. Stops early when the next-page button
    disappears.
    """
    url = 'https://www.mohurd.gov.cn/ess/?ty=a&query=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukl=&uka=&ukf=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukt=&sl=&ts=&te=&upg=1'
    driver.get(url)
    time.sleep(2)  # let the search results render before scraping
    times = 1
    while times <= 15:
        try:
            print(f"正在爬取中华人民共和国住房和城乡建设部相关文本 第{times}页")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='result-list')
            links = lst.find_all('a')
            for link in links:
                try:
                    if 'javascript' not in link['href']:
                        df.loc[len(df)] = ['中华人民共和国住房和城乡建设部', link.text.strip(), link['href']]
                except KeyError:
                    pass  # anchor without an href attribute — skip it
            print(links)
            next_page_btn.click()
            time.sleep(1)  # give the next page time to load
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break  # no more pages


# Central People's Government of the PRC
def get_all_links_gov():
    """Scrape search-result links for '智慧社区' from the central government
    search portal (www.gov.cn).

    Walks up to 3 result pages, recording every anchor in the result
    container (skipping ``javascript:`` pseudo-links, the literal placeholder
    href ``'url'``, and anchors without an ``href``) into the module-level
    ``df``. Stops early when the next-page button disappears.
    """
    url = 'https://sousuo.www.gov.cn/sousuo/search.shtml?code=17da70961a7&dataTypeId=107&searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA'
    driver.get(url)
    time.sleep(2)  # let the search results render before scraping
    times = 1
    while times <= 3:
        try:
            print(f"正在爬取中华人民共和国中央人民政府相关文本 第{times}页")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='left js_basic_result_left')
            links = lst.find_all('a')
            for link in links:
                try:
                    # 'url' appears to be a template placeholder href on this
                    # site — TODO confirm against the live page markup.
                    if 'javascript' not in link['href'] and link['href'] != 'url':
                        df.loc[len(df)] = ['中华人民共和国中央人民政府', link.text.strip(), link['href']]
                except KeyError:
                    pass  # anchor without an href attribute — skip it
            print(links)
            next_page_btn.click()
            time.sleep(1)  # give the next page time to load
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break  # no more pages


if __name__ == '__main__':
    # Robustness fix: quit the browser in a ``finally`` so the Chrome process
    # is not leaked when any scraper raises (the original only quit on the
    # fully successful path).
    try:
        get_all_links_cq()
        get_all_links_ndrc()
        get_all_links_mohurd()
        get_all_links_gov()
    finally:
        driver.quit()  # safe shutdown of the browser
    df.to_excel('智慧社区文本.xlsx', index=False)