python-archieve-projects/政策文本爬虫/1.get_links.py

import time

import selenium.common.exceptions
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Create a Chrome browser instance (Selenium 3 style, pointing at a local chromedriver)
driver = webdriver.Chrome(executable_path=r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver')
# One row per search result: source site, link title, link URL
df = pd.DataFrame(columns=['站点', '标题', '链接'])
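
# The driver setup above and the find_element_by_* calls below use the
# Selenium 3 API. A minimal equivalent sketch under Selenium 4, which removed
# executable_path in favor of a Service object (assuming Selenium >= 4 is
# installed; the chromedriver path is a placeholder):
#
#     from selenium.webdriver.chrome.service import Service
#     from selenium.webdriver.common.by import By
#     driver = webdriver.Chrome(service=Service(r'/path/to/chromedriver'))
#     next_page_btn = driver.find_element(By.CLASS_NAME, 'layui-laypage-next')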
# Chongqing Municipal People's Government
def get_all_links_cq():
    url = 'https://www.cq.gov.cn/cqgovsearch/search.html?searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&tenantId=7&configTenantId=&dataTypeId=7&sign=d46b7b1d-937f-4c52-e9af-0dbac183bf18&areaCode='
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"Scraping Chongqing Municipal People's Government results, page {times}")
            next_page_btn = driver.find_element_by_class_name('layui-laypage-next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='basic_result_content')
            links = lst.find_all('a')
            for link in links:
                df.loc[len(df)] = ['重庆市人民政府', link.text.strip(), link['href']]
            print(links)
            time.sleep(1)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        # A tuple is required here: the original `except A or B` evaluated
        # `A or B` first and therefore never caught TimeoutException
        except (selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.TimeoutException):
            break
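
# The fixed time.sleep calls above work but are fragile on slow connections.
# A minimal alternative sketch using an explicit wait (WebDriverWait,
# expected_conditions, and By all ship with Selenium); wait_for_clickable is
# a hypothetical helper and is not wired into the scrapers in this script.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_clickable(class_name, timeout=10):
    # Blocks until the element is present and clickable, else raises TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, class_name)))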
# National Development and Reform Commission
def get_all_links_ndrc():
    url = 'https://so.ndrc.gov.cn/s?qt=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&siteCode=bm04000007&tab=all&toolsStatus=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 7:
        print(f"Scraping National Development and Reform Commission results, page {times}")
        # Several elements carry class 'next' on this page; the pager button is the last one
        next_page_btn = driver.find_elements_by_class_name('next')[-1]
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        lst = soup.find(class_='content fl')
        links = lst.find_all('a')
        for link in links:
            try:
                # Skip javascript: pseudo-links; KeyError covers <a> tags without an href
                if 'javascript' not in link['href']:
                    df.loc[len(df)] = ['中华人民共和国国家发展和改革委员会', link.text.strip(), link['href']]
            except KeyError:
                pass
        print(links)
        next_page_btn.click()
        time.sleep(1)
        times += 1
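
# Some result pages emit relative hrefs, which would be stored as-is above.
# A small normalization sketch using the standard library; resolve_href is a
# hypothetical helper, not called by the scrapers here. Usage would be, e.g.,
# resolve_href(driver.current_url, link['href']).
from urllib.parse import urljoin

def resolve_href(base_url, href):
    # Absolute URLs pass through unchanged; relative ones resolve against base_url
    return urljoin(base_url, href)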
# Ministry of Housing and Urban-Rural Development
def get_all_links_mohurd():
    url = 'https://www.mohurd.gov.cn/ess/?ty=a&query=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukl=&uka=&ukf=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA&ukt=&sl=&ts=&te=&upg=1'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 15:
        try:
            print(f"Scraping Ministry of Housing and Urban-Rural Development results, page {times}")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='result-list')
            links = lst.find_all('a')
            for link in links:
                try:
                    if 'javascript' not in link['href']:
                        df.loc[len(df)] = ['中华人民共和国住房和城乡建设部', link.text.strip(), link['href']]
                except KeyError:
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break
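
# The KeyError guard above exists because some <a> tags carry no href. A
# leaner equivalent sketch lets BeautifulSoup filter them up front via
# find_all('a', href=True); collect_links is a hypothetical helper, shown
# for illustration only:
def collect_links(container):
    # Yields (title, href) pairs only for anchors that actually have an href
    for link in container.find_all('a', href=True):
        if 'javascript' not in link['href']:
            yield link.text.strip(), link['href']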
# Central People's Government of the People's Republic of China
def get_all_links_gov():
    url = 'https://sousuo.www.gov.cn/sousuo/search.shtml?code=17da70961a7&dataTypeId=107&searchWord=%E6%99%BA%E6%85%A7%E7%A4%BE%E5%8C%BA'
    driver.get(url)
    time.sleep(2)
    times = 1
    while times <= 3:
        try:
            print(f"Scraping Central People's Government results, page {times}")
            next_page_btn = driver.find_element_by_class_name('next')
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            lst = soup.find(class_='left js_basic_result_left')
            links = lst.find_all('a')
            for link in links:
                try:
                    # Also skip the literal href value 'url' (presumably a template placeholder)
                    if 'javascript' not in link['href'] and link['href'] != 'url':
                        df.loc[len(df)] = ['中华人民共和国中央人民政府', link.text.strip(), link['href']]
                except KeyError:
                    pass
            print(links)
            next_page_btn.click()
            time.sleep(1)
            times += 1
        except selenium.common.exceptions.NoSuchElementException:
            break
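
# All four scrapers share one shape: load a search URL, then repeatedly parse
# a result container and click a "next" button until it disappears. A minimal
# consolidation sketch (scrape_paged is hypothetical and not called by the
# main block below; the real functions differ in small ways, e.g. the NDRC
# page needs the *last* element with class 'next'):
def scrape_paged(site_name, url, next_class, container_class, max_pages):
    driver.get(url)
    time.sleep(2)
    for page in range(1, max_pages + 1):
        try:
            next_page_btn = driver.find_element_by_class_name(next_class)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            container = soup.find(class_=container_class)
            for link in container.find_all('a', href=True):
                if 'javascript' not in link['href']:
                    df.loc[len(df)] = [site_name, link.text.strip(), link['href']]
            next_page_btn.click()
            time.sleep(1)
        except selenium.common.exceptions.NoSuchElementException:
            break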
if __name__ == '__main__':
    get_all_links_cq()
    get_all_links_ndrc()
    get_all_links_mohurd()
    get_all_links_gov()
    driver.quit()  # shut the browser down cleanly
    df.to_excel('智慧社区文本.xlsx', index=False)
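
# Optional: paginated search results can repeat entries across pages, so a
# dedup pass on the URL column before exporting might help, e.g.:
#     df.drop_duplicates(subset=['链接'], inplace=True)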