import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path via a Service object
# (the old executable_path keyword argument was removed).
driver = webdriver.Chrome(service=Service(r'/Users/fallcity/Vvvvv/Coding/tools/chromedriver'))

# For each site, the <div> that holds the article body on its pages.
CONTENT_SELECTORS = {
    '重庆市人民政府': {'class_': 'view TRS_UEDITOR trs_paper_default trs_word'},
    '中华人民共和国国家发展和改革委员会': {'class_': 'TRS_Editor'},
    '中华人民共和国住房和城乡建设部': {'class_': 'editor-content'},
    '中华人民共和国中央人民政府': {'id': 'UCAP-CONTENT'},
}


def crawl_website(site: str, url: str) -> str:
    """Load a page with Selenium and extract the article text for a known site."""
    print(f"Crawling {site}: {url}")
    # Skip links that point at Word documents instead of HTML pages.
    if '.doc' in url:
        return ""
    try:
        driver.get(url)
    except WebDriverException:
        # Skip pages that time out or fail to load rather than aborting the run.
        return ""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    selector = CONTENT_SELECTORS.get(site)
    if selector:
        div = soup.find('div', **selector)
        if div:
            return div.get_text(strip=True)
    # Unknown site or missing content container: return an empty string so the
    # '文本内容' column stays a plain string column.
    return ""


if __name__ == '__main__':
    df = pd.read_excel('智慧社区文本.xlsx', engine='openpyxl')
    df['文本内容'] = ""
    for index, row in df.iterrows():
        # Call crawl_website to fetch the text for this row's site and link.
        df.at[index, '文本内容'] = crawl_website(row['站点'], row['链接'])
        # Sleep 2 seconds between requests to be polite to the servers.
        time.sleep(2)
    # Drop rows whose extracted text duplicates an earlier row's.
    df = df.drop_duplicates(subset=['文本内容'])
    df.to_excel('智慧社区文本.xlsx', index=False)
    driver.quit()