import time

import requests
from bs4 import BeautifulSoup

# Base URL of the chapter pages; each chapter is served at <server><index>.html.
server = 'http://www.gudianmingzhu.com/guji/hongloumeng/'


def get_text(url, page, output_path='红楼梦.txt', timeout=30):
    """Download one chapter page and append its paragraphs to a text file.

    The page is fetched, decoded as UTF-8 and parsed with ``html.parser``.
    A chapter heading built from ``page`` is written first, followed by the
    text of every ``<p>`` element on the page, one paragraph per line.

    Args:
        url: Address of the chapter page to fetch.
        page: Chapter number used in the heading written before the text.
        output_path: File the chapter is appended to (opened in append mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Nothing is written to
            the output file in that case.
    """
    r = requests.get(url, timeout=timeout)
    # Fail before touching the output file so an error page is never
    # appended as if it were chapter text.
    r.raise_for_status()
    r.encoding = 'utf-8'
    bs = BeautifulSoup(r.text, 'html.parser')

    # Every <p> on the page is taken; no narrower content container is
    # selected.
    paragraphs = bs.find_all('p')

    # Explicit encoding: the platform default may be unable to encode
    # Chinese text.
    with open(output_path, 'at', encoding='utf-8') as f:
        f.write("第" + str(page) + "回")
        for para in paragraphs:
            # NOTE(review): the original stripped markup with two
            # str.replace() calls whose literals were garbled in the source
            # as received (presumably '<p>' and '</p>' - confirm).
            # get_text() removes the tags regardless of attributes/nesting.
            f.write(para.get_text())
            f.write('\n')


def main():
    """Crawl chapter pages 11368-11487 in order, appending each to the file.

    Chapter numbers start at 1 and follow the URL index. A failed page is
    reported and skipped so the remaining chapters keep their numbering.
    """
    # The numeric index in the URL increases by one per chapter.
    for page, index in enumerate(range(11368, 11488), start=1):
        url = server + str(index) + '.html'
        print('正在爬取' + url)
        try:
            get_text(url, page)
        except requests.RequestException as exc:
            print('failed to fetch ' + url + ': ' + str(exc))
        # Pause between requests to avoid hammering the server.
        time.sleep(3)


if __name__ == '__main__':
    # For a single-page trial run, call e.g. get_text(server + '11370.html', 1).
    main()