37 lines
961 B
Python
37 lines
961 B
Python
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
def get_text(url, page):
    """Download one chapter page and append its paragraphs to ``红楼梦.txt``.

    The page at *url* is fetched, decoded as UTF-8 and parsed with
    BeautifulSoup's ``html.parser``. The heading ``第<page>回`` is appended
    to the output file first (without a trailing newline, so the first
    paragraph continues on the same line), followed by every ``<p>``
    element of the page, one per line.

    Args:
        url: Address of the chapter page to download.
        page: Chapter number used in the heading.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail loudly instead of appending an error page's text to the book.
    r.raise_for_status()
    r.encoding = 'utf-8'
    bs = BeautifulSoup(r.text, 'html.parser')
    paragraphs = bs.find_all('p')

    # Explicit encoding: without it the platform default codec is used, which
    # may be unable to encode the Chinese text or may produce a non-UTF-8 file.
    with open('红楼梦.txt', 'at', encoding='utf-8') as f:
        f.write("第" + str(page) + "回")
        for para in paragraphs:
            # Only the bare <p> / </p> markers are removed; attributes on the
            # tag and any nested markup are written out unchanged.
            ls = str(para).replace('<p>', '').replace('</p>', '')
            f.write(ls)
            f.write('\n')
|
|
|
|
|
|
|
|
# Base address of the chapter pages; each page lives at <server><id>.html.
server = 'http://www.gudianmingzhu.com/guji/hongloumeng/'

# The page ids are consecutive integers (11368 through 11487, 120 pages), so
# the loop walks the id range and numbers the chapters from 1 in that order.
for page, index in enumerate(range(11368, 11488), start=1):
    url = server + str(index) + '.html'
    print('正在爬取' + url)
    get_text(url, page)
    # Pause between requests (presumably to go easy on the server).
    time.sleep(3)
|
|
|
|
# get_text('http://www.gudianmingzhu.com/guji/hongloumeng/11370.html', 1) |