# python-archieve-projects/M303/crawler.py


import time
import requests
from bs4 import BeautifulSoup
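
# Crawls the online text of 《红楼梦》 (Dream of the Red Chamber) chapter by
# chapter from www.gudianmingzhu.com, one page per request, and appends the
# extracted paragraphs to a local text file.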
def get_text(url, page):
    r = requests.get(url)
    r.encoding = 'utf-8'
    content = r.text
    bs = BeautifulSoup(content, 'html.parser')
    # Earlier attempt: grab the chapter container div directly.
    # try_1 = bs.find('div', attrs={'class': 'xxtt'})
    # if try_1 is None:
    #     text = bs.find('div', attrs={'class': 'sss'})
    # else:
    #     text = try_1
    paras = bs.find_all('p')
    with open('红楼梦.txt', 'at', encoding='utf-8') as f:
        # Mark the start of this chapter with its page number.
        f.write(str(page))
        for para in paras:
            # Drop the surrounding <p> tags and keep the paragraph text.
            ls = str(para).replace('<p>', '').replace('</p>', '')
            f.write(ls)
            f.write('\n')
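
# A minimal hardening sketch, not part of the original script: the same page
# fetch with a timeout and an HTTP status check, assuming a failed request
# should skip the page rather than crash the whole run. The helper name
# fetch_html is hypothetical and is not called by the code below.
def fetch_html(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print('Failed to fetch ' + url + ': ' + str(e))
        return None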

server = 'http://www.gudianmingzhu.com/guji/hongloumeng/'
# The 120 chapter pages follow a simple numeric URL pattern: <server>/<index>.html
page = 1
for index in range(11368, 11488):
    url = server + str(index) + '.html'
    print('Crawling ' + url)
    get_text(url, page)
    time.sleep(3)  # pause between requests to avoid hammering the server
    page += 1
# Single-page test call:
# get_text('http://www.gudianmingzhu.com/guji/hongloumeng/11370.html', 1)