# python-archieve-projects/M303/crawler.py


import time
import requests
from bs4 import BeautifulSoup
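
# Crawls the online text of 《红楼梦》 (Dream of the Red Chamber) chapter by
# chapter from www.gudianmingzhu.com, one page per request, and appends the
# extracted paragraphs to a local text file.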
def get_text(url, page):
    r = requests.get(url)
    r.encoding = 'utf-8'
    content = r.text
    bs = BeautifulSoup(content, 'html.parser')
    # Earlier attempt: grab the chapter container div directly.
    # try_1 = bs.find('div', attrs={'class': 'xxtt'})
    # if try_1 is None:
    #     text = bs.find('div', attrs={'class': 'sss'})
    # else:
    #     text = try_1
    paras = bs.find_all('p')
    with open('红楼梦.txt', 'at', encoding='utf-8') as f:
        # Mark the start of this chapter with its page number.
        f.write(str(page))
        for para in paras:
            # Drop the surrounding <p> tags and keep the paragraph text.
            ls = str(para).replace('<p>', '').replace('</p>', '')
            f.write(ls)
            f.write('\n')
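
# A minimal hardening sketch, not part of the original script: the same page
# fetch with a timeout and an HTTP status check, assuming a failed request
# should skip the page rather than crash the whole run. The helper name
# fetch_html is hypothetical and is not called by the code below.
def fetch_html(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print('Failed to fetch ' + url + ': ' + str(e))
        return None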

server = 'http://www.gudianmingzhu.com/guji/hongloumeng/'
# The 120 chapter pages follow a simple numeric URL pattern: <server>/<index>.html
page = 1
for index in range(11368, 11488):
    url = server + str(index) + '.html'
    print('Crawling ' + url)
    get_text(url, page)
    time.sleep(3)  # pause between requests to avoid hammering the server
    page += 1
# Single-page test call:
# get_text('http://www.gudianmingzhu.com/guji/hongloumeng/11370.html', 1)