python-archieve-projects/2.27 论文爬虫/爬虫.py

43 lines
1.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import wget
from bs4 import BeautifulSoup
from urllib import request
def getUrl(url, year='2020'):
    """Scrape paper links from an index page and write them to ``./url.txt``.

    The page at *url* is fetched and parsed, and every qualifying ``<a>``
    element produces one output line of the form ``<n>-<title>[]<pdf url>``:

    * ``<n>`` is a counter starting at 1,
    * ``<title>`` is the anchor's ``.string``,
    * ``<pdf url>`` is built from the part of the last ``href`` path segment
      that precedes its first ``-``.

    An anchor is skipped when it has no ``.string``, has no ``href``, when its
    ``href`` is ``'javascript:;'``, ``'#'`` or ``'/'``, or when its ``href``
    contains ``'proceedings'``.

    Args:
        url: Address of the index page to fetch.
        year: Year segment inserted into the generated PDF links. Defaults to
            ``'2020'``, the value that used to be hard-coded, so existing
            callers get the same output. Should match the year in *url*.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    pdf_prefix = 'https://proceedings.neurips.cc/paper/' + str(year) + '/file/'
    # Running counter used as the numeric prefix of each entry.
    index = 1
    # The context manager closes the output file even if a write raises.
    with open('./url.txt', mode='w', encoding='utf-8') as out:
        for item in soup.find_all('a'):
            # .get() returns None for anchors without an href, where
            # item['href'] would raise KeyError and abort the whole run.
            link = item.get('href')
            if item.string is None or link is None:
                continue
            if link in ('javascript:;', '#', '/') or 'proceedings' in link:
                continue
            paper_id = link.split('/')[-1].split('-')[0]
            # '?' is stripped from the "<n>-<title>[]" prefix; presumably
            # because downFile later uses that prefix as a file name.
            out.write((str(index) + '-' + item.string + '[]').replace('?', ''))
            out.write(pdf_prefix + paper_id + '-Paper.pdf' + '\n')
            index += 1
def downFile(filepath):
    """Download every PDF listed in the link file at *filepath*.

    Each usable line looks like ``<name>[]<url>``. The text before the first
    ``[]`` plus ``.pdf`` is used as the local file name, and the text after
    the last ``[]`` is the address passed to ``wget.download``. Lines without
    a ``[]`` separator, or with nothing after it, are skipped.

    Args:
        filepath: Path of the UTF-8 text file to read. This argument was
            previously ignored in favour of a hard-coded ``'url.txt'``.
    """
    # The context manager closes the list file even if a download raises.
    with open(filepath, mode='r', encoding='utf-8') as listing:
        for line in listing:
            # Drop the line terminator so it does not end up inside the URL.
            entry = line.rstrip('\r\n')
            if '[]' not in entry:
                continue
            parts = entry.split('[]')
            file_name = parts[0] + '.pdf'
            link = parts[-1].strip()
            if not link:
                continue
            print(file_name)
            print(link)
            wget.download(link, file_name)  # download
            # Report completion only after the download call has returned.
            print("已下载。")
if __name__ == '__main__':
    # Step 1: collect the paper links for one proceedings year.
    # TODO: change the year here
    getUrl('https://proceedings.neurips.cc/paper/2020')
    # Step 2 (disabled): download the PDFs listed in the generated file.
    #downFile('./url.txt')