A Python script that scrapes all PDF links on a web page and downloads them
A few words
Not much to say here: it is a small utility script, the full code is below, and with a few tweaks it can be adapted to other pages (see the sketch after the code).
Code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 6 11:04:53 2017
@author: chih
"""
from bs4 import BeautifulSoup
import requests

BASE_URL = 'http://www.vldb.org/pvldb/vol10.html'


class VldbCrawler:
    def __init__(self, base_url=BASE_URL):
        self.base_url = base_url

    def fetch_download_link(self):
        # Fetch the index page (use self.base_url so the constructor
        # argument is actually honored) and collect every <a> tag.
        page = requests.get(self.base_url)
        anchors = BeautifulSoup(page.text, "lxml").find_all('a')
        # Map link text -> href, skipping anchors without an href attribute.
        # A dict comprehension is needed here: the original set literal
        # {text, href} has no defined order, so keys and values could swap.
        pdf_map = {a.text.strip(): a.attrs['href']
                   for a in anchors if 'href' in a.attrs}
        for title, href in pdf_map.items():
            # Download only links that point at PDFs hosted on www.vldb.org.
            if ".pdf" in href and "www.vldb.org" in href:
                pdf_r = requests.get(href)
                # The link text is used verbatim as the local file name.
                with open(title + ".pdf", "wb") as pdf_file:
                    pdf_file.write(pdf_r.content)

    def run(self):
        self.fetch_download_link()


if __name__ == '__main__':
    vc = VldbCrawler()
    vc.run()
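Since the constructor already takes a base_url parameter, pointing the crawler at a different index page is a one-line change. A minimal sketch, assuming the class above is in scope; the vol11.html address is a hypothetical example URL, not one taken from the original script:

# Reuse the crawler on another index page of the same shape.
# 'vol11.html' below is an assumed example URL for illustration.
vc = VldbCrawler('http://www.vldb.org/pvldb/vol11.html')
vc.run()

For a different site, the ".pdf" and "www.vldb.org" filters inside fetch_download_link would need to change as well; and for large files, passing stream=True to requests.get avoids buffering each whole PDF in memory.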