说点什么

没什么特别要说的:这是一个工具脚本,用来批量下载 VLDB 论文集页面(默认是 PVLDB 的 vol10.html)上链接的 PDF。完整代码在下面,稍作修改(例如更换页面地址和链接过滤条件)后也可以用于其他网站。

代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  6 11:04:53 2017

@author: chih
"""

from bs4 import BeautifulSoup
import requests

# URL of the index page that is fetched and scanned for PDF links; also the
# default ``base_url`` of VldbCrawler.
BASE_URL = 'http://www.vldb.org/pvldb/vol10.html'

class VldbCrawler:
    """Download every PDF linked from a proceedings index page.

    The crawler fetches one HTML page, collects the ``<a>`` links that point
    at PDF files hosted on ``www.vldb.org`` and saves each of them into the
    current working directory, named after the link text.
    """

    def __init__(self, base_url=BASE_URL):
        """Remember which page to crawl.

        Args:
            base_url: URL of the HTML page whose links are scanned.
        """
        self.base_url = base_url

    @staticmethod
    def _pdf_filename(title, href):
        """Build a local file name for a PDF from its link text.

        Args:
            title: Visible text of the link.
            href: Target URL of the link; used only when ``title`` is blank.

        Returns:
            A file name ending in ``.pdf`` that contains no path separators.
        """
        name = title.strip()
        if not name:
            # A link without text would otherwise be saved as ".pdf"; fall
            # back to the last path segment of the URL.
            name = href.rstrip('/').rsplit('/', 1)[-1]
            if name.lower().endswith('.pdf'):
                name = name[:-len('.pdf')]
        # Paper titles may contain slashes; keep the file in the current
        # directory instead of failing or writing somewhere else.
        name = name.replace('/', '_').replace('\\', '_')
        return name + ".pdf"

    def fetch_download_link(self):
        """Fetch the index page and save every linked PDF to disk.

        Only links whose target contains both ``.pdf`` and ``www.vldb.org``
        are downloaded. When several links produce the same file name, the
        last one on the page wins. A PDF whose download returns an HTTP
        error status is reported on stdout and skipped, so one broken link
        does not abort the whole run.

        Raises:
            requests.HTTPError: If the index page itself responds with an
                error status.
        """
        # Honour the URL given to the constructor rather than the module
        # constant.
        page = requests.get(self.base_url)
        page.raise_for_status()

        # href=True skips anchors without a target (e.g. named anchors),
        # which would otherwise raise KeyError.
        anchors = BeautifulSoup(page.text, "lxml").find_all('a', href=True)

        # Filter before building the mapping so that a non-PDF link sharing
        # its text with a PDF link cannot displace it.
        pdf_map = {}
        for anchor in anchors:
            href = anchor['href']
            if ".pdf" in href and "www.vldb.org" in href:
                pdf_map[self._pdf_filename(anchor.text, href)] = href

        for filename, href in pdf_map.items():
            pdf_r = requests.get(href)
            if not pdf_r.ok:
                # Do not store an HTML error page under a .pdf name.
                print("skipped %s: HTTP %s" % (href, pdf_r.status_code))
                continue
            with open(filename, "wb") as pdf_file:
                pdf_file.write(pdf_r.content)

    def run(self):
        """Run the crawl: download all PDFs linked from ``base_url``."""
        self.fetch_download_link()


# Script entry point: crawl the default page when executed directly.
if __name__ == "__main__":
    VldbCrawler().run()