Wouldn't it be cool to download every file of a given type from a site, instead of wasting time doing it by hand?
There are a couple of challenges in doing this with Python (a minimal sketch of both follows the list):
1. how to accept cookies (http.cookiejar)
2. how to parse a page into its elements (BeautifulSoup)
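Both pieces are small on their own. As a quick sketch of how they fit together (the URL is only a placeholder), a cookie-aware opener fetches the page and BeautifulSoup turns it into a tree we can query:

import http.cookiejar
import urllib.request
from bs4 import BeautifulSoup

# A CookieJar remembers any cookies the server sets between requests.
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

page = opener.open("http://example.com/")  # placeholder URL
soup = BeautifulSoup(page, "html.parser")
links = [a.get("href") for a in soup.find_all("a")]  # all hrefs on the page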
Here is the code:
"""
downloadAllPdfs.py
Downloads all the pdf on the supplied URL, and saves them to the
specified output file ("/test/" by default)
Usage:
python downloadAllPdfs.py http://example.com/ [output]
"""
from bs4 import BeautifulSoup
import urllib.request as urllib2
import http.cookiejar
import sys
def _downloadFileFromServer(finalFileUri,outputpath):
print("==> _downloadFileFromServer %s" % finalFileUri )
filename=finalFileUri.rsplit('/', 1)
cj = http.cookiejar.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
page = opener.open(finalFileUri)
data = page.read()
page.close()
opener.close()
print("==> _downloadFileFromServer %s" % outputpath+filename[1] )
FILE = open(outputpath+filename[1], "wb")
FILE.write(data)
FILE.close()
def _downloadFolder(url,outputpath):
print("\tdownload %s" % url )
cj = http.cookiejar.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
page = opener.open(url)
soup = BeautifulSoup(page)
#print (soup.prettify(None,"minimal"))
for lnk in soup.findAll("a"):
print ("\t\tcheck lnk %s" % lnk)
if lnk.has_attr('href'):
filename = lnk["href"]
else:
continue
if filename is None:
continue
if filename.find(".pdf") >=0 or filename.find(".zip") >=0 :
finalFile=filename
if filename.find("http://")<0 :
finalFile=url + filename
_downloadFileFromServer(finalFile,outputpath)
print("DONE download %s" % url )
print("" )
def main():
print ("==init==")
url=sys.argv[1]
outputpath=sys.argv[2]+"\\"
print ("download from %s"%url)
print ("to %s"%outputpath)
_downloadFolder(url,outputpath)
print ("==done==")
if __name__ == "__main__":
main()
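For example, to pull everything from a docs page into a downloads/ folder (both the URL and the folder name here are just placeholders):

python downloadAllPdfs.py http://example.com/docs/ downloads/

If you leave the second argument out, the script falls back to the test/ directory mentioned in the docstring, creating it if it does not exist yet.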