lunasuavis: python, download all files from a site

Hi ,
will not be cool to download all file type from a site, just to not delay to much time to do it.

There are some challenges to do it with Python:

1. how to accept cookies (cookiejar)
2. how to decode a site into elements (BeautifulSoup)

Here is the code:

"""
downloadAllPdfs.py
    Downloads all the pdf on the supplied URL, and saves them to the
    specified output file ("/test/" by default)

Usage:
    python downloadAllPdfs.py http://example.com/ [output]
"""

from bs4 import BeautifulSoup 
import urllib.request as urllib2
import http.cookiejar
import sys

def _downloadFileFromServer(finalFileUri,outputpath):
    print("==> _downloadFileFromServer %s" % finalFileUri )
    filename=finalFileUri.rsplit('/', 1)

    cj = http.cookiejar.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    page = opener.open(finalFileUri)
    data = page.read()
    page.close()
    opener.close()

    print("==> _downloadFileFromServer %s" % outputpath+filename[1] )

    FILE = open(outputpath+filename[1], "wb")
    FILE.write(data)
    FILE.close()

def _downloadFolder(url,outputpath):
    print("\tdownload %s" % url )
    cj = http.cookiejar.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    
    page = opener.open(url)
    soup = BeautifulSoup(page)
    #print (soup.prettify(None,"minimal"))
    
    for lnk in soup.findAll("a"):
        print ("\t\tcheck lnk %s" % lnk)
        
        
        if lnk.has_attr('href'):
            filename = lnk["href"]
        else:
            continue
        
        
        if filename is None:
            continue


        if filename.find(".pdf") >=0 or filename.find(".zip") >=0  :
            finalFile=filename
            if filename.find("http://")<0 :
                finalFile=url + filename
            _downloadFileFromServer(finalFile,outputpath)
            
    print("DONE download %s" % url )
    print("" )
            

def main():
    print ("==init==")
    url=sys.argv[1]
    outputpath=sys.argv[2]+"\\"
    print ("download from %s"%url)
    print ("to %s"%outputpath)

    _downloadFolder(url,outputpath)
    print ("==done==")


if __name__ == "__main__":
    main()

lunasuavis

domingo, 24 de janeiro de 2016

python, download all files from a site

Java Obfuscate Properties

Hits

Denunciar abuso