#!/usr/bin/python

# Usage:
#
#  In a terminal/command line, cd to the directory where this file lives. Then...
#
#  With embedded urls: ( download the hardcoded list of files in the 'files =' block below)
#
#       python ./download-all-2024-09-12_08-56-51.py
#
#  Download all files in a Metalink/CSV: (downloaded from ASF Vertex)
#
#       python ./download-all-2024-09-12_08-56-51.py /path/to/downloads.metalink localmetalink.metalink localcsv.csv
#
#  Compatibility: python >= 2.6.5, 2.7.5, 3.0
#
#  If downloading from a trusted source with invalid SSL Certs, use --insecure to ignore
#
#  For more information on bulk downloads, navigate to:
#     https://www.asf.alaska.edu/data-tools/bulk-download/
#
#
#
# This script was generated by the Alaska Satellite Facility's bulk download service.
# For more information on the service, navigate to:
#     http://bulk-download.asf.alaska.edu/help
#

import sys, csv
import os, os.path
import tempfile, shutil
import re
import base64
import time
import getpass
import ssl
import signal
import socket

import xml.etree.ElementTree as ET

#############
# This next block is a bunch of Python 2/3 compatibility

try:
    # Python 2.x Libs
    from urllib2 import build_opener, install_opener, Request, urlopen, HTTPError
    from urllib2 import URLError, HTTPSHandler, HTTPHandler, HTTPCookieProcessor

    from cookielib import MozillaCookieJar
    from StringIO import StringIO

except ImportError as e:
    # Python 3.x Libs
    from urllib.request import build_opener, install_opener, Request, urlopen
    from urllib.request import HTTPHandler, HTTPSHandler, HTTPCookieProcessor
    from urllib.error import HTTPError, URLError

    from http.cookiejar import MozillaCookieJar
    from io import StringIO

###
# Global variables intended for cross-thread modification
abort = False

###
# A routine that handles trapped signals
def signal_handler(sig, frame):
    global abort
    sys.stderr.write("\n > Caught Signal. Exiting!\n")
    abort = True  # necessary to cause the program to stop
    raise SystemExit  # this will only abort the thread that the ctrl+c was caught in


class bulk_downloader:
    def __init__(self):
        # List of files to download
        self.files = [ ]

        # Local stash of cookies so we don't always have to ask
        self.cookie_jar_path = os.path.join( os.path.expanduser('~'), ".bulk_download_cookiejar.txt")
        self.cookie_jar = None

        self.asf_urs4 = { 'url': 'https://urs.earthdata.nasa.gov/oauth/authorize',
                          'client': 'BO_n7nTIlMljdvU6kRRB3g',
                          'redir': 'https://auth.asf.alaska.edu/login'}

        # Make sure we can write to our current directory
        if os.access(os.getcwd(), os.W_OK) is False:
            print ("WARNING: Cannot write to current path! Check permissions for {0}".format(os.getcwd()))
            exit(-1)
        # For SSL
        self.context = {}

        # Check if user handed in a Metalink or CSV:
        if len(sys.argv) > 0:
            download_files = []
            input_files = []
            for arg in sys.argv[1:]:
                if arg == '--insecure':
                    try:
                        ctx = ssl.create_default_context()
                        ctx.check_hostname = False
                        ctx.verify_mode = ssl.CERT_NONE
                        self.context['context'] = ctx
                    except AttributeError:
                        # Python 2.6 won't complain about SSL Validation
                        pass

                elif arg.endswith('.metalink') or arg.endswith('.csv'):
                    if os.path.isfile( arg ):
                        input_files.append( arg )
                        if arg.endswith('.metalink'):
                            new_files = self.process_metalink(arg)
                        else:
                            new_files = self.process_csv(arg)
                        if new_files is not None:
                            for file_url in (new_files):
                                download_files.append( file_url )
                    else:
                        print (" > I cannot find the input file you specified: {0}".format(arg))
                else:
                    print (" > Command line argument '{0}' makes no sense, ignoring.".format(arg))

            if len(input_files) > 0:
                if len(download_files) > 0:
                    print (" > Processing {0} downloads from {1} input files. ".format(len(download_files), len(input_files)))
                    self.files = download_files
                else:
                    print (" > I see you asked me to download files from {0} input files, but they had no downloads!".format(len(input_files)))
                    print (" > I'm super confused and exiting.")
                    exit(-1)

        # Make sure cookie_jar is good to go!
        self.get_cookie()

        # summary
        self.total_bytes = 0
        self.total_time = 0
        self.cnt = 0
        self.success = []
        self.failed = []
        self.skipped = []

    # Get and validate a cookie
    def get_cookie(self):
        if os.path.isfile(self.cookie_jar_path):
            self.cookie_jar = MozillaCookieJar()
            self.cookie_jar.load(self.cookie_jar_path)

            # make sure cookie is still valid
            if self.check_cookie():
                print(" > Re-using previous cookie jar.")
                return True
            else:
                print(" > Could not validate old cookie Jar")

        # We don't have a valid cookie, prompt user for creds
        print ("No existing URS cookie found, please enter Earthdata username & password:")
        print ("(Credentials will not be stored, saved or logged anywhere)")

        # Keep trying 'till user gets the right U:P
        while self.check_cookie() is False:
            self.get_new_cookie()

        return True

    # Validate cookie before we begin
    def check_cookie(self):
        if self.cookie_jar is None:
            print (" > Cookiejar is bunk: {0}".format(self.cookie_jar))
            return False

        # File we know is valid, used to validate cookie
        file_check = 'https://urs.earthdata.nasa.gov/profile'

        # Apply custom Redirect Handler
        opener = build_opener(HTTPCookieProcessor(self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
        install_opener(opener)

        # Attempt a HEAD request
        request = Request(file_check)
        request.get_method = lambda : 'HEAD'
        try:
            print (" > attempting to download {0}".format(file_check))
            response = urlopen(request, timeout=30)
            resp_code = response.getcode()
            # Make sure we're logged in
            if not self.check_cookie_is_logged_in(self.cookie_jar):
                return False

            # Save cookiejar
            self.cookie_jar.save(self.cookie_jar_path)

        except HTTPError:
            # If we get this error, again, it likely means the user has not agreed to the current EULA
            print ("\nIMPORTANT: ")
            print ("Your user appears to lack permissions to download data from the ASF Datapool.")
            print ("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
            exit(-1)
        # These return codes indicate the USER has not been approved to download the data
        if resp_code in (300, 301, 302, 303):
            try:
                redir_url = response.info().getheader('Location')
            except AttributeError:
                redir_url = response.getheader('Location')

            # Funky Test env:
            if ("vertex-retired.daac.asf.alaska.edu" in redir_url and "test" in self.asf_urs4['redir']):
                print ("Cough, cough. It's dusty in this test env!")
                return True

            print ("Redirect ({0}) occurred, invalid cookie value!".format(resp_code))
            return False

        # These are successes!
        if resp_code in (200, 307):
            return True

        return False

    def get_new_cookie(self):
        # Start by prompting user to input their credentials

        # Another Python2/3 workaround
        try:
            new_username = raw_input("Username: ")
        except NameError:
            new_username = input("Username: ")
        new_password = getpass.getpass(prompt="Password (will not be displayed): ")

        # Build URS4 Cookie request
        auth_cookie_url = self.asf_urs4['url'] + '?client_id=' + self.asf_urs4['client'] + '&redirect_uri=' + self.asf_urs4['redir'] + '&response_type=code&state='

        try:
            # python2
            user_pass = base64.b64encode (bytes(new_username+":"+new_password))
        except TypeError:
            # python3
            user_pass = base64.b64encode (bytes(new_username+":"+new_password, "utf-8"))
            user_pass = user_pass.decode("utf-8")

        # Authenticate against URS, grab all the cookies
        self.cookie_jar = MozillaCookieJar()
        opener = build_opener(HTTPCookieProcessor(self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
        request = Request(auth_cookie_url, headers={"Authorization": "Basic {0}".format(user_pass)})

        # Watch out for cookie rejection!
        try:
            response = opener.open(request)
        except HTTPError as e:
            if "WWW-Authenticate" in e.headers and "Please enter your Earthdata Login credentials" in e.headers["WWW-Authenticate"]:
                print (" > Username and Password combo was not successful. Please try again.")
                return False
            else:
                # If an error happens here, the user most likely has not confirmed EULA.
                print ("\nIMPORTANT: There was an error obtaining a download cookie!")
                print ("Your user appears to lack permission to download data from the ASF Datapool.")
                print ("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
                exit(-1)
        except URLError as e:
            print ("\nIMPORTANT: There was a problem communicating with URS, unable to obtain cookie. ")
            print ("Try cookie generation later.")
            exit(-1)

        # Did we get a cookie?
        if self.check_cookie_is_logged_in(self.cookie_jar):
            # COOKIE SUCCESS!
            self.cookie_jar.save(self.cookie_jar_path)
            return True

        # if we aren't successful generating the cookie, nothing will work. Stop here!
        print ("WARNING: Could not generate new cookie! Cannot proceed. Please try Username and Password again.")
        print ("Response was {0}.".format(response.getcode()))
        print ("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
        exit(-1)

    # make sure we're logged into URS
    def check_cookie_is_logged_in(self, cj):
        for cookie in cj:
            if cookie.name == 'urs_user_already_logged':
                # Only get this cookie if we logged in successfully!
                return True

        return False

    # Download the file
    def download_file_with_cookiejar(self, url, file_count, total, recursion=False):
        # see if we've already downloaded this file and, if so, that it is the correct size
        download_file = os.path.basename(url).split('?')[0]
        if os.path.isfile(download_file):
            try:
                request = Request(url)
                request.get_method = lambda : 'HEAD'
                response = urlopen(request, timeout=30)
                remote_size = self.get_total_size(response)
                # Check that we were able to derive a size.
                if remote_size:
                    local_size = os.path.getsize(download_file)
                    if remote_size < (local_size+(local_size*.01)) and remote_size > (local_size-(local_size*.01)):
                        print (" > Download file {0} exists! \n > Skipping download of {1}. ".format(download_file, url))
                        return None,None
                    # partial file size wasn't full file size, let's blow away the chunk and start again
                    print (" > Found {0} but it wasn't fully downloaded. Removing file and downloading again.".format(download_file))
                    os.remove(download_file)

            except ssl.CertificateError as e:
                print (" > ERROR: {0}".format(e))
                print (" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
                return False,None

            except HTTPError as e:
                if e.code == 401:
                    print (" > IMPORTANT: Your user may not have permission to download this type of data!")
                else:
                    print (" > Unknown Error, Could not get file HEAD: {0}".format(e))

            except URLError as e:
                print ("URL Error (from HEAD): {0}, {1}".format( e.reason, url))
                if "ssl.c" in "{0}".format(e.reason):
                    print ("IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
                return False,None

        # attempt https connection
        try:
            request = Request(url)
            response = urlopen(request, timeout=30)

            # Watch for redirect
            if response.geturl() != url:

                # See if we were redirected BACK to URS for re-auth.
                if 'https://urs.earthdata.nasa.gov/oauth/authorize' in response.geturl():

                    if recursion:
                        print (" > Entering seemingly endless auth loop. Aborting. ")
                        return False, None

                    # make this easier. If there is no app_type=401, add it
                    new_auth_url = response.geturl()
                    if "app_type" not in new_auth_url:
                        new_auth_url += "&app_type=401"

                    print (" > While attempting to download {0}....".format(url))
                    print (" > Need to obtain new cookie from {0}".format(new_auth_url))
                    old_cookies = [cookie.name for cookie in self.cookie_jar]
                    opener = build_opener(HTTPCookieProcessor(self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
                    request = Request(new_auth_url)
                    try:
                        response = opener.open(request)
                        for cookie in self.cookie_jar:
                            if cookie.name not in old_cookies:
                                print (" > Saved new cookie: {0}".format(cookie.name))

                                # A little hack to save session cookies
                                if cookie.discard:
                                    cookie.expires = int(time.time()) + 60*60*24*30
                                    print (" > Saving session Cookie that should have been discarded! ")

                        self.cookie_jar.save(self.cookie_jar_path, ignore_discard=True, ignore_expires=True)
                    except HTTPError as e:
                        print ("HTTP Error: {0}, {1}".format( e.code, url))
                        return False,None

                    # Okay, now we have more cookies! Let's try again, recursively!
                    print (" > Attempting download again with new cookies!")
                    return self.download_file_with_cookiejar(url, file_count, total, recursion=True)

                print (" > 'Temporary' Redirect download @ Remote archive:\n > {0}".format(response.geturl()))

            # seems to be working
            print ("({0}/{1}) Downloading {2}".format(file_count, total, url))

            # Open our local file for writing and build status bar
            tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False, dir='.')
            self.chunk_read(response, tf, report_hook=self.chunk_report)

            # Reset download status
            sys.stdout.write('\n')

            tempfile_name = tf.name
            tf.close()

        # handle errors
        except HTTPError as e:
            print ("HTTP Error: {0}, {1}".format( e.code, url))

            if e.code == 401:
                print (" > IMPORTANT: Your user does not have permission to download this type of data!")

            if e.code == 403:
                print (" > Got a 403 Error trying to download this file. ")
                print (" > You MAY need to log in to this app and agree to an EULA. ")

            return False,None

        except URLError as e:
            print ("URL Error (from GET): {0}, {1}, {2}".format(e, e.reason, url))
            if "ssl.c" in "{0}".format(e.reason):
                print ("IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
            return False,None

        except socket.timeout as e:
            print (" > timeout requesting: {0}; {1}".format(url, e))
            return False,None

        except ssl.CertificateError as e:
            print (" > ERROR: {0}".format(e))
            print (" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
            return False,None

        # Return the file size
        shutil.copy(tempfile_name, download_file)
        os.remove(tempfile_name)
        file_size = self.get_total_size(response)
        actual_size = os.path.getsize(download_file)
        if file_size is None:
            # We were unable to calculate file size.
            file_size = actual_size
        return actual_size,file_size

    def get_redirect_url_from_error(self, error):
        find_redirect = re.compile(r"id=\"redir_link\"\s+href=\"(\S+)\"")
        print ("error file was: {}".format(error))
        redirect_url = find_redirect.search(error)
        if redirect_url:
            print("Found: {0}".format(redirect_url.group(0)))
            return (redirect_url.group(0))

        return None

    # chunk_report taken from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
    def chunk_report(self, bytes_so_far, file_size):
        if file_size is not None:
            percent = float(bytes_so_far) / file_size
            percent = round(percent*100, 2)
            sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r" % (bytes_so_far, file_size, percent))
        else:
            # We couldn't figure out the size.
            sys.stdout.write(" > Downloaded %d of unknown Size\r" % (bytes_so_far))

    # chunk_read modified from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
    def chunk_read(self, response, local_file, chunk_size=8192, report_hook=None):
        file_size = self.get_total_size(response)
        bytes_so_far = 0

        while 1:
            try:
                chunk = response.read(chunk_size)
            except:
                sys.stdout.write("\n > There was an error reading data. \n")
                break
            try:
                local_file.write(chunk)
            except TypeError:
                local_file.write(chunk.decode(local_file.encoding))
            bytes_so_far += len(chunk)

            if not chunk:
                break

            if report_hook:
                report_hook(bytes_so_far, file_size)

        return bytes_so_far

    def get_total_size(self, response):
        try:
            file_size = response.info().getheader('Content-Length').strip()
        except AttributeError:
            try:
                file_size = response.getheader('Content-Length').strip()
            except AttributeError:
                print ("> Problem getting size")
                return None

        return int(file_size)

    # Get download urls from a metalink file
    def process_metalink(self, ml_file):
        print ("Processing metalink file: {0}".format(ml_file))
        with open(ml_file, 'r') as ml:
            xml = ml.read()
        # Hack to remove annoying namespace
        it = ET.iterparse(StringIO(xml))
        for _, el in it:
            if '}' in el.tag:
                el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
        root = it.root
        dl_urls = []
        ml_files = root.find('files')
        for dl in ml_files:
            dl_urls.append(dl.find('resources').find('url').text)
        if len(dl_urls) > 0:
            return dl_urls
        else:
            return None

    # Get download urls from a csv file
    def process_csv(self, csv_file):
        print ("Processing csv file: {0}".format(csv_file))
        dl_urls = []
        with open(csv_file, 'r') as csvf:
            try:
                csvr = csv.DictReader(csvf)
                for row in csvr:
                    dl_urls.append(row['URL'])
            except csv.Error as e:
                print ("WARNING: Could not parse file %s, line %d: %s. Skipping." % (csv_file, csvr.line_num, e))
                return None
            except KeyError as e:
                print ("WARNING: Could not find URL column in file %s. Skipping." % (csv_file))

        if len(dl_urls) > 0:
            return dl_urls
        else:
            return None
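    # Illustrative sketch, not part of the generated script: the two parsers above
    # make only minimal assumptions about their inputs. process_csv() needs a header
    # row containing a 'URL' column (extra columns are ignored), e.g.:
    #
    #     Granule Name,URL
    #     EXAMPLE_GRANULE,https://example.com/EXAMPLE_GRANULE.zip
    #
    # process_metalink() only walks files/file/resources/url, with any XML namespace
    # stripped first, so a minimal metalink would look like:
    #
    #     <metalink>
    #       <files>
    #         <file name="EXAMPLE_GRANULE.zip">
    #           <resources>
    #             <url>https://example.com/EXAMPLE_GRANULE.zip</url>
    #           </resources>
    #         </file>
    #       </files>
    #     </metalink>
    #
    # The granule name and URLs above are placeholders, not real ASF downloads.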
    # Download all the files in the list
    def download_files(self):
        for file_name in self.files:

            # make sure we haven't ctrl+c'd or some other abort trap
            if abort == True:
                raise SystemExit

            # download counter
            self.cnt += 1

            # set a timer
            start = time.time()

            # run download
            size,total_size = self.download_file_with_cookiejar(file_name, self.cnt, len(self.files))

            # calculate rate
            end = time.time()

            # stats:
            if size is None:
                self.skipped.append(file_name)
            # Check to see that the download didn't error and is the correct size
            elif size is not False and (total_size < (size+(size*.01)) and total_size > (size-(size*.01))):
                # Download was good!
                elapsed = end - start
                elapsed = 1.0 if elapsed < 1 else elapsed
                rate = (size/1024.0**2)/elapsed

                print ("Downloaded {0}b in {1:.2f}secs, Average Rate: {2:.2f}MB/sec".format(size, elapsed, rate))

                # add up metrics
                self.total_bytes += size
                self.total_time += elapsed
                self.success.append( {'file':file_name, 'size':size } )

            else:
                print ("There was a problem downloading {0}".format(file_name))
                self.failed.append(file_name)

    def print_summary(self):
        # Print summary:
        print ("\n\nDownload Summary ")
        print ("--------------------------------------------------------------------------------")
        print (" Successes: {0} files, {1} bytes ".format(len(self.success), self.total_bytes))
        for success_file in self.success:
            print (" - {0} {1:.2f}MB".format(success_file['file'], (success_file['size']/1024.0**2)))
        if len(self.failed) > 0:
            print (" Failures: {0} files".format(len(self.failed)))
            for failed_file in self.failed:
                print (" - {0}".format(failed_file))
        if len(self.skipped) > 0:
            print (" Skipped: {0} files".format(len(self.skipped)))
            for skipped_file in self.skipped:
                print (" - {0}".format(skipped_file))
        if len(self.success) > 0:
            print (" Average Rate: {0:.2f}MB/sec".format( (self.total_bytes/1024.0**2)/self.total_time))
        print ("--------------------------------------------------------------------------------")


if __name__ == "__main__":
    # Set up a signal trap for SIGINT (Ctrl+C)
    signal.signal(signal.SIGINT, signal_handler)

    downloader = bulk_downloader()
    downloader.download_files()
    downloader.print_summary()
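# ------------------------------------------------------------------------------
# Illustrative sketch, not part of the generated script: besides the command-line
# usage shown at the top of this file, the class above can be driven from other
# Python code by filling in the 'files' list directly (the same list the
# "embedded urls" mode uses). The module name and URL below are placeholders.
#
#     from download_all import bulk_downloader  # assumes the script was saved
#                                                # under an importable name
#     downloader = bulk_downloader()
#     downloader.files = ["https://example.com/EXAMPLE_GRANULE.zip"]
#     downloader.download_files()
#     downloader.print_summary()
# ------------------------------------------------------------------------------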