Source code for pyTMD.datasets.fetch_test_data

#!/usr/bin/env python
"""
fetch_test_data.py
Written by Tyler Sutterley (04/2026)
Download files necessary to run the test suite

CALLING SEQUENCE:
    python fetch_test_data.py

COMMAND LINE OPTIONS:
    --help: list the command line options
    -D X, --directory X: working data directory
    -P X, --provider X: data provider(s) to try in order ('figshare' and/or 'zenodo')
    -t X, --timeout X: timeout in seconds for blocking operations
    -M X, --mode X: Local permissions mode of the files downloaded

PYTHON DEPENDENCIES:
    future: Compatibility layer between Python 2 and Python 3
        https://python-future.org/

PROGRAM DEPENDENCIES:
    utilities.py: download and management utilities for syncing files

UPDATE HISTORY:
    Updated 04/2026: check if needing to include algorithm in the hash
    Updated 03/2026: try multiple providers for fetching data
    Updated 12/2025: use URL class to build and operate on URLs
        add function to download from a zenodo article
        change default provider for test data to zenodo
    Updated 10/2025: change default directory for tide models to cache
    Written 10/2025
"""

import re
import ssl
import shutil
import logging
import pathlib
import zipfile
import argparse
import pyTMD.utilities

# default download directory: the cache path reported by pyTMD.utilities
_default_directory = pyTMD.utilities.get_cache_path()
# default SSL context, shared with pyTMD.utilities
_default_ssl_context = pyTMD.utilities._default_ssl_context
# base URLs of the figshare and Zenodo repository APIs
_figshare_api_url = "https://api.figshare.com/v2"
_zenodo_api_url = "https://zenodo.org/api"


def fetch_test_data(
    directory: str | pathlib.Path = _default_directory,
    provider: str = "zenodo",
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    provider: str, default 'zenodo'
        Data provider

        - ``'figshare'``
        - ``'zenodo'``
    mode: int, default 0o775
        Permissions mode of the download directory and of the
        output local files
    kwargs: dict
        Additional keyword arguments for data provider functions

    Raises
    ------
    ValueError
        If ``provider`` is not a known data provider
    """
    # select the download function before touching the file system
    # so that an unknown provider does not leave a directory behind
    if provider == "figshare":
        fetch = _figshare
    elif provider == "zenodo":
        fetch = _zenodo
    else:
        raise ValueError(f"Unknown data provider: {provider}")
    # create download directory if it doesn't exist
    directory = pyTMD.utilities.Path(directory).resolve()
    directory.mkdir(parents=True, exist_ok=True, mode=mode)
    # create logger for verbosity level
    logger = pyTMD.utilities.build_logger(__name__, level=logging.INFO)
    # forward the permissions mode so that downloaded files honor it
    # (it is a named parameter here and so never part of kwargs)
    fetch(directory=directory, logger=logger, mode=mode, **kwargs)


# PURPOSE: save a verified download to disk, unpacking zip archives
def _save_download(
    remote_buffer,
    name: str,
    directory: pathlib.Path,
    chunk: int = 16384,
    mode: int = 0o775,
):
    """
    Write a downloaded byte-stream into the local directory

    Files with a ``.zip`` suffix are extracted into ``directory``;
    any other file is copied to ``directory/name``

    Parameters
    ----------
    remote_buffer: obj
        Byte-stream of the remote file
    name: str
        Name of the remote file
    directory: pathlib.Path
        Download directory
    chunk: int, default 16384
        Chunk size for transfer encoding
    mode: int, default 0o775
        Permissions mode of output local files
    """
    if pathlib.Path(name).suffix == ".zip":
        # extract the zip file into the local directory
        with zipfile.ZipFile(remote_buffer) as archive:
            # extract each member and set permissions
            for member in archive.infolist():
                # extract() returns the normalized path it created,
                # which can differ from member.filename when the name
                # is absolute or contains ".." components
                extracted = archive.extract(member, path=directory)
                pathlib.Path(extracted).chmod(mode=mode)
    else:
        # write the file to the local directory
        local_file = directory.joinpath(name)
        with local_file.open(mode="wb") as fid:
            shutil.copyfileobj(remote_buffer, fid, chunk)
        # change the permissions mode
        local_file.chmod(mode=mode)


# PURPOSE: download data files from figshare
def _figshare(
    directory: str | pathlib.Path = _default_directory,
    article: str = "30260326",
    timeout: int | None = None,
    context: ssl.SSLContext = _default_ssl_context,
    chunk: int = 16384,
    logger: logging.Logger | None = None,
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite from figshare

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    article: str, default '30260326'
        figshare article number
    timeout: int or NoneType, default None
        Timeout in seconds for blocking operations
    context: obj, default pyTMD.utilities._default_ssl_context
        ``SSL`` context for ``urllib`` opener object
    chunk: int, default 16384
        Chunk size for transfer encoding
    logger: logging.Logger or NoneType, default None
        Logger for outputting file transfer information.
        Falls back to the module logger if not provided
    mode: int, default 0o775
        Permissions mode of output local file
    kwargs: dict
        Additional keyword arguments (accepted but not used)

    Raises
    ------
    Exception
        If the checksum of a downloaded file does not match
        the checksum supplied by figshare
    """
    # allow the directory to be given as a string
    if isinstance(directory, str):
        directory = pyTMD.utilities.Path(directory)
    # fall back to a module logger when none is supplied
    if logger is None:
        logger = logging.getLogger(__name__)
    # figshare API host
    HOST = pyTMD.utilities.URL(_figshare_api_url)
    articles_api = HOST.joinpath("articles", article)
    # Create and submit request and load JSON response
    response = articles_api.load(timeout=timeout, context=context)
    # for each file in the JSON response
    for entry in response["files"]:
        # check if needing to include algorithm in the hash comparison
        include_algorithm = re.match(r"md5\:", entry["supplied_md5"])
        # check if file already exists by matching MD5 checksums
        # NOTE(review): zip archives are extracted rather than stored, so
        # this check presumably never finds a local copy for them -- confirm
        local_file = directory.joinpath(entry["name"])
        original_md5 = pyTMD.utilities.get_hash(
            local_file, include_algorithm=include_algorithm
        )
        # skip download if checksums match
        if original_md5 == entry["supplied_md5"]:
            continue
        # download url for remote file
        download = pyTMD.utilities.URL(entry["download_url"])
        # output file information
        logger.info(download.urlname)
        # get remote file as a byte-stream
        remote_buffer = download.get(timeout=timeout, context=context)
        # verify MD5 checksums
        computed_md5 = pyTMD.utilities.get_hash(
            remote_buffer, include_algorithm=include_algorithm
        )
        # raise exception if checksums do not match
        if computed_md5 != entry["supplied_md5"]:
            raise Exception(f"Checksum mismatch: {download.urlname}")
        # download file or extract files from zip
        _save_download(
            remote_buffer, entry["name"], directory, chunk=chunk, mode=mode
        )


# PURPOSE: download data files from zenodo
def _zenodo(
    directory: str | pathlib.Path = _default_directory,
    record: str = "18091740",
    timeout: int | None = None,
    context: ssl.SSLContext = _default_ssl_context,
    chunk: int = 16384,
    logger: logging.Logger | None = None,
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite from zenodo

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    record: str, default '18091740'
        Zenodo record number
    timeout: int or NoneType, default None
        Timeout in seconds for blocking operations
    context: obj, default pyTMD.utilities._default_ssl_context
        ``SSL`` context for ``urllib`` opener object
    chunk: int, default 16384
        Chunk size for transfer encoding
    logger: logging.Logger or NoneType, default None
        Logger for outputting file transfer information.
        Falls back to the module logger if not provided
    mode: int, default 0o775
        Permissions mode of output local file
    kwargs: dict
        Additional keyword arguments (accepted but not used)

    Raises
    ------
    Exception
        If the checksum of a downloaded file does not match
        the checksum supplied by Zenodo
    """
    # allow the directory to be given as a string
    if isinstance(directory, str):
        directory = pyTMD.utilities.Path(directory)
    # fall back to a module logger when none is supplied
    if logger is None:
        logger = logging.getLogger(__name__)
    # zenodo API host
    HOST = pyTMD.utilities.URL(_zenodo_api_url)
    records_api = HOST.joinpath("records", record)
    # Create and submit request and load JSON response
    records_response = records_api.load(timeout=timeout, context=context)
    # get files from latest version of record
    version = str(records_response["id"])
    deposit_api = HOST.joinpath("deposit", "depositions", version, "files")
    # Create and submit request and load JSON response
    deposit_response = deposit_api.load(timeout=timeout, context=context)
    # for each file in the JSON response for deposits
    for entry in deposit_response:
        # check if file already exists by matching MD5 checksums
        # NOTE(review): zip archives are extracted rather than stored, so
        # this check presumably never finds a local copy for them -- confirm
        local_file = directory.joinpath(entry["filename"])
        # check if needing to include algorithm in the hash comparison
        include_algorithm = re.match(r"md5\:", entry["checksum"])
        original_md5 = pyTMD.utilities.get_hash(
            local_file, include_algorithm=include_algorithm
        )
        # skip download if checksums match
        if original_md5 == entry["checksum"]:
            continue
        # download url for remote file
        download = pyTMD.utilities.URL(entry["links"]["download"])
        # output file information
        logger.info(download.urlname)
        # get remote file as a byte-stream
        remote_buffer = download.get(timeout=timeout, context=context)
        # verify MD5 checksums
        computed_md5 = pyTMD.utilities.get_hash(
            remote_buffer, include_algorithm=include_algorithm
        )
        # raise exception if checksums do not match
        if computed_md5 != entry["checksum"]:
            raise Exception(f"Checksum mismatch: {download.urlname}")
        # download file or extract files from zip
        _save_download(
            remote_buffer, entry["filename"], directory, chunk=chunk, mode=mode
        )


# PURPOSE: create argument parser
def arguments():
    """
    Create the command line argument parser

    Returns
    -------
    parser: argparse.ArgumentParser
        Parser for the ``--directory``, ``--provider``, ``--timeout``
        and ``--mode`` options
    """
    parser = argparse.ArgumentParser(
        description="""Download models for running the test suite """,
        fromfile_prefix_chars="@",
    )
    parser.convert_arg_line_to_args = pyTMD.utilities.convert_arg_line_to_args
    # command line parameters
    # working data directory for location of tide models
    parser.add_argument(
        "--directory",
        "-D",
        type=pathlib.Path,
        default=_default_directory,
        help="Working data directory",
    )
    # download provider(s), tried in the order given
    parser.add_argument(
        "--provider",
        "-P",
        metavar="PROVIDER",
        type=str,
        nargs="+",
        default=("zenodo", "figshare"),
        choices=("figshare", "zenodo"),
        help="Data provider",
    )
    # connection timeout
    parser.add_argument(
        "--timeout",
        "-t",
        type=int,
        default=3600,
        help="Timeout in seconds for blocking operations",
    )
    # permissions mode of the local directories and files (number in octal)
    parser.add_argument(
        "--mode",
        "-M",
        type=lambda x: int(x, base=8),
        default=0o775,
        help="Permissions mode of the files downloaded",
    )
    # return the parser
    return parser


# This is the main part of the program that calls the individual functions
def main():
    """
    Fetch the test data using the command line arguments

    Providers are tried in the order given, stopping at the first one
    that completes without raising an exception. A failure is logged
    as an error before moving on to the next provider
    """
    # Read the system arguments listed after the program
    parser = arguments()
    args, _ = parser.parse_known_args()
    # fetch test data
    for provider in args.provider:
        # try to fetch data from provider
        try:
            fetch_test_data(
                directory=args.directory,
                provider=provider,
                timeout=args.timeout,
                mode=args.mode,
            )
        except Exception as exc:
            # output error message and continue to next provider
            logging.error(f"Error fetching data from {provider}: {exc}")
        else:
            # break loop if successful
            break


# run main program
if __name__ == "__main__":
    main()