# Source code for pyTMD.datasets.fetch_test_data

#!/usr/bin/env python
"""
fetch_test_data.py
Written by Tyler Sutterley (04/2026)
Download files necessary to run the test suite

CALLING SEQUENCE:
    python fetch_test_data.py

COMMAND LINE OPTIONS:
    --help: list the command line options
    -D X, --directory X: working data directory
    -P X, --provider X: data provider(s) to try in order ('figshare' or 'zenodo')
    -t X, --timeout X: timeout in seconds for blocking operations
    -M X, --mode X: Local permissions mode of the files downloaded

PYTHON DEPENDENCIES:
    future: Compatibility layer between Python 2 and Python 3
        https://python-future.org/

PROGRAM DEPENDENCIES:
    utilities.py: download and management utilities for syncing files

UPDATE HISTORY:
    Updated 04/2026: check if needing to include algorithm in the hash
    Updated 03/2026: try multiple providers for fetching data
    Updated 12/2025: use URL class to build and operate on URLs
        add function to download from a zenodo article
        change default provider for test data to zenodo
    Updated 10/2025: change default directory for tide models to cache
    Written 10/2025
"""

import re
import ssl
import shutil
import logging
import pathlib
import zipfile
import argparse
import pyTMD.utilities

# default working data directory for tide models: the path returned by
# pyTMD.utilities.get_cache_path(), used as the default ``directory`` of
# the download functions and of the command line parser
_default_directory = pyTMD.utilities.get_cache_path()
# default ssl context: reuses the context defined in pyTMD.utilities as
# the default ``context`` for requests to the data providers
_default_ssl_context = pyTMD.utilities._default_ssl_context
# repository API urls: base urls onto which the provider functions join
# their endpoint paths ("articles", "records", "deposit", ...)
_figshare_api_url = "https://api.figshare.com/v2"
_zenodo_api_url = "https://zenodo.org/api"



# [docs]
def fetch_test_data(
    directory: str | pathlib.Path = _default_directory,
    provider: str = "zenodo",
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    provider: str, default 'zenodo'
        Data provider

        - ``'figshare'``
        - ``'zenodo'``
    mode: int, default 0o775
        Permissions mode of the download directory and output local files
    kwargs: dict
        Additional keyword arguments for data provider functions

    Raises
    ------
    ValueError
        If ``provider`` is not one of the known data providers
    """
    # create download directory if it doesn't exist
    directory = pyTMD.utilities.Path(directory).resolve()
    directory.mkdir(parents=True, exist_ok=True, mode=mode)
    # create logger for verbosity level
    logger = pyTMD.utilities.build_logger(__name__, level=logging.INFO)
    # NOTE: ``mode`` is a named parameter of this function and so is never
    # part of ``kwargs``: it has to be forwarded explicitly, otherwise the
    # provider functions silently fall back to their own default mode
    if provider == "figshare":
        _figshare(directory=directory, logger=logger, mode=mode, **kwargs)
    elif provider == "zenodo":
        _zenodo(directory=directory, logger=logger, mode=mode, **kwargs)
    else:
        raise ValueError(f"Unknown data provider: {provider}")



# PURPOSE: download data files from figshare
def _figshare(
    directory: str | pathlib.Path = _default_directory,
    article: str = "30260326",
    timeout: int | None = None,
    context: ssl.SSLContext = _default_ssl_context,
    chunk: int = 16384,
    logger: logging.Logger | None = None,
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite from figshare

    Each file listed for the article is skipped if the checksum of the
    local copy already matches the checksum supplied by figshare.
    Otherwise the file is fetched, verified against the supplied
    checksum and then either written to ``directory`` or, for ``.zip``
    files, extracted into ``directory``.

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    article: str, default '30260326'
        figshare article number
    timeout: int or NoneType, default None
        Timeout in seconds for blocking operations
    context: obj, default pyTMD.utilities._default_ssl_context
        ``SSL`` context for ``urllib`` opener object
    chunk: int, default 16384
        Chunk size for transfer encoding
    logger: logging.logger object or NoneType, default None
        Logger for outputting file transfer information

        Uses the logger of this module if not provided
    mode: int, default 0o775
        Permissions mode of output local file
    kwargs: dict
        Additional keyword arguments (accepted but not used)

    Raises
    ------
    Exception
        If the checksum of a downloaded file does not match the
        checksum supplied by figshare
    """
    # fall back to the module logger so that logging never fails on None
    if logger is None:
        logger = logging.getLogger(__name__)
    # allow the download directory to be provided as a string
    if isinstance(directory, str):
        directory = pathlib.Path(directory)
    # figshare API host
    HOST = pyTMD.utilities.URL(_figshare_api_url)
    articles_api = HOST.joinpath("articles", article)
    # Create and submit request and load JSON response
    response = articles_api.load(timeout=timeout, context=context)
    # for each file in the JSON response
    for entry in response["files"]:
        # checksum of the remote file as supplied by figshare
        supplied_md5 = entry["supplied_md5"]
        # check if needing to include algorithm in the hash comparison
        # (supplied checksum is prefixed with the algorithm name)
        include_algorithm = supplied_md5.startswith("md5:")
        # check if file already exists by matching MD5 checksums
        local_file = directory.joinpath(entry["name"])
        original_md5 = pyTMD.utilities.get_hash(
            local_file, include_algorithm=include_algorithm
        )
        # skip download if checksums match
        if original_md5 == supplied_md5:
            continue
        # download url for remote file
        download = pyTMD.utilities.URL(entry["download_url"])
        # output file information
        logger.info(download.urlname)
        # get remote file as a byte-stream
        remote_buffer = download.get(timeout=timeout, context=context)
        # verify MD5 checksums
        computed_md5 = pyTMD.utilities.get_hash(
            remote_buffer, include_algorithm=include_algorithm
        )
        # raise exception if checksums do not match
        if computed_md5 != supplied_md5:
            raise Exception(f"Checksum mismatch: {download.urlname}")
        # download file or extract files from zip
        if pathlib.Path(entry["name"]).suffix == ".zip":
            # extract the zip file into the local directory
            with zipfile.ZipFile(remote_buffer) as z:
                # extract each file and set permissions
                for member in z.filelist:
                    # use the path returned by extract: zipfile sanitizes
                    # member names, so it can differ from member.filename
                    extracted = z.extract(member, path=directory)
                    pathlib.Path(extracted).chmod(mode=mode)
        else:
            # write the file to the local directory
            with local_file.open(mode="wb") as fid:
                shutil.copyfileobj(remote_buffer, fid, chunk)
            # change the permissions mode
            local_file.chmod(mode=mode)


# PURPOSE: download data files from zenodo
def _zenodo(
    directory: str | pathlib.Path = _default_directory,
    record: str = "18091740",
    timeout: int | None = None,
    context: ssl.SSLContext = _default_ssl_context,
    chunk: int = 16384,
    logger: logging.Logger | None = None,
    mode: int = 0o775,
    **kwargs,
):
    """
    Download files necessary to run the test suite from zenodo

    The record is queried to get the ``id`` reported in the response,
    and the file listing for that ``id`` is then requested. Each file
    is skipped if the checksum of the local copy already matches the
    checksum supplied by zenodo. Otherwise the file is fetched,
    verified against the supplied checksum and then either written to
    ``directory`` or, for ``.zip`` files, extracted into ``directory``.

    Parameters
    ----------
    directory: str or pathlib.Path
        Download directory
    record: str, default '18091740'
        Zenodo record number
    timeout: int or NoneType, default None
        Timeout in seconds for blocking operations
    context: obj, default pyTMD.utilities._default_ssl_context
        ``SSL`` context for ``urllib`` opener object
    chunk: int, default 16384
        Chunk size for transfer encoding
    logger: logging.logger object or NoneType, default None
        Logger for outputting file transfer information

        Uses the logger of this module if not provided
    mode: int, default 0o775
        Permissions mode of output local file
    kwargs: dict
        Additional keyword arguments (accepted but not used)

    Raises
    ------
    Exception
        If the checksum of a downloaded file does not match the
        checksum supplied by zenodo
    """
    # fall back to the module logger so that logging never fails on None
    if logger is None:
        logger = logging.getLogger(__name__)
    # allow the download directory to be provided as a string
    if isinstance(directory, str):
        directory = pathlib.Path(directory)
    # zenodo API host
    HOST = pyTMD.utilities.URL(_zenodo_api_url)
    records_api = HOST.joinpath("records", record)
    # Create and submit request and load JSON response
    records_response = records_api.load(timeout=timeout, context=context)
    # get files from latest version of record
    version = str(records_response["id"])
    deposit_api = HOST.joinpath("deposit", "depositions", version, "files")
    # Create and submit request and load JSON response
    deposit_response = deposit_api.load(timeout=timeout, context=context)
    # for each file in the JSON response for deposits
    for entry in deposit_response:
        # checksum of the remote file as supplied by zenodo
        checksum = entry["checksum"]
        # check if needing to include algorithm in the hash comparison
        # (supplied checksum is prefixed with the algorithm name)
        include_algorithm = checksum.startswith("md5:")
        # check if file already exists by matching MD5 checksums
        local_file = directory.joinpath(entry["filename"])
        original_md5 = pyTMD.utilities.get_hash(
            local_file, include_algorithm=include_algorithm
        )
        # skip download if checksums match
        if original_md5 == checksum:
            continue
        # download url for remote file
        download = pyTMD.utilities.URL(entry["links"]["download"])
        # output file information
        logger.info(download.urlname)
        # get remote file as a byte-stream
        remote_buffer = download.get(timeout=timeout, context=context)
        # verify MD5 checksums
        computed_md5 = pyTMD.utilities.get_hash(
            remote_buffer, include_algorithm=include_algorithm
        )
        # raise exception if checksums do not match
        if computed_md5 != checksum:
            raise Exception(f"Checksum mismatch: {download.urlname}")
        # download file or extract files from zip
        if pathlib.Path(entry["filename"]).suffix == ".zip":
            # extract the zip file into the local directory
            with zipfile.ZipFile(remote_buffer) as z:
                # extract each file and set permissions
                for member in z.filelist:
                    # use the path returned by extract: zipfile sanitizes
                    # member names, so it can differ from member.filename
                    extracted = z.extract(member, path=directory)
                    pathlib.Path(extracted).chmod(mode=mode)
        else:
            # write the file to the local directory
            with local_file.open(mode="wb") as fid:
                shutil.copyfileobj(remote_buffer, fid, chunk)
            # change the permissions mode
            local_file.chmod(mode=mode)


# PURPOSE: create argument parser
def arguments():
    """
    Build the command line argument parser for the program

    Returns
    -------
    parser: argparse.ArgumentParser
        Parser with the ``--directory``, ``--provider``, ``--timeout``
        and ``--mode`` options, which also reads arguments from files
        prefixed with ``@``
    """
    # program description (keeps the trailing newline and indentation
    # of the original multi-line text)
    summary = "Download models for running the test suite\n" + " " * 12
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars="@",
        description=summary,
    )
    # split lines of argument files using the pyTMD utility function
    parser.convert_arg_line_to_args = pyTMD.utilities.convert_arg_line_to_args
    # command line parameters as pairs of (option strings, settings)
    options = (
        # working data directory for location of tide models
        (
            ("--directory", "-D"),
            dict(
                help="Working data directory",
                default=_default_directory,
                type=pathlib.Path,
            ),
        ),
        # download provider
        (
            ("--provider", "-P"),
            dict(
                help="Data provider",
                choices=("figshare", "zenodo"),
                default=("zenodo", "figshare"),
                nargs="+",
                type=str,
                metavar="PROVIDER",
            ),
        ),
        # connection timeout
        (
            ("--timeout", "-t"),
            dict(
                help="Timeout in seconds for blocking operations",
                default=3600,
                type=int,
            ),
        ),
        # permissions mode of the local directories and files
        # (number in octal)
        (
            ("--mode", "-M"),
            dict(
                help="Permissions mode of the files downloaded",
                default=0o775,
                type=lambda text: int(text, base=8),
            ),
        ),
    )
    # register each option in the order listed above
    for flags, settings in options:
        parser.add_argument(*flags, **settings)
    # return the parser
    return parser


# This is the main part of the program that calls the individual functions
def main():
    """
    Command line entry point for fetching the test data

    Tries each requested data provider in order and stops at the first
    one that succeeds. An error is logged for each provider that fails.

    Raises
    ------
    SystemExit
        If the test data could not be fetched from any of the providers
    """
    # Read the system arguments listed after the program
    parser = arguments()
    args, _ = parser.parse_known_args()

    # fetch test data
    for provider in args.provider:
        # try to fetch data from provider
        try:
            fetch_test_data(
                directory=args.directory,
                provider=provider,
                timeout=args.timeout,
                mode=args.mode,
            )
        except Exception as exc:
            # output error message and continue to next provider
            logging.error(f"Error fetching data from {provider}: {exc}")
            continue
        else:
            # break loop if successful
            break
    else:
        # loop finished without a successful provider: exit with a
        # non-zero status instead of silently reporting success
        providers = ", ".join(args.provider)
        raise SystemExit(f"Unable to fetch test data from: {providers}")


# run main program only when this file is executed as a script
# (and not when it is imported as a module)
if __name__ == "__main__":
    main()