Source code for pyTMD.io.GOT

#!/usr/bin/env python
"""
GOT.py
Written by Tyler Sutterley (04/2026)

Reads ascii and netCDF4 files from Richard Ray's Goddard Ocean Tide (GOT) model
    https://earth.gsfc.nasa.gov/geo/data/ocean-tide-models

PYTHON DEPENDENCIES:
    numpy: Scientific Computing Tools For Python
        https://numpy.org
        https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
    xarray: N-D labeled arrays and datasets in Python
        https://docs.xarray.dev/en/stable/

UPDATE HISTORY:
    Updated 04/2026: added lineage attributes to save model filename(s)
    Updated 03/2026: use numpy functions to convert from degrees to radians
    Updated 02/2026: make dataset accessor for GOT be a subaccessor from dataset
        some models have units in the second line of the header text
    Updated 12/2025: no longer subclassing pathlib.Path for working directories
        added function to write to output GOT-formatted ascii files
        fixed writing of output constituents to match GOT attribute format
    Updated 11/2025: near-complete rewrite of program to use xarray
    Updated 10/2025: simplify ascii read function to use masked_equal
    Updated 08/2025: use numpy degree to radian conversions
        added option to gap fill when reading constituent grids
    Updated 11/2024: expose buffer distance for cropping tide model data
    Updated 10/2024: fix error when using default bounds in extract_constants
    Updated 07/2024: added crop and bounds keywords for trimming model data
        use parse function from constituents class to extract names
    Updated 04/2023: fix repeated longitudinal convention adjustment
        using pathlib to define and expand tide model paths
    Updated 03/2023: add basic variable typing to function inputs
    Updated 12/2022: refactor tide read programs under io
        new functions to read and interpolate from constituents class
        new functions to read and write GOT netCDF4 files
        refactored interpolation routines into new module
    Updated 11/2022: use f-strings for formatting verbose or ascii output
    Updated 05/2022: reformat arguments to extract_GOT_constants definition
        changed keyword arguments to camel case
    Updated 04/2022: updated docstrings to numpy documentation format
        include utf-8 encoding in reads to be windows compliant
    Updated 12/2021: adjust longitude convention based on model longitude
    Updated 07/2021: added check that tide model files are accessible
    Updated 06/2021: add warning for tide models being entered as string
    Updated 05/2021: added option for extrapolation cutoff in kilometers
    Updated 03/2021: add extrapolation check where there are no invalid points
        prevent ComplexWarning for fill values when calculating amplitudes
        simplified inputs to be similar to binary OTIS read program
        replaced numpy bool/int to prevent deprecation warnings
    Updated 02/2021: set invalid values to nan in extrapolation
        replaced numpy bool to prevent deprecation warning
    Updated 12/2020: added valid data extrapolation with nearest_extrap
    Updated 09/2020: set bounds error to false for regular grid interpolations
        adjust dimensions of input coordinates to be iterable
    Updated 08/2020: replaced griddata with scipy regular grid interpolators
    Updated 07/2020: added function docstrings. separate bilinear interpolation
        update griddata interpolation. add option for compression
    Updated 06/2020: use argmin and argmax in bilinear interpolation
    Updated 11/2019: find invalid mask points for each constituent
    Updated 09/2019: output as numpy masked arrays instead of nan-filled arrays
    Updated 07/2019: interpolate fill value mask with bivariate splines
    Updated 12/2018: python3 compatibility updates for division and zip
    Updated 10/2018: added scale as load tides are in mm and ocean are in cm
    Updated 08/2018: added multivariate spline interpolation option
    Written 07/2018
"""

from __future__ import division, annotations

import re
import gzip
import pathlib
import datetime
import numpy as np
import xarray as xr
import pyTMD.version
import pyTMD.constituents
import pyTMD.utilities
from .dataset import combine_attrs, register_dataset_subaccessor

# attempt imports
dask = pyTMD.utilities.import_dependency("dask")
dask_available = pyTMD.utilities.dependency_available("dask")

__all__ = [
    "open_mfdataset",
    "open_got_dataset",
    "open_got_ascii",
    "open_got_netcdf",
    "GOTDataset",
]


# PURPOSE: read a list of GOT ASCII or netCDF4 files

[docs]
def open_mfdataset(
    model_files: list[str] | list[pathlib.Path],
    parallel: bool = False,
    **kwargs,
):
    """
    Open multiple GOT model files

    Parameters
    ----------
    model_files: list of str or pathlib.Path
        List of OTIS model files
    parallel: bool, default False
        Open files in parallel using ``dask.delayed``
    kwargs: dict
        Additional keyword arguments for opening GOT files

    Returns
    -------
    ds: xarray.Dataset
        GOT tide model data
    """
    # merge multiple granules
    if parallel and dask_available:
        opener = dask.delayed(open_got_dataset)
    else:
        opener = open_got_dataset
    # read each file as xarray dataset and append to list
    d = [opener(f, **kwargs) for f in model_files]
    # read datasets as dask arrays
    if parallel and dask_available:
        (d,) = dask.compute(d)
    # merge datasets
    ds = xr.merge(d, combine_attrs=combine_attrs, compat="override")
    # return xarray dataset
    return ds



# PURPOSE: reads a GOT ASCII or netCDF4 file

[docs]
def open_got_dataset(
    input_file: str | pathlib.Path,
    **kwargs,
):
    """
    Open GOT-formatted model files

    Parameters
    ----------
    input_file: str or pathlib.Path
        Input transport file
    format: str, default 'netcdf'
        Model format

            - ``'ascii'``: traditional GOT ASCII format
            - ``'netcdf'``: GOT netCDF4 format
    kwargs: dict
        Additional keyword arguments for opening GOT files

    Returns
    -------
    ds: xarray.Dataset
        GOT tide model data
    """
    # detect file format if not provided
    if kwargs.get("format", None) is None:
        kwargs["format"] = pyTMD.utilities.detect_format(input_file)
    # detect if file is compressed if not provided
    if kwargs.get("compressed", None) is None:
        kwargs["compressed"] = pyTMD.utilities.detect_compression(input_file)
    # read constituent from file
    if kwargs["format"] == "ascii":
        # GOT ascii constituent files
        ds = open_got_ascii(input_file, **kwargs)
    elif kwargs["format"] == "netcdf":
        # GOT netCDF4 constituent files
        ds = open_got_netcdf(input_file, **kwargs)
    else:
        raise ValueError(f"Unrecognized file format: {kwargs['format']}")
    # return dataset
    return ds



# PURPOSE: read GOT ASCII files

[docs]
def open_got_ascii(
    input_file: str | pathlib.Path,
    chunks: int | dict | str | None = None,
    **kwargs,
):
    """
    Open GOT-formatted ASCII files

    Parameters
    ----------
    input_file: str or pathlib.Path
        Model file
    chunks: int, dict, str, or None, default None
        Coerce output to specified chunks
    compressed: bool, default False
        Input file is ``gzip`` compressed

    Returns
    -------
    ds: xarray.Dataset
        GOT tide model data
    """
    # set default keyword arguments
    kwargs.setdefault("compressed", False)
    # tilde-expand input file
    input_file = pyTMD.utilities.Path(input_file).resolve()
    if isinstance(input_file, pathlib.Path) and not input_file.exists():
        raise FileNotFoundError(f"File not found: {input_file}")
    # read the ASCII-format file
    if kwargs["compressed"]:
        # read gzipped ascii file
        with gzip.open(input_file, "rb") as f:
            file_contents = f.read().decode("utf8").splitlines()
    else:
        with open(input_file, mode="r", encoding="utf8") as f:
            file_contents = f.read().splitlines()
    # parse header text
    # constituent identifier
    cons = pyTMD.constituents._parse_name(file_contents[0])
    # get units from header if available
    rx = re.compile(r"\((\w+m)\)", re.IGNORECASE)
    # GOT headers from Richard Ray have units on the first line
    # some other models have units on the second line
    if rx.search(file_contents[0]):
        units = rx.findall(file_contents[0], re.IGNORECASE)
    elif rx.search(file_contents[1]):
        units = rx.findall(file_contents[1], re.IGNORECASE)
    else:
        units = None
    # grid dimensions
    nlat, nlon = np.array(file_contents[2].split(), dtype=int)
    # longitude range
    ilat = np.array(file_contents[3].split(), dtype=np.float64)
    # latitude range
    ilon = np.array(file_contents[4].split(), dtype=np.float64)
    # mask fill value
    masked_values = file_contents[5].split()
    fill_value = np.array(masked_values[0], dtype=np.float32)
    # number of columns in ascii file
    ncol = 11
    # create latitude and longitude arrays
    lat = np.linspace(ilat[0], ilat[1], nlat)
    lon = np.linspace(ilon[0], ilon[1], nlon)
    # data dictionary
    var = dict(dims=("y", "x"), coords={}, data_vars={})
    var["coords"]["y"] = dict(data=lat.copy(), dims="y")
    var["coords"]["x"] = dict(data=lon.copy(), dims="x")
    # input amplitude and phase
    amp = np.zeros((nlat, nlon), dtype=np.float32)
    ph = np.zeros((nlat, nlon), dtype=np.float32)
    # starting lines to fill amplitude and phase variables
    l1 = 7
    l2 = 14 + int(nlon // ncol) * nlat + nlat
    # for each latitude
    for i in range(nlat):
        for j in range(nlon // ncol):
            j1 = j * ncol
            # amplitude and phase are on two separate rows
            amp[i, j1 : j1 + ncol] = np.array(file_contents[l1].split())
            ph[i, j1 : j1 + ncol] = np.array(file_contents[l2].split())
            l1 += 1
            l2 += 1
        # add last row of tidal variables
        j1 = (j + 1) * ncol
        j2 = nlon % ncol
        # amplitude and phase are on two separate rows
        amp[i, j1 : j1 + j2] = np.array(file_contents[l1].split())
        ph[i, j1 : j1 + j2] = np.array(file_contents[l2].split())
        l1 += 1
        l2 += 1
    # convert to masked arrays
    amp = np.ma.masked_equal(amp, fill_value)
    ph = np.ma.masked_equal(ph, fill_value)
    # store the data variables
    var["data_vars"][cons] = {}
    var["data_vars"][cons]["dims"] = ("y", "x")
    var["data_vars"][cons]["data"] = amp * np.exp(-1j * np.radians(ph))
    # convert to xarray Dataset from the data dictionary
    ds = xr.Dataset.from_dict(var)
    # coerce to specified chunks
    if chunks is not None:
        ds = ds.chunk(chunks)
    # add attributes
    ds.attrs["group"] = "z"
    ds.attrs["lineage"] = pathlib.Path(input_file).name
    if units:
        ds[cons].attrs["units"] = units[0].lower()
    # return xarray dataset
    return ds



# PURPOSE: read GOT netCDF4 files

[docs]
def open_got_netcdf(
    input_file: str | pathlib.Path,
    chunks: int | dict | str | None = None,
    **kwargs,
):
    """
    Open GOT-formatted netCDF4 files

    Parameters
    ----------
    input_file: str or pathlib.Path
        Model file
    chunks: int, dict, str, or None, default None
        Variable chunk sizes for dask (see ``xarray.open_dataset``)
    compressed: bool, default False
        Input file is ``gzip`` compressed

    Returns
    -------
    ds: xarray.Dataset
        GOT tide model data
    """
    # set default keyword arguments
    kwargs.setdefault("compressed", False)
    # tilde-expand input file
    input_file = pyTMD.utilities.Path(input_file).resolve()
    if isinstance(input_file, pathlib.Path) and not input_file.exists():
        raise FileNotFoundError(f"File not found: {input_file}")
    # read the netCDF4-format file
    if kwargs["compressed"]:
        # read gzipped netCDF4 file
        f = gzip.open(input_file, "rb")
        tmp = xr.open_dataset(
            f, mask_and_scale=True, decode_coords=False, chunks=chunks
        )
    else:
        tmp = xr.open_dataset(
            input_file, mask_and_scale=True, decode_coords=False, chunks=chunks
        )
    # extract constituent from attribute
    cons = pyTMD.constituents._parse_name(tmp.attrs["Constituent"])
    # create output xarray dataset for file
    ds = xr.Dataset()
    # assign coordinates
    ds.coords["x"] = tmp.longitude
    ds.coords["y"] = tmp.latitude
    # calculate complex form of constituent oscillation
    ds[cons] = tmp.amplitude * np.exp(-1j * np.radians(tmp.phase))
    # rename dimensions
    mapping_coords = dict(lon="x", lat="y")
    ds = ds.rename(mapping_coords)
    # add attributes
    ds.attrs["group"] = "z"
    ds.attrs["lineage"] = pathlib.Path(input_file).name
    ds[cons].attrs["units"] = tmp["amplitude"].attrs.get("units")
    # return xarray dataset
    return ds



# PURPOSE: GOT utilities for xarray Datasets

[docs]
@register_dataset_subaccessor("got")
class GOTDataset:
    """``xarray.Dataset`` utilities for GOT tidal models"""

    def __init__(self, ds):
        self._ds = ds


[docs]
    def to_ascii(
        self,
        path: str | pathlib.Path,
        fill_value: float = 999.0,
        mode: str = "w",
        **kwargs,
    ):
        """
        Writes tidal constituents to ASCII files in GOT format

        Parameters
        ----------
        path: str | pathlib.Path
            Output directory for ASCII files
        fill_value: float, default -999.0
            Fill value for missing data
        mode: str, default 'w'
            File mode
        kwargs: dict
            Additional keyword arguments for ASCII writer
        """
        # tilde-expand output path
        path = pyTMD.utilities.Path(path).resolve()
        # for each variable
        for v in self._ds.data_vars.keys():
            # output dataset
            ds = xr.Dataset()
            # calculate amplitude and phase with fill values
            ds["amplitude"] = self._ds[v].tmd.amplitude.fillna(fill_value)
            ds["phase"] = self._ds[v].tmd.phase.fillna(fill_value)
            # get min and max of coordinates
            nlat, nlon = self._ds[v].shape
            ymin, ymax = ds["y"].values.min(), ds["y"].values.max()
            xmin, xmax = ds["x"].values.min(), ds["x"].values.max()
            # number of columns in ascii file
            ncol = 11
            # write GOT ASCII file
            FILE = path.joinpath(f"{v}.d")
            with open(FILE, mode=mode, encoding="utf8") as f:
                # write header information for amplitude
                units = ds["amplitude"].attrs.get("units", "")
                f.write(f"{v.upper()} tide amplitude ({units})\n")
                f.write(f"Generated by pyTMD {pyTMD.version.full_version}\n")
                f.write(f"{nlat:20d} {nlon:20d}\n")
                f.write(f"{ymin:20.4f} {ymax:20.4f}\n")
                f.write(f"{xmin:20.4f} {xmax:20.4f}\n")
                f.write(f"{fill_value:20.2f} {fill_value:20.2f}\n")
                f.write("\n")
                # write amplitude  data
                for i in range(nlat):
                    amp = ds["amplitude"].isel(y=i).values
                    fmt = [f"{v:7.2f}" for v in amp]
                    for j in range(nlon // ncol + 1):
                        j1 = j * ncol
                        f.write("".join(fmt[j1 : j1 + ncol]) + "\n")
                # write header information for phase
                units = ds["phase"].attrs.get("units", "")
                f.write(f"{v.upper()} tide phase lags ({units})\n")
                f.write(f"Generated by pyTMD {pyTMD.version.full_version}\n")
                f.write(f"{nlat:20d} {nlon:20d}\n")
                f.write(f"{ymin:20.4f} {ymax:20.4f}\n")
                f.write(f"{xmin:20.4f} {xmax:20.4f}\n")
                f.write(f"{fill_value:20.2f} {fill_value:20.2f}\n")
                f.write("\n")
                # write phase data
                for i in range(nlat):
                    ph = ds["phase"].isel(y=i).values
                    fmt = [f"{v:7.2f}" for v in ph]
                    for j in range(nlon // ncol + 1):
                        j1 = j * ncol
                        f.write("".join(fmt[j1 : j1 + ncol]) + "\n")


    # PURPOSE: output tidal constituent file in GOT netCDF format

[docs]
    def to_netcdf(
        self,
        path: str | pathlib.Path,
        mode: str = "w",
        encoding: dict = {"zlib": True, "complevel": 9},
        **kwargs,
    ):
        """
        Writes tidal constituents to netCDF4 files in GOT format

        Parameters
        ----------
        path: str | pathlib.Path
            Output directory for netCDF4 files
        mode: str, default 'w'
            netCDF4 file mode
        encoding: dict, default {"zlib": True, "complevel": 9}
            netCDF4 variable compression settings
        kwargs: dict
            Additional keyword arguments for ``xarray`` netCDF4 writer
        """
        # tilde-expand output path
        path = pyTMD.utilities.Path(path).resolve()
        # set default encoding
        kwargs.setdefault("encoding", dict(amplitude=encoding, phase=encoding))
        # coordinate remapping
        mapping_coords = dict(x="longitude", y="latitude")
        attrs = dict(longitude={}, latitude={})
        attrs["longitude"] = dict(units="degrees_east", long_name="longitude")
        attrs["latitude"] = dict(units="degrees_north", long_name="latitude")
        # get longitude and latitude arrays
        # for each variable
        for v in self._ds.data_vars.keys():
            ds = xr.Dataset()
            # calculate amplitude and phase
            ds["amplitude"] = self._ds[v].tmd.amplitude
            ds["phase"] = self._ds[v].tmd.phase
            ds["amplitude"].attrs["units"] = self._ds[v].attrs.get("units", "")
            ds["phase"].attrs["units"] = "degrees"
            ds["amplitude"].attrs["long_name"] = "Tide amplitude"
            ds["phase"].attrs["long_name"] = "Greenwich tide phase lag"
            # rename dimensions
            ds = ds.swap_dims(dict(x="lon", y="lat"))
            # remap coordinates to GOT convention
            ds = ds.rename(mapping_coords)
            # update variable attributes
            for att_name, att_val in attrs.items():
                ds[att_name].attrs.update(att_val)
            # add global attributes
            ds.attrs["title"] = "GOT tidal constituent data"
            ds.attrs["authors"] = "Richard Ray"
            ds.attrs["institution"] = "NASA Goddard Space Flight Center"
            # define and fill constituent ID
            ds.attrs["Constituent"] = v.upper().encode("utf8")
            ds.attrs["date_created"] = datetime.datetime.now().isoformat()
            ds.attrs["software_reference"] = pyTMD.version.project_name
            ds.attrs["software_version"] = pyTMD.version.full_version
            # write GOT netCDF4 file
            FILE = path.joinpath(f"{v}.nc")
            ds.to_netcdf(FILE, mode=mode, **kwargs)