Source code for pinnicle.modeldata.netcdf_data

from . import DataBase
from ..parameter import SingleDataParameter
from ..physics import Constants
from ..utils import down_sample
from netCDF4 import Dataset
import numpy as np



[docs]
class NetCDFData(DataBase, Constants):
    """ data loaded from a `.nc` file

    Combines the `DataBase` loader interface with the physical `Constants`.
    """
    _DATA_TYPE = "nc"

    def __init__(self, parameters=None):
        """ initialize the constants and the data base with `parameters`

        Args:
            parameters: settings of this data set. If omitted, a new
                `SingleDataParameter()` is created for this instance.
        """
        # A default built in the signature is evaluated only once, at
        # definition time, and would then be shared by every instance that is
        # created without an explicit argument. Build it per instance instead.
        if parameters is None:
            parameters = SingleDataParameter()
        Constants.__init__(self)
        super().__init__(parameters)


[docs]
    def get_ice_coordinates(self, mask=None):
        """ stack the coordinates `x` and `y`, assuming all the data in .mat 
            are in the ice covered region. This function is currently only 
            called by plotting to generate ice covered region.
        """
        # get the coordinates
        if mask is None:
            X_mask = np.hstack([self.X_dict[k].flatten()[:,None] for k in self.parameters.X_map if k in self.X_dict])
        else:
            X_mask = np.hstack([self.X_dict[k][mask].flatten()[:,None] for k in self.parameters.X_map if k in self.X_dict])

        return X_mask


    def _coordinate_slice(self, coord, lower, upper, name):
        """Return a contiguous slice for a coordinate vector and requested bounds."""
        coord = np.asarray(coord)
        if coord.ndim != 1:
            inds = np.where((coord >= lower) & (coord <= upper))[0]
            if len(inds) == 0:
                raise ValueError(f"No {name} indices found in range.")
            return slice(inds[0], inds[-1] + 1)

        if coord.size == 0:
            raise ValueError(f"No {name} coordinates found.")

        is_increasing = coord[0] <= coord[-1]
        if is_increasing:
            start = np.searchsorted(coord, lower, side="left")
            end = np.searchsorted(coord, upper, side="right")
        else:
            coord_rev = coord[::-1]
            rev_start = np.searchsorted(coord_rev, lower, side="left")
            rev_end = np.searchsorted(coord_rev, upper, side="right")
            start = coord.size - rev_end
            end = coord.size - rev_start

        if start >= end:
            raise ValueError(f"No {name} indices found in range.")
        return slice(start, end)


[docs]
    def load_data(self, domain=None, physics=None):
        """ load grid data from a `.nc` file, based on the domain, and store the
            required coordinates and variables in `self.X_dict` and `self.data_dict`

        Nothing is returned. Both dicts hold column vectors with one row per
        grid point.

        Args:
            domain: optional geometry, only `domain.bbox()` and
                `domain.inside(points)` are used. Without a domain the full
                extent of the coordinates in the file is loaded.
            physics: optional, the first two entries of `physics.input_var`
                give the names and the order of the two spatial coordinates.
                Without it, the keys of `parameters.X_map` are used.

        Raises:
            KeyError: if a name in `X_map` or `name_map` is not a variable of
                the file, or a coordinate name from `physics` is not in `X_map`.
            ValueError: from `_coordinate_slice`, if no coordinate lies inside
                the requested bounds.
        """
        with Dataset(self.parameters.data_path, "r") as data:
            # pre load x, y, the spatial coordinates, from now on, use X and its keys only, X_map translate the data already
            X = {}
            for k, v in self.parameters.X_map.items():
                if v not in data.variables:
                    raise KeyError(
                        f"{v} is not found in the data from {self.parameters.data_path}, "
                        "please specify the mapping in 'X_map'"
                    )
                X[k] = data.variables[v]

            # use the order in physics.input_var to determine x and y names
            if physics:
                xkeys = physics.input_var[0:2]
            else:
                xkeys = list(X.keys())

            # fail with a readable message instead of a bare KeyError below
            missing = [k for k in xkeys if k not in X]
            if missing:
                raise KeyError(
                    f"{missing} is not found in 'X_map', "
                    "please specify the mapping in 'X_map'"
                )

            # Load coordinate arrays once. They are tiny compared with gridded fields
            # and are reused for slicing, mesh generation, and optional polygon masks.
            x_coord = {k: np.asarray(X[k][:]) for k in xkeys}

            # get the bbox from domain, set the rectangle, works for both static and time dependent domain
            if domain:
                bbox = domain.bbox()
                xmin = [bbox[0][0], bbox[0][1]]
                xmax = [bbox[1][0], bbox[1][1]]
            else:
                xmin = [np.nanmin(x_coord[k]) for k in xkeys]
                xmax = [np.nanmax(x_coord[k]) for k in xkeys]

            x_name, y_name = xkeys[0], xkeys[1]
            coord_slices = {
                x_name: self._coordinate_slice(x_coord[x_name], xmin[0], xmax[0], x_name),
                y_name: self._coordinate_slice(x_coord[y_name], xmin[1], xmax[1], y_name),
            }

            # load and generate the coordinates; tile/repeat enumerate the grid
            # with the first coordinate varying fastest, the same order as the
            # row-major reshape of the [second, first] indexed fields below
            x_slice = x_coord[x_name][coord_slices[x_name]]
            y_slice = x_coord[y_name][coord_slices[y_name]]
            self.X_dict[x_name] = np.tile(x_slice, y_slice.size).reshape(-1, 1)
            self.X_dict[y_name] = np.repeat(y_slice, x_slice.size).reshape(-1, 1)

            # load all variables from parameters.name_map, reading only the
            # window selected by the coordinate slices
            data_slice = (coord_slices[y_name], coord_slices[x_name])
            for k, v in self.parameters.name_map.items():
                if v not in data.variables:
                    raise KeyError(
                        f"{v} is not found in the data from {self.parameters.data_path}, "
                        "please specify the mapping in 'name_map'"
                    )
                scaling = self.parameters.scaling.get(k, 1.0)
                self.data_dict[k] = (np.ma.asarray(data.variables[v][data_slice]).reshape(-1, 1))*scaling

            # Drop the points of the bounding box that are outside the domain.
            # Without a domain there is no geometry to test against and the
            # whole extent was loaded, so every point is kept.
            if self.parameters.sample_only_inside and domain is not None:
                P = np.hstack((self.X_dict[x_name], self.X_dict[y_name]))
                mask = np.asarray(domain.inside(P)).astype(bool).reshape(-1)
                self.X_dict[x_name] = self.X_dict[x_name][mask]
                self.X_dict[y_name] = self.X_dict[y_name][mask]
                for k in self.parameters.name_map:
                    self.data_dict[k] = self.data_dict[k][mask]




[docs]
    def plot(self, data_names=[], vranges={}, axs=None, **kwargs):
        """ TODO: scatter plot of the selected data from data_names
        """
        pass



[docs]
    def prepare_training_data(self, data_size=None):
        """ prepare data for PINNs according to the settings in `data_size`
        """
        if data_size is None:
            data_size = self.parameters.data_size

        # initialize
        self.X = {}
        self.sol = {}

        # go through all keys in data_dict
        for k in self.data_dict:
            # if datasize has the key, then add to X and sol
            if k in data_size:
                if data_size[k] is not None:
                    # apply nan mask
                    _temp = np.ma.masked_invalid(np.ma.asarray(self.data_dict[k]).reshape(-1, 1))
                    mask = ~np.ma.getmaskarray(_temp).reshape(-1)
                    sol_temp = np.asarray(_temp.compressed()).reshape(-1, 1)

                    # prepare x,y coordinates
                    X_temp = self.get_ice_coordinates(mask=mask)

                    # random choose to a downscale sampling of the scatter data
                    idx = down_sample(X_temp, data_size[k])
                    self.X[k] = X_temp[idx, :]
                    self.sol[k] = sol_temp[idx, :]
                else:
                    # if the size is None, then only use boundary conditions
                    raise ValueError(f"{k} can not be set to None in .mat data. \
                                     If {k} is not needed in training, please remove it from `data_size`")