# Source code for ninolearn.IO.read_post

from os.path import join
import pandas as pd
import xarray as xr
import gc

from ninolearn.pathes import postdir
from ninolearn.utils import generateFileName

# TODO: Write a routine that generates this list
# Variables available as postprocessed .csv files in ``postdir``
# (read via ``data_reader.read_csv``).
csv_vars = ['nino3.4M','nino3.4S', 'wwv']


class data_reader(object):
    """
    Data reader for different kinds of El Nino related data.

    The reader keeps a time window (``startdate``–``enddate``) and a
    lat/lon box and returns every data set sliced to that window/box.

    :param startdate: year and month from which on data should be loaded
    :param enddate: year and month to which data should be loaded
    :param lon_min: eastern boundary of data set in degrees east
    :param lon_max: western boundary of data set in degrees east
    :param lat_min: southern boundary of data set in degrees north
    :param lat_max: northern boundary of data set in degrees north
    """

    def __init__(self, startdate='1980-01', enddate='2018-12',
                 lon_min=120, lon_max=280, lat_min=-30, lat_max=30):
        self.startdate = pd.to_datetime(startdate)
        # Snap the end date to the last day of its month so a '2018-12'
        # argument includes the whole of December.
        self.enddate = pd.to_datetime(enddate) + pd.tseries.offsets.MonthEnd(0)

        self.lon_min = lon_min
        self.lon_max = lon_max
        self.lat_min = lat_min
        self.lat_max = lat_max

    def __del__(self):
        # Encourage prompt release of any large arrays still referenced.
        gc.collect()

    def shift_window(self, month=1):
        """
        Shift the reading window by the given number of months.

        :param month: number of months to shift (may be negative)
        """
        self.startdate = self.startdate + pd.DateOffset(months=month)
        # Re-snap to the month end after shifting.
        self.enddate = (self.enddate + pd.DateOffset(months=month)
                        + pd.tseries.offsets.MonthEnd(0))

    def read_csv(self, variable, processed='anom'):
        """
        Get data from a processed csv file in ``postdir``.

        :param variable: name of the csv file (without suffix)
        :param processed: column of the csv file to return
        :return: pandas Series sliced to the reading window
        """
        data = pd.read_csv(join(postdir, f"{variable}.csv"),
                           index_col=0, parse_dates=True)
        self._check_dates(data, variable)
        return data[processed].loc[self.startdate:self.enddate]

    def read_netcdf(self, variable, dataset='', processed='', chunks=None):
        """
        Wrapper for xarray.open_dataarray.

        :param variable: the name of the variable
        :param dataset: the name of the dataset
        :param processed: the postprocessing that was applied
        :param chunks: same as for xarray.open_dataarray
        :return: xarray DataArray sliced to the window and lat/lon box
        """
        filename = generateFileName(variable, dataset,
                                    processed=processed, suffix="nc")
        data = xr.open_dataarray(join(postdir, filename), chunks=chunks)

        # Climatologies are returned unsliced.
        if processed == 'meanclim':
            return data

        self._check_dates(data, filename[:-3])

        # Data sets that come on (or were regridded to) an ascending
        # latitude axis.
        regrided = ['GODAS', 'ERSSTv5', 'ORAS4', 'NODC', 'NCAR']

        if dataset == 'ORAP5':
            # ORAP5 uses a curvilinear grid: mask with the 2-d nav_lat /
            # nav_lon coordinates instead of label-based slicing.
            return data.loc[self.startdate:self.enddate, :, :].where(
                (data.nav_lat > self.lat_min) &
                (data.nav_lat < self.lat_max) &
                (data.nav_lon > self.lon_min) &
                (data.nav_lon < self.lon_max), drop=True)
        elif dataset in regrided or dataset == 'GFDL-CM3':
            # Ascending latitude axis: slice lat_min -> lat_max.
            return data.loc[self.startdate:self.enddate,
                            self.lat_min:self.lat_max,
                            self.lon_min:self.lon_max]
        else:
            # Remaining products use a descending latitude axis, hence
            # the reversed lat slice.
            return data.loc[self.startdate:self.enddate,
                            self.lat_max:self.lat_min,
                            self.lon_min:self.lon_max]

    def read_statistic(self, statistic, variable, dataset='', processed=''):
        """
        Read a statistic time series from a '<statistic>-<filename>.csv' file.

        :param statistic: name of the statistic (file name prefix)
        :param variable: the name of the variable
        :param dataset: the name of the dataset
        :param processed: the postprocessing that was applied
        :return: pandas DataFrame sliced to the reading window
        """
        filename = generateFileName(variable, dataset,
                                    processed=processed, suffix="csv")
        filename = '-'.join([statistic, filename])

        data = pd.read_csv(join(postdir, filename),
                           index_col=0, parse_dates=True)
        self._check_dates(data, f"{variable} - {statistic}")
        return data.loc[self.startdate:self.enddate]

    def read_other_forecasts(self, model, lead):
        """
        Read forecasts from other models.

        :type model: str
        :param model: Model name.
        :param lead: lead time to select from the forecast data set
        :return: xarray DataArray for the model/lead, sliced to the window
        """
        ds = xr.open_dataset(join(postdir, 'other_forecasts.nc'))
        return ds[model].loc[self.startdate:self.enddate, lead]

    def _check_dates(self, data, name):
        """
        Check that the reading window lies within the data's time axis.

        :param data: xarray DataArray (with a ``time`` axis) or a
            pandas DataFrame (with a datetime index)
        :param name: label used in the error message
        :raises IndexError: when startdate/enddate is out of bounds
        """
        month_end = pd.tseries.offsets.MonthEnd(0)

        if isinstance(data, xr.DataArray):
            if self.startdate < data.time.values.min():
                raise IndexError(
                    f"The startdate is out of bounds for {name} data!")
            # Compare against the month end of the last sample so a
            # month-end enddate matches monthly data.
            if self.enddate > pd.to_datetime(data.time.values.max()) + month_end:
                raise IndexError(
                    f"The enddate is out of bounds for {name} data!")

        if isinstance(data, pd.DataFrame):
            if self.startdate < data.index.values.min():
                raise IndexError(
                    f"The startdate is out of bounds for {name} data!")
            if self.enddate > pd.to_datetime(data.index.values.max()) + month_end:
                raise IndexError(
                    f"The enddate is out of bounds for {name} data!")
if __name__ == "__main__":
    # Quick manual smoke test: read anomaly fields from two data sets.
    reader = data_reader(startdate="1981-01", enddate='2018-12',
                         lon_min=120, lon_max=380,
                         lat_min=-30, lat_max=30)
    data = reader.read_netcdf('sshg', dataset='GODAS', processed='anom')
    data2 = reader.read_netcdf('zos', dataset='GFDL-CM3', processed='anom')