---
title: External data
keywords: fastai
sidebar: home_sidebar
summary: "Helper functions used to download and extract common time series datasets."
description: "Helper functions used to download and extract common time series datasets."
nb_path: "nbs/012_data.external.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

DataFrame.__init__[source]

DataFrame.__init__(data=None, index=None, columns=None, dtype=None, copy=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

decompress_from_url[source]

decompress_from_url(url, target_dir=None, verbose=False)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

download_data[source]

download_data(url, fname=None, c_key='archive', force_download=False, timeout=4, verbose=False)

Download `url` to `fname`.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_UCR_univariate_list[source]

get_UCR_univariate_list()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_UCR_multivariate_list[source]

get_UCR_multivariate_list()

{% endraw %} {% raw %}
158
{% endraw %} {% raw %}

get_UCR_data[source]

get_UCR_data(dsid, path='.', parent_dir='data/UCR', on_disk=True, mode='c', Xdtype='float32', ydtype=None, return_split=True, split_data=True, force_download=False, verbose=False)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
X_train, y_train, X_valid, y_valid = get_UCR_data('natops')
{% endraw %} {% raw %}
dsid = 'natops' 
X_train, y_train, X_valid, y_valid = get_UCR_data(dsid, verbose=True)
X, y, splits = get_UCR_data(dsid, split_data=False)
test_eq(X[splits[0]], X_train)
test_eq(y[splits[1]], y_valid)
test_eq(X[splits[0]], X_train)
test_eq(y[splits[1]], y_valid)
test_type(X, X_train)
test_type(y, y_train)
Dataset: NATOPS
X_train: (180, 24, 51)
y_train: (180,)
X_valid: (180, 24, 51)
y_valid: (180,) 

{% endraw %} {% raw %}

check_data[source]

check_data(X, y=None, splits=None, show_plot=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
dsid = 'ECGFiveDays'
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
check_data(X, y, splits)
check_data(X[:, 0], y, splits)
y = y.astype(np.float32)
check_data(X, y, splits)
y[:10] = np.nan
check_data(X[:, 0], y, splits)
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
splits = get_splits(y, 3)
check_data(X, y, splits)
check_data(X[:, 0], y, splits)
y[:5]= np.nan
check_data(X[:, 0], y, splits)
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 0
splits - n_splits: 2 shape: [23, 861]  overlap: False
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 10
splits - n_splits: 2 shape: [23, 861]  overlap: False
/Users/nacho/opt/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:25: UserWarning: y must not contain nan values
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 3 (294 samples per class) ['1', '2', 'n']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]
{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_Monash_regression_list[source]

get_Monash_regression_list()

{% endraw %} {% raw %}
15
{% endraw %} {% raw %}

get_Monash_regression_data[source]

get_Monash_regression_data(dsid, path='./data/Monash', on_disk=True, mode='c', Xdtype='float32', ydtype=None, split_data=True, force_download=False, verbose=False, timeout=4)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
dsid = "Covid3Month"
X_train, y_train, X_valid, y_valid = get_Monash_regression_data(dsid, on_disk=False, split_data=True, force_download=False)
X, y, splits = get_Monash_regression_data(dsid, on_disk=True, split_data=False, force_download=False, verbose=True)
if X_train is not None: 
    test_eq(X_train.shape, (140, 1, 84))
if X is not None: 
    test_eq(X.shape, (201, 1, 84))
Dataset: Covid3Month
X      : (201, 1, 84)
y      : (201,)
splits : (#140) [0,1,2,3,4,5,6,7,8,9...] (#61) [140,141,142,143,144,145,146,147,148,149...] 

{% endraw %} {% raw %}

get_forecasting_list[source]

get_forecasting_list()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_forecasting_time_series[source]

get_forecasting_time_series(dsid, path='./data/forecasting/', force_download=False, verbose=True, **kwargs)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
ts = get_forecasting_time_series("sunspots", force_download=False)
test_eq(len(ts), 3235)
ts
Dataset: Sunspots
downloading data...
...data downloaded. Path = data/forecasting/Sunspots.csv
Monthly Mean Total Sunspot Number
Date
1749-01-31 96.7
1749-02-28 104.3
1749-03-31 116.7
1749-04-30 92.8
1749-05-31 141.7
... ...
2018-03-31 2.5
2018-04-30 8.9
2018-05-31 13.2
2018-06-30 15.9
2018-07-31 1.6

3235 rows × 1 columns

{% endraw %} {% raw %}
ts = get_forecasting_time_series("weather", force_download=False)
if ts is not None: 
    test_eq(len(ts), 70091)
    print(ts)
Dataset: Weather
downloading data...
...data downloaded. Path = data/forecasting/Weather.csv.zip
       p (mbar)  T (degC)  Tpot (K)  Tdew (degC)  rh (%)  VPmax (mbar)  \
0        996.50     -8.05    265.38        -8.78   94.40          3.33   
1        996.62     -8.88    264.54        -9.77   93.20          3.12   
2        996.84     -8.81    264.59        -9.66   93.50          3.13   
3        996.99     -9.05    264.34       -10.02   92.60          3.07   
4        997.46     -9.63    263.72       -10.65   92.20          2.94   
...         ...       ...       ...          ...     ...           ...   
70086   1002.18     -0.98    272.01        -5.36   72.00          5.69   
70087   1001.40     -1.40    271.66        -6.84   66.29          5.51   
70088   1001.19     -2.75    270.32        -6.90   72.90          4.99   
70089   1000.65     -2.89    270.22        -7.15   72.30          4.93   
70090   1000.11     -3.93    269.23        -8.09   72.60          4.56   

       VPact (mbar)  VPdef (mbar)  sh (g/kg)  H2OC (mmol/mol)  rho (g/m**3)  \
0              3.14          0.19       1.96             3.15       1307.86   
1              2.90          0.21       1.81             2.91       1312.25   
2              2.93          0.20       1.83             2.94       1312.18   
3              2.85          0.23       1.78             2.85       1313.61   
4              2.71          0.23       1.69             2.71       1317.19   
...             ...           ...        ...              ...           ...   
70086          4.09          1.59       2.54             4.08       1280.70   
70087          3.65          1.86       2.27             3.65       1281.87   
70088          3.64          1.35       2.26             3.63       1288.02   
70089          3.57          1.37       2.22             3.57       1288.03   
70090          3.31          1.25       2.06             3.31       1292.41   

             Wx        Wy    max Wx    max Wy       Day sin   Day cos  \
0     -0.204862 -0.046168 -0.614587 -0.138503 -1.776611e-12  1.000000   
1     -0.245971 -0.044701 -0.619848 -0.112645  2.588190e-01  0.965926   
2     -0.175527  0.039879 -0.614344  0.139576  5.000000e-01  0.866025   
3     -0.050000 -0.086603 -0.190000 -0.329090  7.071068e-01  0.707107   
4     -0.368202  0.156292 -0.810044  0.343843  8.660254e-01  0.500000   
...         ...       ...       ...       ...           ...       ...   
70086 -0.855154 -0.160038 -1.336792 -0.250174 -9.990482e-01  0.043619   
70087 -0.716196 -0.726267 -1.348134 -1.367090 -9.537170e-01  0.300706   
70088 -0.661501  0.257908 -1.453438  0.566672 -8.433914e-01  0.537300   
70089 -0.280621 -0.209169 -0.545207 -0.406385 -6.755902e-01  0.737277   
70090 -0.516998 -0.215205 -0.923210 -0.384295 -4.617486e-01  0.887011   

       Year sin  Year cos  
0      0.009332  0.999956  
1      0.010049  0.999950  
2      0.010766  0.999942  
3      0.011483  0.999934  
4      0.012199  0.999926  
...         ...       ...  
70086  0.006183  0.999981  
70087  0.006900  0.999976  
70088  0.007617  0.999971  
70089  0.008334  0.999965  
70090  0.009050  0.999959  

[70091 rows x 19 columns]
{% endraw %} {% raw %}
{% endraw %} {% raw %}

convert_tsf_to_dataframe[source]

convert_tsf_to_dataframe(full_file_path_and_name, replace_missing_vals_with='NaN', value_column_name='series_value')

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_Monash_forecasting_data[source]

get_Monash_forecasting_data(dsid, path='./data/forecasting/', force_download=False, remove_from_disk=False, verbose=True)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
dsid = 'm1_yearly_dataset'
X = get_Monash_forecasting_data(dsid, force_download=False)
if X is not None: 
    test_eq(X.shape, (181, 1, 58))
Dataset: m1_yearly_dataset
downloading data...
...data downloaded
decompressing data...
...data decompressed
converting dataframe to numpy array...
...dataframe converted to numpy array

X.shape: (181, 1, 58)
freq: yearly
forecast_horizon: 6
contain_missing_values: False
contain_equal_length: False
{% endraw %}