In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pycountry

# Download monitor measurements

In [None]:
import download_measurements

In [None]:
import multiprocessing
from joblib import Parallel, delayed

In [None]:
# Get all raw monitor data + Oxford stringency data

In [None]:
# Worker count passed as n_jobs to the joblib Parallel calls further down.
# NOTE(review): the Europe "Parallel download" cell reassigns this to 24, so
# later cells see whichever assignment ran last.
num_cores = 12

## Japan

In [None]:
# Japan
# Enumerate every (year, month, province) combination to request, following
# the layout of the source data: years 2016 and 2017, month slot fixed at 0,
# province codes 1..47.
# `year` is deliberately a plain loop variable: the sequential cell below
# reads it after this cell has run.
lst_months_jp = []
for year in (2016, 2017):
    lst_months_jp.extend((year, 0, prov) for prov in range(1, 48))
# Disabled monthly requests (2019-05..2019-12 and 2020-01..2020-06):
#for month in range(5,13):
#    for prov in range(1,48):
#        lst_months_jp.append((2019, month, prov))
#for month in range(1,7):
#    for prov in range(1,48):
#        lst_months_jp.append((2020, month, prov))

In [None]:
# Run one download job per (year, month, province) tuple across num_cores
# joblib workers; each tuple is unpacked positionally into the downloader.
res = Parallel(n_jobs=num_cores)(
    delayed(download_measurements.jp_monitor_data)(*job)
    for job in lst_months_jp
)

In [None]:
# Sequential
# NOTE(review): `year` is not set in this cell; it is whatever value an earlier
# loop left behind. The parallel cell calls jp_monitor_data(year, month, prov),
# whereas this call passes only `year` -- confirm the function has defaults
# for month and prov.
download_measurements.jp_monitor_data(year)

## Europe

In [None]:
import os, pycountry

In [None]:
# Quick check: resolve a two-letter (alpha_2) country code to its name.
pycountry.countries.get(alpha_2='MK').name

In [None]:
# Europe
# Two-letter country codes to download (same code style as the pycountry
# alpha_2 lookup used above).
short_countries = ['BG','IS','DK','CZ','SI','AD','GI','GR','CH',
                   'HU','RO','LU','ME','LV','LT','CY','RS','MK',
                   'TR','PT','MT','SE','HR','NL','IE','GE',
                   'NO','BA','BE','SK','PL','FI','EE','AT','AL',
                   'FR', 'DE', 'GB', 'IT', 'ES']
# long_countries = [pycountry.countries.get(alpha_2=c).name for c in short_countries]
# Build the (year, country) job list for 2016-2020 and download as we go.
lst_eu = []
for year in range(2016, 2021):
    for country in short_countries:
        lst_eu.append((year, country))
        # NOTE(review): this downloads sequentially while the list is being
        # built; the "Parallel download" cell later iterates over the same
        # lst_eu, so running both requests every (year, country) twice --
        # confirm both are wanted.
        download_measurements.eu_monitor_data(year, country)

In [None]:
# Print every (country, year, species) combination whose raw Europe feather
# file is not present under `path`.
path = '/net/fs03/d1/gchossie/2020_aq/monitor_data/raw_data/europe/'
for year in range(2016, 2021):
    for country in short_countries:
        for sp in ('NO2', 'PM2.5', 'O3'):
            if os.path.exists(path + f'{country}_raw_{year}_{sp}.feather'):
                continue  # file already on disk, nothing to report
            print(country, year, sp)

In [None]:
# Display the current values of the loop variables, i.e. the last
# (year, country) pair reached by whichever loop cell ran most recently.
year, country

In [None]:
# Parallel download
# Spread the (year, country) jobs in lst_eu over 24 joblib workers. Note that
# this rebinds the notebook-wide num_cores.
num_cores = 24
Parallel(n_jobs=num_cores)(
    delayed(download_measurements.eu_monitor_data)(*job) for job in lst_eu)

## US

In [None]:
# US
# One joblib job per year, 2016 through 2020, on 5 workers.
Parallel(n_jobs=5)(
    map(delayed(download_measurements.us_monitor_data), range(2016, 2021)))

In [None]:
# Sequential single-year run.
# NOTE(review): `year` is not set in this cell; it is whatever value an
# earlier loop left behind in the kernel.
download_measurements.us_monitor_data(year)

## China

In [None]:
# China
# One joblib job per year, 2016 through 2019, on 4 workers.
Parallel(n_jobs=4)(
    delayed(download_measurements.ch_monitor_data)(yr)
    for yr in range(2016, 2020)
)

In [None]:
# Sequential single-year run.
# NOTE(review): `year` is not set in this cell; it is whatever value an
# earlier loop left behind in the kernel.
download_measurements.ch_monitor_data(year)

## South Korea

In [None]:
# South Korea
# NOTE(review): relies on `year` left over from an earlier cell; the Parallel
# cell that follows covers 2016-2020 explicitly.
download_measurements.kr_monitor_data(year)

In [None]:
# One joblib job per year, 2016 through 2020, using num_cores workers.
Parallel(n_jobs=num_cores)(
    map(delayed(download_measurements.kr_monitor_data), range(2016, 2021))
)

In [None]:
# NOTE(review): unconditional call using the leftover `year`; the next cell
# makes the same call only when year == 2020 -- one of the two is probably
# redundant.
download_measurements.kr_monitor_data_recent(year)

In [None]:
# Need to scrape data from the website for April and May 2020
# NOTE(review): `year` is not defined in this cell; the branch only runs if an
# earlier cell happened to leave year == 2020 in the kernel.
if year == 2020:
    download_measurements.kr_monitor_data_recent(year)

In [None]:
# Get Oxford stringency index
# (the function is spelled `get_oxford_strigency` in this call; the call is
# left unchanged because the module is not visible from here)
download_measurements.get_oxford_strigency()

# Get information on monitors

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import get_monitor_info

Download the monitor info from the AQ agencies

In [None]:
# Monitor info for the US (per the function name; see get_monitor_info).
get_monitor_info.us_monitor_utility()

In [None]:
# Monitor info for Japan (per the function name; see get_monitor_info).
get_monitor_info.jp_monitor_utility()

In [None]:
# Monitor info for South Korea, followed by a geocoding step for the monitor
# locations (both per the function names; see get_monitor_info).
get_monitor_info.kr_monitor_utility()
get_monitor_info.geocode_kr_monitor_locations()

In [None]:
# Monitor info for Europe (per the function name; see get_monitor_info).
get_monitor_info.eu_monitor_utility()

NB: the Chinese monitor data is already provided alongside the code.

Gets a dictionary of dataframes (1 per species), each with a unique ID,
the country, region, and population. The population of grid cells with more
than 1 monitor is split equally between monitors. The ID column of these
dataframes is used to build the regional time series of measurements.

In [None]:
# Merge the monitor info from all sources. The markdown above describes the
# result as one dataframe per species.
monitors = get_monitor_info.merge_all_monitor_data()

In [None]:
# List the contents of raw_data/europe (path is relative to the notebook's
# working directory).
!ls -larth raw_data/europe

In [None]:
# eu_monitor_utility is only reachable through the get_monitor_info module
# (the notebook does `import get_monitor_info`, never a from-import), so the
# unqualified call would raise NameError on a fresh kernel.
get_monitor_info.eu_monitor_utility()

# Process raw measurements

In [None]:
import process_measurements

In [None]:
# Year and species used by the processing cells that follow.
year, sp = 2020, 'O3'

In [None]:
year = 2020
# NOTE(review): `sp` is not set in this cell; it comes from an earlier cell.
df = process_measurements.initialize_df(year, sp)
# Profile the South Korea step with IPython's %prun.
%prun df = process_measurements.add_kr_measurements(df, year, sp)

In [None]:
# Preview df without the rows that are entirely NaN. The result is only
# displayed, not assigned, so df itself is unchanged.
df.dropna(axis=0, how='all')

In [None]:
import pickle
import numpy as np

In [None]:
# Merge the per-year O3 measurement pickles (2016-2020) into one wide frame,
# side by side along the column axis.
path = '/net/fs03/d1/gchossie/2020_aq/monitor_data/raw_data/'
yearly_frames = []
for year in range(2016, 2021):
    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # produced by this project.
    with open(path + f'O3_{year}_all_measurements.pkl', 'rb') as f:
        yearly_frames.append(pickle.load(f))
# Concatenate once instead of growing `out` inside the loop: this avoids
# re-copying the accumulated frame on every iteration and avoids seeding the
# result with an empty DataFrame.
out = pd.concat(yearly_frames, axis=1)

In [None]:
# Treat the -9999 sentinel as missing data.
# Assign through the DataFrame itself: `out.values` is not guaranteed to be a
# writable view of the frame (it can be a copy, or read-only under
# copy-on-write), so writing into it may silently leave `out` unchanged.
out[out == -9999] = np.nan

In [None]:
# Preview `out` without the rows that are entirely NaN.
# NOTE(review): the result is not assigned, so `out` (pickled in the next
# cell) still contains its all-NaN rows -- confirm that is intended.
out.dropna(axis=0, how='all')#thresh=np.floor(0.9*out.shape[1]))

In [None]:
# Write the merged frame into the same `path` directory the per-year pickles
# were read from.
# NOTE(review): the f-string has no placeholders; the species is hard-coded
# as O3 here.
with open(path + f'O3_all_measurements.pkl', 'wb') as f:
    pickle.dump(out, f)

# Create time-series for testing

In [3]:
# Directory holding the raw monitor data.
# NOTE(review): `datadir` is not referenced by any other cell visible in this
# notebook.
datadir = '/net/fs03/d1/gchossie/2020_aq/monitor_data/raw_data/'

In [28]:
import test_significance

In [36]:
# Species and region for the test time series.
sp, region = 'NO2', 'Massachusetts'

In [37]:
# Build the time series for the chosen species and region.
# NOTE(review): the saved output shows a pandas SettingWithCopyWarning emitted
# during this call; the assignment that triggers it is presumably inside
# test_significance -- worth fixing there with .loc.
test_significance.create_ts(sp,region)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,date,value
0,2016-01-01,14.638693
1,2016-01-02,14.809497
2,2016-01-03,13.901877
3,2016-01-04,12.628944
4,2016-01-05,13.541568
...,...,...
1634,2020-06-22,17.984141
1635,2020-06-23,24.221653
1636,2020-06-24,23.760187
1637,2020-06-25,27.433644


# Interpolate monitor data to create prediction time series

In [32]:
# NOTE(review): execution counts show this cell ran as In [32], before `sp`
# was reassigned to 'NO2' in In [36]; the load cell below (In [38]) therefore
# likely ran with 'NO2'. A Restart & Run All would use 'O3' instead --
# confirm which is intended.
sp = 'O3'

In [38]:
# Load `{sp}_all_measurements.pkl` and `{sp}_monitor_locations.feather` for
# the current species `sp`.
# NOTE(review): `monitors` was bound earlier to the result of
# get_monitor_info.merge_all_monitor_data(); this rebinds it to the frame read
# from the feather file.
PATH = '/net/fs03/d1/gchossie/2020_aq/monitor_data/raw_data/'
vals = pd.read_pickle(PATH + f'{sp}_all_measurements.pkl')
monitors = pd.read_feather(PATH + f'{sp}_monitor_locations.feather')

In [43]:
# Move the index into a regular column and label it 'Identifier', then show
# the frame. Re-running this cell on the already-reset frame adds another
# index column.
vals = vals.reset_index()
vals = vals.rename(columns={'index': 'Identifier'})
vals

Unnamed: 0,Identifier,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00,2016-01-07 00:00:00,2016-01-08 00:00:00,2016-01-09 00:00:00,...,2020-06-17 00:00:00,2020-06-18 00:00:00,2020-06-19 00:00:00,2020-06-20 00:00:00,2020-06-21 00:00:00,2020-06-22 00:00:00,2020-06-23 00:00:00,2020-06-24 00:00:00,2020-06-25 00:00:00,2020-06-26 00:00:00
0,STA_ES1964A,,,,,,,,,,...,,,,,,,,,,
1,STA.DE_DENW181,,,,,,,,,,...,,,,,,,,,,
2,STA-BETR804,,,,,,,,,,...,,,,,,,,,,
3,12225080,33.913,33.3043,42.6957,60.9565,51.1304,39.4783,15.9091,24.4348,39.5652,...,,,,,,,,,,
4,STA.IT0912A,9.89492,8.67099,20.101,15.0076,7.79559,,,1.13434,0.984657,...,98.2324,90.1567,103.934,111.554,104.84,122.67,132.906,154.784,135.151,115.525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10774,420750100,60,56,54,62,56,40,8,0,0,...,,,,,,,,,,
10775,1162A,47.25,27.5417,12.2917,21.75,77.25,49.9583,18.8333,38.3333,41,...,100.708,53.875,101.783,102.708,76.9583,62.6522,59.4583,79.2917,74.1667,111.25
10776,STA.DE_DERP063,,,,,,,,,,...,,,,,,,,,,
10777,1390A,63.3333,38.125,24.7917,24.7273,11.3636,33.5833,68.1667,49,45,...,35.375,41.2917,48.0833,47.1667,38.5217,51.375,55.2609,41.619,43.5,40.0417


In [45]:
# Attach each monitor's coordinates to its measurements (default inner join on
# 'Identifier') and index the result by identifier.
vals.merge(
    monitors[['Identifier', 'Latitude', 'Longitude']],
    on='Identifier',
).set_index('Identifier')

Unnamed: 0_level_0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00,2016-01-07 00:00:00,2016-01-08 00:00:00,2016-01-09 00:00:00,2016-01-10 00:00:00,...,2020-06-19 00:00:00,2020-06-20 00:00:00,2020-06-21 00:00:00,2020-06-22 00:00:00,2020-06-23 00:00:00,2020-06-24 00:00:00,2020-06-25 00:00:00,2020-06-26 00:00:00,Latitude,Longitude
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
STA_ES1964A,,,,,,,,,,,...,,,,,,,,,41.386639,2.057392
STA.DE_DENW181,,,,,,,,,,,...,,,,,,,,,51.442993,8.360797
STA-BETR804,,,,,,,,,,,...,,,,,,,,,51.206304,4.441947
12225080,33.913,33.3043,42.6957,60.9565,51.1304,39.4783,15.9091,24.4348,39.5652,38.3478,...,,,,,,,,,35.343056,139.865833
STA.IT0912A,9.89492,8.67099,20.101,15.0076,7.79559,,,1.13434,0.984657,4.24015,...,103.934,111.554,104.84,122.67,132.906,154.784,135.151,115.525,45.193889,9.164722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
STA-PT03075,,,,,,,,,,,...,,,,,,,,,38.720278,-9.145833
1162A,47.25,27.5417,12.2917,21.75,77.25,49.9583,18.8333,38.3333,41,28.375,...,101.783,102.708,76.9583,62.6522,59.4583,79.2917,74.1667,111.25,31.301900,120.591000
STA.DE_DERP063,,,,,,,,,,,...,,,,,,,,,50.350547,7.600248
1390A,63.3333,38.125,24.7917,24.7273,11.3636,33.5833,68.1667,49,45,35.4583,...,48.0833,47.1667,38.5217,51.375,55.2609,41.619,43.5,40.0417,23.012778,113.794444


In [47]:
import interpolate

In [48]:
# Time the preprocessing step in the interpolate module; the species is passed
# as the literal 'NO2' here, independent of the notebook-level `sp`.
%time interpolate.preprocess_raw_vals('NO2')

CPU times: user 4.28 s, sys: 1.04 s, total: 5.32 s
Wall time: 5.33 s


Unnamed: 0_level_0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00,2016-01-07 00:00:00,2016-01-08 00:00:00,2016-01-09 00:00:00,2016-01-10 00:00:00,...,2020-06-19 00:00:00,2020-06-20 00:00:00,2020-06-21 00:00:00,2020-06-22 00:00:00,2020-06-23 00:00:00,2020-06-24 00:00:00,2020-06-25 00:00:00,2020-06-26 00:00:00,Latitude,Longitude
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12225080,33.913,33.3043,42.6957,60.9565,51.1304,39.4783,15.9091,24.4348,39.5652,38.3478,...,,,,,,,,,35.343056,139.865833
STA.IT0912A,9.89492,8.67099,20.101,15.0076,7.79559,,,1.13434,0.984657,4.24015,...,103.934,111.554,104.84,122.67,132.906,154.784,135.151,115.525,45.193889,9.164722
14215010,53.913,47.6522,56.8696,49.2174,60.6957,36.0909,19.7391,42.6087,40.8696,20.8696,...,,,,,,,,,35.446389,139.390556
1368A,76.625,67.125,39.75,37.25,46,70.625,74.6667,72.7917,66.9583,73.25,...,32.375,29.7083,28.125,33.125,34.7083,34.625,30.2083,28.2917,22.229400,113.495000
STA_PL0605A,,,,,,,,,,,...,,,,,,,,,52.672648,19.079261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
STA_PL0768A,,,,,,,,,,,...,74.875,85.375,72.25,77.625,57.5,91.25,69.125,,53.852639,22.984611
22220010,54.5217,54,53.2727,46.087,34.5217,30.5217,33.6522,60.9091,47.0435,28.6957,...,,,,,,,,,35.197500,138.913333
1162A,47.25,27.5417,12.2917,21.75,77.25,49.9583,18.8333,38.3333,41,28.375,...,101.783,102.708,76.9583,62.6522,59.4583,79.2917,74.1667,111.25,31.301900,120.591000
1390A,63.3333,38.125,24.7917,24.7273,11.3636,33.5833,68.1667,49,45,35.4583,...,48.0833,47.1667,38.5217,51.375,55.2609,41.619,43.5,40.0417,23.012778,113.794444
