import os
import geopandas
import numpy as np
import matplotlib.pyplot as plt
import requests
# set up paths for data access
base_folder = os.getcwd() + '/realsat_test3/'  # change this to the absolute path of all the zip files
# makedirs with exist_ok=True is race-free, unlike the isdir()-then-mkdir pattern
os.makedirs(base_folder, exist_ok=True)
monthly_shapes_folder = base_folder + 'monthly_shapes/'  # path for extracting monthly shapes
os.makedirs(monthly_shapes_folder, exist_ok=True)
monthly_timeseries_file = base_folder + 'monthly_timeseries.zip'  # absolute path of the monthly timeseries zip file
monthly_timeseries_folder = base_folder + 'monthly_timeseries/'  # path for extracting monthly timeseries
os.makedirs(monthly_timeseries_folder, exist_ok=True)
base_shapefile = 'zip://' + base_folder + 'ReaLSAT.zip'  # absolute path of the base shapefile.
# download data
print('downloading base shapefile ...')
os.system('wget https://zenodo.org/record/5039223/files/ReaLSAT.zip -O ' + base_folder + 'ReaLSAT.zip')
print('downloading monthly timeseries zipfile ...')
os.system('wget https://zenodo.org/record/5039223/files/monthly_timeseries.zip -O ' + base_folder + 'monthly_timeseries.zip')
print('downloading monthly shapes zipfile')
# one zip per 10x10 degree cell; not every cell exists on Zenodo, so probe first
for lon in range(-180, 180, 10):
    for lat in range(-90, 90, 10):
        cur_file_name = 'monthly_shapes_' + str(lon) + '_' + str(lat) + '.zip'
        r = requests.get('https://zenodo.org/record/5039223/files/' + cur_file_name, stream=True)
        exists = r.ok
        r.close()  # close the streamed connection; only the status code was needed
        if not exists:
            continue
        print(lon, lat)
        os.system('wget https://zenodo.org/record/5039223/files/' + cur_file_name + ' -O ' + base_folder + cur_file_name)
The base shapefile contains the reference shape of all the reservoirs. Please see below the description of different attributes -
ID: the unique ID for water body. ID values are used as names of shapefiles that contain monthly shapes.
RESERVOIR: categorizes water bodies into two sets - 1 represents reservoirs manually verified by visual validation, and 0 represents other water bodies. Note that the reservoir list is not exhaustive and water bodies with 0 value could be reservoirs. The reservoir subset was created using a machine learning methodology. Please refer to https://ai4earthscience.github.io/iclr-2020-workshop/papers/ai4earth28.pdf for more details.
AREA: number of LANDSAT 30-m pixels covered by the water body. This area was calculated by counting pixels in a water body that exists as water at least 10% of the time.
Hylak_id: The water body ID based on the HydroLAKES dataset. Water bodies that are not present in HydroLAKES but are available in ReaLSAT have this ID set to 0.
Hylak_frc: % overlap between ReaLSAT and HydroLAKES database.
CONTINENT: between 0 and 8 representing different continents -
0:Other 1:Asia 2:North America 3:Europe 4:Africa 5:South America 6:Oceania 7:Australia 8:Antarctica
geometry: reference shape of the reservoir
The remaining attributes listed below have been added to flag lakes where assumptions made by the ORBIT approach (used to create ReaLSAT dataset) might be violated -
RIVR_SCORE: morphological score used to exclude river segments.
COMP_SCORE: score to identify lakes that break into multiple components frequently.
EPHM_SCORE: score to identify lakes that are very transient.
REL_DYN: categorizes the water bodies into two sets - 1 represents water bodies for which changes made by the ORBIT algorithm (correcting erroneous labels in GSW dataset and filling missing labels) are more reliable. Please refer to the paper (http://umnlcc.cs.umn.edu/realsat/data/ReaLSAT.pdf) for more details.
# loading the base shapefile
rf = geopandas.read_file(base_shapefile)
print(rf)
print('Number of water bodies: ' + str(rf.shape[0]))
print('Number of reservoirs: ' + str(np.sum(rf['RESERVOIR'].values == 1)))
# display the reference shape of a water body
ID = 396029
matching_rows = np.where(rf['ID'].values == ID)[0]
idx = matching_rows[0]  # first (and only) row whose ID matches
temp = rf['geometry'][idx]
temp  # notebook-style: renders the geometry inline
The csv contains monthly timeseries of the reservoir. Each csv contains 4 rows:
Row 1: fill percentage time series
Row 2: update percentage time series
Row 3: area time series from ReaLSAT
Row 4: area time series using GSW labels directly. Specifically, we simply counted number of water pixels in the monthly extent map as the area for that month. Note that the area variations are affected by missing data in GSW labels unlike ReaLSAT based timeseries (Row 3) which imputes missing data using a physics guided machine learning algorithm.
Note that the quality of label updates in ReaLSAT gets impacted if there are a lot of consecutive timesteps with large amounts of missing data (i.e. very high fill percentage values). Here, we provide a filter to remove timesteps with significant missing data in nearby months.
# extract timeseries data from the zipped file
temp_df = rf[rf['ID'].values == ID]
# snap the centroid to the bottom-left corner of its 10 deg x 10 deg grid cell
centroid_lon = temp_df.centroid.x.to_numpy()[0]
centroid_lat = temp_df.centroid.y.to_numpy()[0]
cx = str(int(np.floor(centroid_lon / 10) * 10))  # bottom left longitude of the cell
cy = str(int(np.floor(centroid_lat / 10) * 10))  # bottom left latitude of the cell
# extracting only the csv file for the lake of interest
csv_member = 'monthly_timeseries_' + cx + '_' + cy + '/' + str(ID).zfill(6) + '.csv'
os.system('unzip -o ' + monthly_timeseries_file + ' ' + csv_member + ' -d ' + monthly_timeseries_folder)
# loading timeseries data from the extracted csv
timeseries_path = (monthly_timeseries_folder + 'monthly_timeseries_' + cx + '_' + cy
                   + '/' + str(ID).zfill(6) + '.csv')
data = np.loadtxt(timeseries_path, delimiter=',')
# preparing time information for plotting: one (year, month) pair per timestep,
# covering Jan-1984 through Dec-2015 (32 years x 12 months = 384 timesteps)
dyear = np.repeat(np.arange(1984, 2016), 12)   # 1984 x12, 1985 x12, ...
dmonth = np.tile(np.arange(12), 2016 - 1984)   # 0..11 repeated for every year
# check if the average missing percentage before and after any given timestep is more than the threshold.
def prune_time_steps(fill_ts, window_size, threshold=90):
    """Flag timesteps surrounded by too much missing data.

    A timestep is flagged when the mean fill (missing-data) percentage of the
    up-to-``window_size`` timesteps immediately before OR after it exceeds
    ``threshold``. The first and last timesteps have one-sided neighbourhoods
    and are flagged on their own fill value instead.

    Parameters
    ----------
    fill_ts : np.ndarray
        1-D fill-percentage time series (values between 0 and 100).
    window_size : int
        Number of neighbouring timesteps to average on each side.
    threshold : float, optional
        Mean fill percentage above which a timestep is pruned (default 90,
        matching the original hard-coded cutoff).

    Returns
    -------
    np.ndarray
        Boolean mask, same length as ``fill_ts``; True marks pruned timesteps.
    """
    T = fill_ts.shape[0]
    prn_ts = np.zeros((T,)).astype(bool)
    if T == 0:
        return prn_ts  # nothing to prune on an empty series
    for j in range(1, T - 1):
        cur_ts_before = fill_ts[max(0, j - window_size):j]
        cur_ts_after = fill_ts[j + 1:min(j + window_size + 1, T)]
        if np.mean(cur_ts_before) > threshold or np.mean(cur_ts_after) > threshold:
            prn_ts[j] = True
    # endpoints: judged on their own fill value only
    if fill_ts[0] > threshold:
        prn_ts[0] = True
    if fill_ts[T - 1] > threshold:
        prn_ts[T - 1] = True
    return prn_ts
# plot the area timeseries, masking months with too much missing data nearby
plt.figure(figsize=(10, 5))
pixel_area_sqkm = 0.0009  # area of one LANDSAT 30m x 30m pixel in sq. kms.
fill_ts = data[0, :]      # percentage of missing data in each month
update_ts = data[1, :]    # percentage of pixels updated by ReaLSAT (corrections as well as imputations)
area_ts = data[2, :].astype(float) * pixel_area_sqkm      # ReaLSAT area timeseries
area_ts_gsw = data[3, :].astype(float) * pixel_area_sqkm  # area from raw GSW labels
# timesteps whose area values were potentially affected by large amounts of
# missing data around them are masked out with NaN so they leave gaps in the plot
bad_ts = prune_time_steps(data[0, :], 3)
area_ts[bad_ts] = np.nan
area_ts_gsw[bad_ts] = np.nan
plt.plot(area_ts, '-b', label='ReaLSAT')
plt.plot(area_ts_gsw, '-g', label='GSW')
plt.ylabel('Area (sq. kms.)', fontsize=15)
plt.xlabel('Time', fontsize=15)
tick_pos = np.arange(0, 384, 48)  # a labelled tick every 4 years
plt.xticks(tick_pos, dyear[tick_pos], fontsize=15)
plt.xticks(rotation=90)
plt.yticks(fontsize=15)
plt.grid()
plt.show()
Each water body has a shapefile that contains its monthly shapes. Please see below the description of different attributes -
tid: time id. It ranges from 1 to 384 (one for each month starting from Jan-1984 to Dec-2015). Note that if a row with a given time id is not present in the file, it implies that there was a lot of missing data around that timestep and hence it was removed. Timesteps when a water body is completely dry will have an empty shape.
id: same as ID
area: number of LANDSAT pixels (at 30m resolution). Multiply it by 0.0009 to get the area in sq. kms.
fill: Percentage of pixels (ranges between 0 and 100) filled (by algorithms used to create ReaLSAT) that had missing labels in the original GSW dataset.
update: Percentage of pixels (ranges between 0 and 100) changed by the algorithms used to create ReaLSAT. This number represents the percentage of pixels filled plus the percentage of pixels whose labels were corrected by the algorithms.
month: month of the year
year: year
geometry: reference shape of the reservoir for the given month
# extract ESRI shapefile from the zipped file
temp_df = rf[rf['ID'].values == ID]
# bottom-left corner of the 10 deg x 10 deg cell containing the lake centroid
cx = str(int(np.floor(temp_df.centroid.x.to_numpy()[0] / 10) * 10))  # longitude
cy = str(int(np.floor(temp_df.centroid.y.to_numpy()[0] / 10) * 10))  # latitude
cell_zip = base_folder + 'monthly_shapes_' + cx + '_' + cy + '.zip'
lake_member = 'monthly_shapes_' + cx + '_' + cy + '/' + str(ID).zfill(6) + '.zip'
os.system('unzip -o ' + cell_zip + ' ' + lake_member + ' -d ' + monthly_shapes_folder)
# load the monthly shapes for the lake of interest
shapefile_path = ('zip://' + monthly_shapes_folder + 'monthly_shapes_' + cx + '_' + cy
                  + '/' + str(ID).zfill(6) + '.zip')
sdf = geopandas.read_file(shapefile_path)
# overall bounding box across all monthly shapes; nan-aware reductions skip
# months with empty geometries (completely dry timesteps)
bounds = sdf.bounds
minx = np.nanmin(bounds['minx'])
miny = np.nanmin(bounds['miny'])
maxx = np.nanmax(bounds['maxx'])
maxy = np.nanmax(bounds['maxy'])
# set two timesteps for comparison. The dataset has a total of 384 months
m1 = 200
m2 = 300
# plot surface extent of the two selected months side by side
f, ax = plt.subplots(1, 2)
sdf[sdf['tid'] == m1].plot(ax=ax[0])
sdf[sdf['tid'] == m2].plot(ax=ax[1])
for a in ax:
    # Set the shared limits FIRST, then rotate tick labels via tick_params.
    # The previous set_xticklabels(get_xticks()) pattern froze labels before
    # the limits changed, so the axes could show stale, incorrect values.
    a.set_xlim(minx, maxx)
    a.set_ylim(miny, maxy)
    a.tick_params(axis='x', rotation=45)
    a.tick_params(axis='y', rotation=45)
plt.tight_layout()