Source code for tobler.area_weighted.area_weighted

"""
Area Weighted Interpolation

"""

import warnings

import numpy as np
import geopandas as gpd
import pandas as pd
from scipy.sparse import dok_matrix, diags

from .vectorized_raster_interpolation import fast_append_profile_in_gdf
from tobler.util.util import _check_crs, _nan_check, _check_presence_of_crs


def area_tables_binning(source_df, target_df):
    """Construct area allocation and source-target correspondence tables using a spatial indexing approach.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        GeoDataFrame containing input data and polygons
    target_df : geopandas.GeoDataFrame
        GeoDataFrame defining the output geometries

    Returns
    -------
    tables : scipy.sparse.dok_matrix
    """
    if not _check_crs(source_df, target_df):
        return None

    df1 = source_df
    df2 = target_df

    l1, b1, r1, t1 = df1.total_bounds
    l2, b2, r2, t2 = df2.total_bounds
    total_bounds = [min(l1, l2), min(b1, b2), max(r1, r2), max(t1, t2)]

    n1, k1 = df1.shape
    n2, k2 = df2.shape
    numPoly = n1 + n2

    DELTA = 0.000001

    # constants for bucket sizes
    BUCK_SM = 8
    BUCK_LG = 80
    SHP_SMALL = 1000

    shapebox = total_bounds

    # bucket size
    if numPoly < SHP_SMALL:
        bucketmin = numPoly // BUCK_SM + 2
    else:
        bucketmin = numPoly // BUCK_LG + 2

    # bucket length
    lengthx = ((shapebox[2] + DELTA) - shapebox[0]) / bucketmin
    lengthy = ((shapebox[3] + DELTA) - shapebox[1]) / bucketmin

    # initialize buckets
    columns1 = [set() for i in range(bucketmin)]
    rows1 = [set() for i in range(bucketmin)]
    columns2 = [set() for i in range(bucketmin)]
    rows2 = [set() for i in range(bucketmin)]

    minbox = shapebox[:2] * 2  # minx,miny,minx,miny
    binWidth = [lengthx, lengthy] * 2  # lenx,leny,lenx,leny

    poly2Column1 = [set() for i in range(n1)]
    poly2Row1 = [set() for i in range(n1)]
    poly2Column2 = [set() for i in range(n2)]
    poly2Row2 = [set() for i in range(n2)]

    # assign each source polygon to the grid cells its bounding box spans
    for i in range(n1):
        shpObj = df1.geometry.iloc[i]
        projBBox = [
            int((shpObj.bounds[j] - minbox[j]) / binWidth[j]) for j in range(4)
        ]
        for j in range(projBBox[0], projBBox[2] + 1):
            columns1[j].add(i)
            poly2Column1[i].add(j)
        for j in range(projBBox[1], projBBox[3] + 1):
            rows1[j].add(i)
            poly2Row1[i].add(j)

    # assign each target polygon to the grid cells its bounding box spans
    for i in range(n2):
        shpObj = df2.geometry.iloc[i]
        projBBox = [
            int((shpObj.bounds[j] - minbox[j]) / binWidth[j]) for j in range(4)
        ]
        for j in range(projBBox[0], projBBox[2] + 1):
            columns2[j].add(i)
            poly2Column2[i].add(j)
        for j in range(projBBox[1], projBBox[3] + 1):
            rows2[j].add(i)
            poly2Row2[i].add(j)

    table = dok_matrix((n1, n2), dtype=np.float32)

    for polyId in range(n1):
        idRows = poly2Row1[polyId]
        idCols = poly2Column1[polyId]
        rowNeighbors = set()
        colNeighbors = set()
        for row in idRows:
            rowNeighbors = rowNeighbors.union(rows2[row])
        for col in idCols:
            colNeighbors = colNeighbors.union(columns2[col])
        # candidate targets must share at least one row bucket and one column bucket
        neighbors = rowNeighbors.intersection(colNeighbors)
        for neighbor in neighbors:
            if df1.geometry.iloc[polyId].intersects(df2.geometry.iloc[neighbor]):
                intersection = df1.geometry.iloc[polyId].intersection(
                    df2.geometry.iloc[neighbor]
                )
                table[polyId, neighbor] = intersection.area

    return table
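
# A minimal usage sketch for ``area_tables_binning``. The two toy layers below
# are hypothetical (a single 2x2 source square split into two 1x2 target
# halves); any pair of polygon GeoDataFrames sharing a projected CRS behaves
# the same way:
#
#   >>> import geopandas as gpd
#   >>> from shapely.geometry import box
#   >>> src = gpd.GeoDataFrame({"pop": [100]}, geometry=[box(0, 0, 2, 2)], crs="EPSG:3857")
#   >>> tgt = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 2), box(1, 0, 2, 2)], crs="EPSG:3857")
#   >>> table = area_tables_binning(src, tgt)
#   >>> table.toarray()  # intersection area of the source with each target half
#   array([[2., 2.]], dtype=float32)
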
def area_tables(source_df, target_df):
    """Construct area allocation and source-target correspondence tables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
    target_df : geopandas.GeoDataFrame

    Returns
    -------
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    Union geometry is a geometry formed by the intersection of a source
    geometry and a target geometry.

    SU maps source geometry to union geometry, UT maps union geometry to
    target geometry.
    """
    if not _check_crs(source_df, target_df):
        return None
    n_s = source_df.shape[0]
    n_t = target_df.shape[0]
    _left = np.arange(n_s)
    _right = np.arange(n_t)
    source_df.loc[:, "_left"] = _left  # create temporary index for union
    target_df.loc[:, "_right"] = _right  # create temporary index for union
    res_union = gpd.overlay(source_df, target_df, how="union")
    n_u, _ = res_union.shape
    SU = np.zeros(
        (n_s, n_u)
    )  # holds area of intersection of source geom with union geom
    UT = np.zeros((n_u, n_t))  # binary table mapping union geom to target geom
    for index, row in res_union.iterrows():
        # only union polygons that intersect both a source and a target geometry matter
        if not np.isnan(row["_left"]) and not np.isnan(row["_right"]):
            s_id = int(row["_left"])
            t_id = int(row["_right"])
            SU[s_id, index] = row["geometry"].area
            UT[index, t_id] = 1
    source_df.drop(["_left"], axis=1, inplace=True)
    target_df.drop(["_right"], axis=1, inplace=True)
    return SU, UT
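
# A usage sketch for ``area_tables``, continuing the hypothetical toy layers
# ``src`` and ``tgt`` defined in the sketch above. The product of the two
# tables collapses the union layer back to a direct source-target area table:
#
#   >>> SU, UT = area_tables(src, tgt)
#   >>> SU.shape  # (n_source, n_union): intersection areas with union polygons
#   >>> UT.shape  # (n_union, n_target): binary union-to-target mapping
#   >>> SU.dot(UT)  # (n_source, n_target) area table, as in area_tables_binning
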
def area_interpolate_binning(
    source_df,
    target_df,
    extensive_variables=None,
    intensive_variables=None,
    table=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
    target_df : geopandas.GeoDataFrame
    extensive_variables : list
        columns in dataframes for extensive variables
    intensive_variables : list
        columns in dataframes for intensive variables
    table : scipy.sparse.dok_matrix
    allocate_total : boolean
        True if total value of source area should be allocated.
        False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is
        exhausted by intersections. See Notes for more details.

    Returns
    -------
    estimates : geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and
        target_df geometry as output geometry

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    if not _check_crs(source_df, target_df):
        return None
    if table is None:
        table = area_tables_binning(source_df, target_df)
    den = source_df["geometry"].area.values
    if allocate_total:
        den = np.asarray(table.sum(axis=1))
    den = den + (den == 0)
    den = 1.0 / den
    n = den.shape[0]
    den = den.reshape((n,))
    den = diags([den], [0])
    weights = den.dot(table)  # row standardize table

    dfs = []
    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            estimates = diags([vals], [0]).dot(weights)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates.tolist()[0])
        extensive = np.asarray(extensive)
        extensive = pd.DataFrame(extensive.T, columns=extensive_variables)

    area = np.asarray(table.sum(axis=0))
    den = 1.0 / (area + (area == 0))
    n, k = den.shape
    den = den.reshape((k,))
    den = diags([den], [0])
    weights = table.dot(den)  # column standardize table

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            n = vals.shape[0]
            vals = vals.reshape((n,))
            estimates = diags([vals], [0])
            estimates = estimates.dot(weights).sum(axis=0)
            intensive.append(estimates.tolist()[0])
        intensive = np.asarray(intensive)
        intensive = pd.DataFrame(intensive.T, columns=intensive_variables)

    if extensive_variables:
        dfs.append(extensive)
    if intensive_variables:
        dfs.append(intensive)

    # concatenate along columns so extensive and intensive variables sit
    # side by side, one row per target geometry
    df = pd.concat(dfs, axis=1)
    df["geometry"] = target_df["geometry"]
    df = gpd.GeoDataFrame(df)
    return df
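
# A usage sketch for ``area_interpolate_binning``, again continuing the toy
# layers from above. With the default ``allocate_total=True`` the extensive
# weights are w_{i,j} = a_{i,j} / sum_k a_{i,k}, so each target half receives
# half of the source value of the hypothetical ``pop`` column:
#
#   >>> est = area_interpolate_binning(src, tgt, extensive_variables=["pop"])
#   >>> est["pop"].values
#   array([50., 50.])
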
def area_interpolate(
    source_df,
    target_df,
    extensive_variables=None,
    intensive_variables=None,
    tables=None,
    allocate_total=True,
):
    """
    Area interpolation for extensive and intensive variables.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries
    target_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries
    extensive_variables : list (optional)
        columns in dataframes for extensive variables
    intensive_variables : list (optional)
        columns in dataframes for intensive variables
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t
    allocate_total : boolean
        True if total value of source area should be allocated.
        False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is
        exhausted by intersections. See Notes for more details.

    Returns
    -------
    estimates : geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and
        target_df geometry as output geometry

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    For an extensive variable, the estimate at target polygon j (default case) is:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{i,k}

    If the area of the source polygon is not exhausted by intersections with
    target polygons and there is reason to not allocate the complete value of
    an extensive attribute, then setting allocate_total=False will use the
    following weights:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

    where a_i is the total area of source polygon i.

    For an intensive variable, the estimate at target polygon j is:

    .. math::

        v_j = \\sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \\sum_k a_{k,j}
    """
    if not _check_crs(source_df, target_df):
        return None
    if tables is None:
        SU, UT = area_tables(source_df, target_df)
    else:
        SU, UT = tables
    den = source_df["geometry"].area.values
    if allocate_total:
        den = SU.sum(axis=1)
    den = den + (den == 0)
    weights = np.dot(np.diag(1 / den), SU)  # row standardize source-union table

    dfs = []
    extensive = []
    if extensive_variables:
        for variable in extensive_variables:
            vals = _nan_check(source_df, variable)
            estimates = np.dot(np.diag(vals), weights)
            estimates = np.dot(estimates, UT)
            estimates = estimates.sum(axis=0)
            extensive.append(estimates)
        extensive = np.array(extensive)
        extensive = pd.DataFrame(extensive.T, columns=extensive_variables)

    ST = np.dot(SU, UT)  # source-target area table
    area = ST.sum(axis=0)
    den = np.diag(1.0 / (area + (area == 0)))
    weights = np.dot(ST, den)  # column standardize for intensive variables

    intensive = []
    if intensive_variables:
        for variable in intensive_variables:
            vals = _nan_check(source_df, variable)
            vals.shape = (len(vals), 1)
            est = (vals * weights).sum(axis=0)
            intensive.append(est)
        intensive = np.array(intensive)
        intensive = pd.DataFrame(intensive.T, columns=intensive_variables)

    if extensive_variables:
        dfs.append(extensive)
    if intensive_variables:
        dfs.append(intensive)

    # concatenate along columns so extensive and intensive variables sit
    # side by side, one row per target geometry
    df = pd.concat(dfs, axis=1)
    df["geometry"] = target_df["geometry"]
    df = gpd.GeoDataFrame(df)
    return df
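
# A usage sketch for ``area_interpolate``, continuing the toy layers from
# above. Precomputed tables can be passed in and reused across calls, which
# avoids repeating the (expensive) overlay step when interpolating several
# variable sets between the same pair of layers:
#
#   >>> tables = area_tables(src, tgt)
#   >>> est = area_interpolate(src, tgt, extensive_variables=["pop"], tables=tables)
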
def area_tables_raster(
    source_df, target_df, raster_path, codes=[21, 22, 23, 24], force_crs_match=True
):
    """
    Construct area allocation and source-target correspondence tables
    according to 'populated' areas identified in a raster.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame
        geodataframe with geometry column of polygon type
    target_df : geopandas.GeoDataFrame
        geodataframe with geometry column of polygon type
    raster_path : str
        the path to the associated raster image
    codes : list
        list of integer code values that should be considered as 'populated'.
        Since this draws inspiration from the National Land Cover Database
        (NLCD), the default is 21 (Developed, Open Space), 22 (Developed,
        Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed,
        High Intensity). The description of each code can be found here:
        https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        Only taken into consideration for raster-based harmonization.
    force_crs_match : bool (default is True)
        whether the Coordinate Reference System (CRS) of the polygon will be
        reprojected to the CRS of the raster file. It is recommended to leave
        this argument as True.

    Returns
    -------
    tables : tuple (optional)
        two 2-D numpy arrays
        SU: area of intersection of source geometry i with union geometry j
        UT: binary mapping of union geometry j to target geometry t

    Notes
    -----
    The assumption is both dataframes have the same coordinate reference system.

    Union geometry is a geometry formed by the intersection of a source
    geometry and a target geometry.

    SU maps source geometry to union geometry, UT maps union geometry to
    target geometry.
    """
    if not _check_crs(source_df, target_df):
        return None
    n_s = source_df.shape[0]
    n_t = target_df.shape[0]
    _left = np.arange(n_s)
    _right = np.arange(n_t)
    source_df.loc[:, "_left"] = _left  # create temporary index for union
    target_df.loc[:, "_right"] = _right  # create temporary index for union
    res_union_pre = gpd.overlay(source_df, target_df, how="union")

    # establish a CRS for the generated union
    warnings.warn(
        "The CRS for the generated union will be set to be the same as source_df."
    )
    res_union_pre.crs = source_df.crs

    # append per-code pixel counts from the raster to each union polygon
    res_union = fast_append_profile_in_gdf(
        res_union_pre, raster_path, force_crs_match=force_crs_match
    )

    str_codes = [str(i) for i in codes]
    str_list = ["Type_" + i for i in str_codes]

    # extract the list of code names that actually appear in the appended dataset
    str_list_ok = [col for col in res_union.columns if col in str_list]

    res_union["Populated_Pixels"] = res_union[str_list_ok].sum(axis=1)

    n_u, _ = res_union.shape
    SU = np.zeros(
        (n_s, n_u)
    )  # holds populated-pixel weight of intersection of source geom with union geom
    UT = np.zeros((n_u, n_t))  # binary table mapping union geom to target geom

    for index, row in res_union.iterrows():
        # only union polygons that intersect both a source and a target geometry matter
        if not np.isnan(row["_left"]) and not np.isnan(row["_right"]):
            s_id = int(row["_left"])
            t_id = int(row["_right"])
            SU[s_id, index] = row["Populated_Pixels"]
            UT[index, t_id] = 1
    source_df.drop(["_left"], axis=1, inplace=True)
    target_df.drop(["_right"], axis=1, inplace=True)
    return SU, UT
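
# A usage sketch for ``area_tables_raster``, continuing the toy layers from
# above. The raster path is hypothetical; any NLCD-style land-cover GeoTIFF
# would do. The resulting SU table weights intersections by their count of
# 'populated' pixels rather than by raw area:
#
#   >>> SU, UT = area_tables_raster(src, tgt, raster_path="nlcd_2011.tif")
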