"""
Spatial based Segregation Metrics
"""
__author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>"
import numpy as np
import pandas as pd
import geopandas as gpd
import warnings
import libpysal
from libpysal.weights import Queen, Kernel, lag_spatial
from libpysal.weights.util import fill_diagonal
from numpy import inf
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances, haversine_distances
from scipy.ndimage.interpolation import shift
from scipy.sparse.csgraph import floyd_warshall
from scipy.sparse import csr_matrix
from segregation.aspatial.aspatial_indexes import _dissim, MinMax
from segregation.aspatial.multigroup_aspatial_indexes import MultiInformationTheory, MultiDivergence
from segregation.network import calc_access
from libpysal.weights.util import attach_islands
from segregation.util.util import _dep_message, DeprecationHelper, _nan_handle
# Including old and new api in __all__ so users can use both
# (snake_case names like 'Spatial_Prox_Prof' are deprecated aliases of the
# CamelCase classes; the aliases are created at the end of this script).
__all__ = [
'Spatial_Prox_Prof',
'SpatialProxProf',
'Spatial_Dissim',
'SpatialDissim',
'Boundary_Spatial_Dissim',
'BoundarySpatialDissim',
'Perimeter_Area_Ratio_Spatial_Dissim',
'PerimeterAreaRatioSpatialDissim',
'SpatialMinMax',
'Distance_Decay_Isolation',
'DistanceDecayIsolation',
'Distance_Decay_Exposure',
'DistanceDecayExposure',
'Spatial_Proximity',
'SpatialProximity',
'Absolute_Clustering',
'AbsoluteClustering',
'Relative_Clustering',
'RelativeClustering',
'Delta',
'Absolute_Concentration',
'AbsoluteConcentration',
'Relative_Concentration',
'RelativeConcentration',
'Absolute_Centralization',
'AbsoluteCentralization',
'Relative_Centralization',
'RelativeCentralization',
'SpatialInformationTheory',
'SpatialDivergence',
'compute_segregation_profile'
]
# The Deprecation calls of the classes are located in the end of this script #
# suppress numpy divide by zero warnings because it occurs a lot during the
# calculation of many indices
np.seterr(divide='ignore', invalid='ignore')
def _build_local_environment(data, groups, w):
    """Convert observations into spatially-weighted sums.

    Parameters
    ----------
    data : DataFrame
        dataframe with local observations
    groups : list of str
        names of the columns in ``data`` holding the group populations
    w : libpysal.weights object
        weights matrix defining the local environment

    Returns
    -------
    DataFrame
        Spatialized data
    """
    # Include each unit itself in its own local environment.
    w = fill_diagonal(w)
    # Spatial lag of every group column -> weighted sum over the neighborhood.
    return pd.DataFrame({group: lag_spatial(w, data[group]) for group in groups})
def _return_length_weighted_w(data):
    """
    Returns a PySAL weights object in which the weights are the lengths of
    the common boundary of two areal units that share a border.
    Author: Levi Wolf <levi.john.wolf@gmail.com>.
    Thank you, Levi!

    Parameters
    ----------
    data : a geopandas DataFrame with a 'geometry' column.

    Notes
    -----
    Currently it's not making any projection.
    """
    w = libpysal.weights.Rook.from_dataframe(
        data, ids=data.index.tolist(), geom_col=data._geometry_column_name)

    if len(w.islands) > 0:
        # Bug fix: the original called the `warnings` module itself
        # (`warnings(...)`), which raises TypeError; use warnings.warn.
        warnings.warn('There are some islands in the GeoDataFrame.')

        # Attach each island to its single nearest neighbor so every unit
        # has at least one link.
        w_aux = libpysal.weights.KNN.from_dataframe(
            data,
            ids=data.index.tolist(),
            geom_col=data._geometry_column_name,
            k=1)
        w = attach_islands(w, w_aux)

    adjlist = w.to_adjlist()
    # Islands get a self-link with zero weight so they survive the rebuild.
    islands = pd.DataFrame.from_records([{
        'focal': island,
        'neighbor': island,
        'weight': 0
    } for island in w.islands])
    merged = adjlist.merge(
        data.geometry.to_frame('geometry'),
        left_on='focal',
        right_index=True,
        how='left').merge(
            data.geometry.to_frame('geometry'),
            left_on='neighbor',
            right_index=True,
            how='left',
            suffixes=("_focal", "_neighbor"))

    # Transforming from pandas to geopandas
    merged = gpd.GeoDataFrame(merged, geometry='geometry_focal')
    merged['geometry_neighbor'] = gpd.GeoSeries(merged.geometry_neighbor)

    # Getting the shared boundaries
    merged['shared_boundary'] = merged.geometry_focal.intersection(
        merged.set_geometry('geometry_neighbor'))

    # Putting it back to a matrix
    merged['weight'] = merged.set_geometry('shared_boundary').length
    merged_with_islands = pd.concat((merged, islands))
    length_weighted_w = libpysal.weights.W.from_adjlist(
        merged_with_islands[['focal', 'neighbor', 'weight']])
    for island in w.islands:
        length_weighted_w.neighbors[island] = []
        del length_weighted_w.weights[island]

    length_weighted_w._reset()

    return length_weighted_w
def _spatial_prox_profile(data, group_pop_var, total_pop_var, m=1000):
    """
    Calculation of Spatial Proximity Profile

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    m : int
        a numeric value indicating the number of thresholds to be used. Default value is 1000.
        A large value of m creates a smoother-looking graph and a more precise spatial proximity profile value but slows down the calculation speed.

    Returns
    ----------
    statistic : float
        Spatial Proximity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Hong, Seong-Yun, and Yukio Sadahiro. "Measuring geographic segregation: a graph-based approach." Journal of Geographical Systems 16.2 (2014): 211-231.

    Reference: :cite:`hong2014measuring`.
    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(m) is not int):
        # Bug fix: the original message said 'm must be a string.' although
        # the check is for int.
        raise TypeError('m must be an int.')

    if (m < 2):
        raise ValueError('m must be greater than 1.')

    if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if ((group_pop_var not in data.columns)
            or (total_pop_var not in data.columns)):
        raise ValueError(
            'group_pop_var and total_pop_var must be variables of data')

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    if any(data.total_pop_var < data.group_pop_var):
        raise ValueError(
            'Group of interest population must equal or lower than the total population of the units.'
        )

    # Create the shortest distance path between two pair of units using Shimbel matrix.
    # This step was well discussed in https://github.com/pysal/segregation/issues/5.
    w_libpysal = Queen.from_dataframe(data)
    graph = csr_matrix(w_libpysal.full()[0])
    delta = floyd_warshall(csgraph=graph, directed=False)

    def calculate_etat(t):
        """Connectivity statistic eta_t for the units whose group share >= t."""
        g_t_i = np.where(data.group_pop_var / data.total_pop_var >= t, True,
                         False)
        k = g_t_i.sum()

        # i and j only varies in the units subset within the threshold in eta_t of Hong (2014).
        sub_delta_ij = delta[g_t_i, :][:, g_t_i]

        den = sub_delta_ij.sum()
        eta_t = (k**2 - k) / den
        return eta_t

    grid = np.linspace(0, 1, m)
    aux = np.array(list(map(calculate_etat, grid)))
    # Thresholds with no (or a single) qualifying unit yield inf/nan; treat as 0.
    aux[aux == inf] = 0
    aux[aux == -inf] = 0
    # Bug fix: np.nan_to_num(aux, 0) passed 0 positionally to `copy=`, not
    # as the fill value; the default already replaces nan with 0.
    curve = np.nan_to_num(aux)

    threshold = data.group_pop_var.sum() / data.total_pop_var.sum()
    SPP = ((threshold - ((curve[grid < threshold]).sum() / m -
                         (curve[grid >= threshold]).sum() / m)) /
           (1 - threshold))

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return SPP, grid, curve, core_data
class SpatialProxProf:
    """
    Calculation of Spatial Proximity Profile

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    m : int
        a numeric value indicating the number of thresholds to be used. Default value is 1000.
        A large value of m creates a smoother-looking graph and a more precise spatial proximity profile value but slows down the calculation speed.

    Attributes
    ----------
    statistic : float
        Spatial Proximity Profile Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we will calculate the spatial proximity profile (SPP) for the Riverside County using the census tract data of 2010.
    The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.

    Firstly, we need to perform some import the modules and the respective function.

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import SpatialProxProf

    Secondly, we need to read the data:

    >>> # This example uses all census data that the user must provide your own copy of the external database.
    >>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    >>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Then, we filter only for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Then, we read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
    Later, we extract only the columns that will be used.

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    >>> spat_prox_index = SpatialProxProf(gdf, 'nhblk10', 'pop10')
    >>> spat_prox_index.statistic
    0.11217269612149207

    You can plot the profile curve with the plot method.

    >>> spat_prox_index.plot()

    Notes
    -----
    Based on Hong, Seong-Yun, and Yukio Sadahiro. "Measuring geographic segregation: a graph-based approach." Journal of Geographical Systems 16.2 (2014): 211-231.

    Reference: :cite:`hong2014measuring`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, m=1000):

        # Drop rows with missing values in the columns used by the index.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])

        aux = _spatial_prox_profile(data, group_pop_var, total_pop_var, m)

        self.statistic = aux[0]
        self.grid = aux[1]
        self.curve = aux[2]
        self.core_data = aux[3]
        self._function = _spatial_prox_profile

    def plot(self):
        """
        Plot the Spatial Proximity Profile
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            # Bug fix: the original only warned and then hit a NameError on
            # `plt` below; warn and re-raise the ImportError instead.
            warnings.warn('This method relies on importing `matplotlib`')
            raise
        graph = plt.scatter(self.grid, self.curve, s=0.1)
        return graph
def _spatial_dissim(data,
                    group_pop_var,
                    total_pop_var,
                    w=None,
                    standardize=False):
    """
    Calculation of Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    w : W
        A PySAL weights object. If not provided, Queen contiguity matrix is used.
    standardize : boolean
        A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
        For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
        works by default with row standardization.

    Returns
    ----------
    statistic : float
        Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Morrill, R. L. (1991) "On the Measure of Geographic Segregation". Geography Research Forum.

    Reference: :cite:`morrill1991measure`.
    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')

    if w is None:
        w_object = Queen.from_dataframe(data)
    else:
        w_object = w

    if (not issubclass(type(w_object), libpysal.weights.W)):
        raise TypeError('w is not a PySAL weights object')

    # Aspatial dissimilarity, from which the spatial interaction term is subtracted.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # If a unit has zero population, the group of interest frequency is zero
    pi = np.where(t == 0, 0, x / t)

    cij = w_object.full()[0]
    if standardize:
        # Row-standardize so each row of cij sums to one.
        cij = cij / cij.sum(axis=1).reshape((cij.shape[0], 1))

    # Inspired in (second solution): https://stackoverflow.com/questions/22720864/efficiently-calculating-a-euclidean-distance-matrix-using-numpy
    # Distance Matrix of absolute differences in group proportions
    abs_dist = abs(pi[..., np.newaxis] - pi)

    num = np.multiply(abs_dist, cij).sum()
    den = cij.sum()
    SD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return SD, core_data
class SpatialDissim:
    """
    Calculation of Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    w : W
        A PySAL weights object. If not provided, Queen contiguity matrix is used.
    standardize : boolean
        A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
        For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
        works by default with row standardization.

    Attributes
    ----------
    statistic : float
        Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we will calculate the degree of spatial dissimilarity (D) for the Riverside County using the census tract data of 2010.
    The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset. The neighborhood contiguity matrix is used.

    Firstly, we need to perform some import the modules and the respective function.

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import SpatialDissim

    Secondly, we need to read the data:

    >>> # This example uses all census data that the user must provide your own copy of the external database.
    >>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    >>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Then, we filter only for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Then, we read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
    Later, we extract only the columns that will be used.

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> spatial_dissim_index = SpatialDissim(gdf, 'nhblk10', 'pop10')
    >>> spatial_dissim_index.statistic
    0.2864885055405311

    To use different neighborhood matrices:

    >>> from libpysal.weights import Rook, KNN

    Assuming K-nearest neighbors with k = 4

    >>> knn = KNN.from_dataframe(gdf, k=4)
    >>> spatial_dissim_index = SpatialDissim(gdf, 'nhblk10', 'pop10', w = knn)
    >>> spatial_dissim_index.statistic
    0.28544347200877285

    Assuming Rook contiguity neighborhood

    >>> roo = Rook.from_dataframe(gdf)
    >>> spatial_dissim_index = SpatialDissim(gdf, 'nhblk10', 'pop10', w = roo)
    >>> spatial_dissim_index.statistic
    0.2866269198707091

    Notes
    -----
    Based on Morrill, R. L. (1991) "On the Measure of Geographic Segregation". Geography Research Forum.

    Reference: :cite:`morrill1991measure`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 w=None,
                 standardize=False):

        # Drop rows with missing values in the columns used by the index.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])

        aux = _spatial_dissim(data, group_pop_var, total_pop_var, w,
                              standardize)

        self.statistic = aux[0]
        self.core_data = aux[1]
        self._function = _spatial_dissim
def _boundary_spatial_dissim(data,
                             group_pop_var,
                             total_pop_var,
                             standardize=False):
    """
    Calculation of Boundary Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    standardize : boolean
        A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
        For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
        works by default without row standardization. That is, directly with border length.

    Returns
    ----------
    statistic : float
        Boundary Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    The formula is based on Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.

    Original paper by Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    References: :cite:`hong2014implementing` and :cite:`wong1993spatial`.
    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')

    # Aspatial dissimilarity, from which the boundary interaction term is subtracted.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0, data.group_pop_var /
                    data.total_pop_var))

    # cij holds shared-boundary lengths between contiguous units.
    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        # Row-standardize so each row of cij sums to one.
        cij = cij / cij.sum(axis=1).reshape((cij.shape[0], 1))

    # manhattan_distances used to compute absolute distances
    num = np.multiply(manhattan_distances(data[['pi']]), cij).sum()
    den = cij.sum()
    BSD = D - num / den

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return BSD, core_data
class BoundarySpatialDissim:
    """
    Calculation of Boundary Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    standardize : boolean
        A condition for row standardisation of the weights matrices. If True, the values of cij in the formulas gets row standardized.
        For the sake of comparison, the seg R package of Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.
        works by default without row standardization. That is, directly with border length.

    Attributes
    ----------
    statistic : float
        Boundary Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we will calculate the degree of boundary spatial dissimilarity (D) for the Riverside County using the census tract data of 2010.
    The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.

    Firstly, we need to perform some import the modules and the respective function.

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import BoundarySpatialDissim

    Secondly, we need to read the data:

    >>> # This example uses all census data that the user must provide your own copy of the external database.
    >>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    >>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Then, we filter only for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Then, we read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
    Later, we extract only the columns that will be used.

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> boundary_spatial_dissim_index = BoundarySpatialDissim(gdf, 'nhblk10', 'pop10')
    >>> boundary_spatial_dissim_index.statistic
    0.28869903953453163

    Notes
    -----
    The formula is based on Hong, Seong-Yun, David O'Sullivan, and Yukio Sadahiro. "Implementing spatial segregation measures in R." PloS one 9.11 (2014): e113767.

    Original paper by Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    References: :cite:`hong2014implementing` and :cite:`wong1993spatial`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, standardize=False):

        # Drop rows with missing values in the columns used by the index.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])

        aux = _boundary_spatial_dissim(data, group_pop_var, total_pop_var,
                                       standardize)

        self.statistic = aux[0]
        self.core_data = aux[1]
        self._function = _boundary_spatial_dissim
def _perimeter_area_ratio_spatial_dissim(data,
                                         group_pop_var,
                                         total_pop_var,
                                         standardize=True):
    """
    Calculation of Perimeter/Area Ratio Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    standardize : boolean
        A condition for standardisation of the weights matrices.
        If True, the values of cij in the formulas gets standardized and the overall sum is 1.

    Returns
    ----------
    statistic : float
        Perimeter/Area Ratio Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Originally based on Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    However, Tivadar, Mihai. "OasisR: An R Package to Bring Some Order to the World of Segregation Measurement." Journal of Statistical Software 89.1 (2019): 1-39.
    points out that in Wong's original there is an issue with the formula which is an extra division by 2 in the spatial interaction component.
    This function follows the formula present in the first Appendix of Tivadar, Mihai. "OasisR: An R Package to Bring Some Order to the World of Segregation Measurement." Journal of Statistical Software 89.1 (2019): 1-39.

    References: :cite:`wong1993spatial` and :cite:`tivadar2019oasisr`.
    """
    if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if ('geometry' not in data.columns):
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if (type(standardize) is not bool):
        raise TypeError('std is not a boolean object')

    # Aspatial dissimilarity, from which the spatial interaction term is subtracted.
    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    # If a unit has zero population, the group of interest frequency is zero
    data = data.assign(
        pi=np.where(data.total_pop_var == 0, 0, data.group_pop_var /
                    data.total_pop_var))

    # cij holds shared-boundary lengths between contiguous units.
    cij = _return_length_weighted_w(data).full()[0]
    if standardize:
        # Standardize so the overall sum of cij is 1.
        cij = cij / cij.sum()

    # Perimeter/area ratio of every unit, computed once and reused below
    # (the original recomputed peri/ai four times).
    pa_ratio = np.asarray(data.length / data.area)

    # Pairwise sums: aux_sum[i, j] = p_i/a_i + p_j/a_j.
    aux_sum = pa_ratio + pa_ratio.reshape((len(pa_ratio), 1))

    max_pa = pa_ratio.max()

    # manhattan_distances used to compute absolute distances of group shares
    num = np.multiply(np.multiply(manhattan_distances(data[['pi']]), cij),
                      aux_sum).sum()
    den = 2 * max_pa
    PARD = D - (num / den)

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return PARD, core_data
class PerimeterAreaRatioSpatialDissim:
    """
    Calculation of Perimeter/Area Ratio Spatial Dissimilarity index

    Parameters
    ----------
    data : a geopandas DataFrame with a geometry column.
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    standardize : boolean
        A condition for standardisation of the weights matrices.
        If True, the values of cij in the formulas gets standardized and the overall sum is 1.

    Attributes
    ----------
    statistic : float
        Perimeter/Area Ratio Spatial Dissimilarity Index
    core_data : a geopandas DataFrame
        A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we will calculate the degree of perimeter/area ratio spatial dissimilarity (PARD) for the Riverside County using the census tract data of 2010.
    The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.

    Firstly, we need to perform some import the modules and the respective function.

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import PerimeterAreaRatioSpatialDissim

    Secondly, we need to read the data:

    >>> # This example uses all census data that the user must provide your own copy of the external database.
    >>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    >>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Then, we filter only for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Then, we read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
    Later, we extract only the columns that will be used.

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> perimeter_area_ratio_spatial_dissim_index = PerimeterAreaRatioSpatialDissim(gdf, 'nhblk10', 'pop10')
    >>> perimeter_area_ratio_spatial_dissim_index.statistic
    0.31260876347432687

    Notes
    -----
    Originally based on Wong, David WS. "Spatial indices of segregation." Urban studies 30.3 (1993): 559-572.

    However, Tivadar, Mihai. "OasisR: An R Package to Bring Some Order to the World of Segregation Measurement." Journal of Statistical Software 89.1 (2019): 1-39.
    points out that in Wong's original there is an issue with the formula which is an extra division by 2 in the spatial interaction component.
    This function follows the formula present in the first Appendix of Tivadar, Mihai. "OasisR: An R Package to Bring Some Order to the World of Segregation Measurement." Journal of Statistical Software 89.1 (2019): 1-39.

    References: :cite:`wong1993spatial` and :cite:`tivadar2019oasisr`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, standardize=True):

        # Drop rows with missing values in the columns used by the index.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])

        aux = _perimeter_area_ratio_spatial_dissim(data, group_pop_var,
                                                   total_pop_var, standardize)

        self.statistic = aux[0]
        self.core_data = aux[1]
        self._function = _perimeter_area_ratio_spatial_dissim
class SpatialMinMax(MinMax):
    """Spatial MinMax Index.

    This class calculates the spatial version of the MinMax
    index. The data are "spatialized" by converting each observation
    to a "local environment" by creating a weighted sum of the focal unit with
    its neighboring observations, where the neighborhood is defined by a
    libpysal weights matrix or a pandana Network instance.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        geodataframe with the group and total population columns
    group_pop_var : string
        The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
        The name of variable in data that contains the total population of the unit
    w : libpysal.W
        distance-based PySAL spatial weights matrix instance
    network : pandana.Network
        pandana.Network instance. This is likely created with `get_osm_network`
        or via helper functions from OSMnet or UrbanAccess.
    distance : int
        maximum distance to consider `accessible` (the default is 2000).
    decay : str
        decay type pandana should use "linear", "exp", or "flat"
        (which means no decay). The default is "linear".
    precompute: bool
        Whether the pandana.Network instance should precompute the range
        queries. This is true by default, but if you plan to calculate several
        indices using the same network, then you can set this
        parameter to `False` to avoid precomputing repeatedly inside the
        function

    Attributes
    ----------
    statistic : float
        SpatialMinMax Index
    core_data : a pandas DataFrame
        A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on O'Sullivan & Wong (2007). A Surface-Based Approach to Measuring Spatial Segregation.
    Geographical Analysis 39 (2). https://doi.org/10.1111/j.1538-4632.2007.00699.x

    Reference: :cite:`osullivanwong2007surface`.

    We'd like to thank @AnttiHaerkoenen for this contribution!
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 network=None,
                 w=None,
                 decay='linear',
                 distance=2000,
                 precompute=True):

        data = data.rename(columns={group_pop_var: 'group_pop_var',
                                    total_pop_var: 'total_pop_var'})
        # Complement of the group of interest.
        data['group_2_pop_var'] = data['total_pop_var'] - data['group_pop_var']
        groups = ['group_pop_var', 'group_2_pop_var']

        if w is not None and network is not None:
            # Bug fix: the original executed `raise ("...")`, which raises a
            # TypeError (exceptions must derive from BaseException).
            raise ValueError(
                "must pass either a pandana network or a pysal weights object"
                " but not both")

        if w is None and network is None:
            # Default to a distance-based kernel on the unit centroids.
            points = [(p.x, p.y) for p in data.centroid]
            w = Kernel(points)

        if network:
            df = calc_access(data,
                             variables=groups,
                             network=network,
                             distance=distance,
                             decay=decay,
                             precompute=precompute)
            # NOTE(review): calc_access prefixes columns with "acc_", yet the
            # totals below read the unprefixed names — verify against calc_access.
            groups = ["acc_" + group for group in groups]
        else:
            df = _build_local_environment(data, groups, w)
        df['resulting_total'] = df['group_pop_var'] + df['group_2_pop_var']
        super().__init__(df, 'group_pop_var', 'resulting_total')
def _distance_decay_isolation(data,
                              group_pop_var,
                              total_pop_var,
                              alpha=0.6,
                              beta=0.5,
                              metric='euclidean'):
    """
    Calculation of Distance Decay Isolation index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Returns
    ----------
    statistic : float
                Distance Decay Isolation Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    It may be interpreted as the probability that the next person a group member meets anywhere in space is from the same group.

    Based on Morgan, Barrie S. "A distance-decay based interaction index to measure residential segregation." Area (1983): 211-217.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`morgan1983distance`.
    """
    if metric not in ('euclidean', 'haversine'):
        # message grammar fixed ("must one of" -> "must be one of")
        raise ValueError('metric must be one of \'euclidean\', \'haversine\'')

    # isinstance is the idiomatic type check (the original compared
    # str(type(data)) against a hard-coded class-repr string).
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError(
            'group_pop_var and total_pop_var must be variables of data')

    # The checks allow zero, so the messages now say "non-negative"
    # (the original messages claimed "greater than zero").
    if alpha < 0:
        raise ValueError('alpha must be non-negative.')
    if beta < 0:
        raise ValueError('beta must be non-negative.')

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    if any(t < x):
        raise ValueError(
            'Group of interest population must equal or lower than the total population of the units.'
        )

    X = x.sum()

    # Centroid coordinates, latitude first (required by haversine_distances).
    coords = pd.DataFrame({
        'c_lats': np.array(data.centroid.y),
        'c_lons': np.array(data.centroid.x)
    })
    if metric == 'euclidean':
        dist = euclidean_distances(coords)
    else:  # 'haversine' -- only remaining possibility after validation
        dist = haversine_distances(coords)
    c = np.exp(-dist)

    if c.sum() < 10 ** (-15):
        raise ValueError('It not possible to determine accurately the exponential of the negative distances. This is probably due to the large magnitude of the centroids numbers. It is recommended to reproject the geopandas DataFrame. Also, if this is a not lat-long CRS, it is recommended to set metric to \'haversine\'')

    # Within-unit proximity: exp(-(alpha * area_i) ** beta) on the diagonal.
    np.fill_diagonal(c, val=np.exp(-(alpha * data.area)**(beta)))

    # NOTE(review): the division broadcasts along the last axis, i.e. entry
    # (i, j) is divided by unit j's weighted row total rather than unit i's.
    # This reproduces the original computation; confirm the intended
    # normalization axis against Morgan (1983).
    Pij = np.multiply(c, t) / np.sum(np.multiply(c, t), axis=1)

    # DDxPx = sum_i (x_i / X) * sum_j Pij * (x_j / t_j)
    DDxPx = (np.array(x / X) *
             np.nansum(np.multiply(Pij, np.array(x / t)), axis=1)).sum()

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return DDxPx, core_data
class DistanceDecayIsolation:
    """
    Calculation of Distance Decay Isolation index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Attributes
    ----------
    statistic : float
                Distance Decay Isolation Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we calculate the distance decay isolation index (DDxPx)
    for Riverside County using 2010 census tract data. The group of interest
    is non-hispanic black people, the variable nhblk10 in the dataset.

    First, import the modules and read the data. This example uses the LTDB
    census database, of which the user must provide their own copy; a
    step-by-step download guide is available at
    https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    After extracting LTDB_Std_All_fullcount.zip, the filepath might look like:

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import DistanceDecayIsolation
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Filter for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    Harmonize the key data types so the merge works, then keep only the
    columns that will be used:

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> spatial_isolation_index = DistanceDecayIsolation(gdf, 'nhblk10', 'pop10')
    >>> spatial_isolation_index.statistic
    0.07214112078134231

    Notes
    -----
    It may be interpreted as the probability that the next person a group member meets anywhere in space is from the same group.

    Based on Morgan, Barrie S. "A distance-decay based interaction index to measure residential segregation." Area (1983): 211-217.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`morgan1983distance`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 alpha=0.6,
                 beta=0.5,
                 metric='euclidean'):
        # Keep only the columns the estimator needs and drop rows with NaNs.
        wanted = [group_pop_var, total_pop_var, data._geometry_column_name]
        cleaned = _nan_handle(data[wanted])
        # Delegate the actual computation to the module-level helper.
        self.statistic, self.core_data = _distance_decay_isolation(
            cleaned, group_pop_var, total_pop_var, alpha, beta, metric)
        self._function = _distance_decay_isolation
def _distance_decay_exposure(data,
                             group_pop_var,
                             total_pop_var,
                             alpha=0.6,
                             beta=0.5,
                             metric='euclidean'):
    """
    Calculation of Distance Decay Exposure index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Returns
    ----------
    statistic : float
                Distance Decay Exposure Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    It may be interpreted as the probability that the next person a group member meets anywhere in space is from the other group.

    Based on Morgan, Barrie S. "A distance-decay based interaction index to measure residential segregation." Area (1983): 211-217.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`morgan1983distance`.
    """
    if metric not in ('euclidean', 'haversine'):
        # message grammar fixed ("must one of" -> "must be one of")
        raise ValueError('metric must be one of \'euclidean\', \'haversine\'')

    # isinstance is the idiomatic type check (the original compared
    # str(type(data)) against a hard-coded class-repr string).
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError(
            'group_pop_var and total_pop_var must be variables of data')

    # The checks allow zero, so the messages now say "non-negative"
    # (the original messages claimed "greater than zero").
    if alpha < 0:
        raise ValueError('alpha must be non-negative.')
    if beta < 0:
        raise ValueError('beta must be non-negative.')

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    if any(t < x):
        raise ValueError(
            'Group of interest population must equal or lower than the total population of the units.'
        )

    # y is the complement group (everyone not in the group of interest).
    y = t - x
    X = x.sum()

    # Centroid coordinates, latitude first (required by haversine_distances).
    coords = pd.DataFrame({
        'c_lats': np.array(data.centroid.y),
        'c_lons': np.array(data.centroid.x)
    })
    if metric == 'euclidean':
        dist = euclidean_distances(coords)
    else:  # 'haversine' -- only remaining possibility after validation
        dist = haversine_distances(coords)
    c = np.exp(-dist)

    if c.sum() < 10 ** (-15):
        raise ValueError('It not possible to determine accurately the exponential of the negative distances. This is probably due to the large magnitude of the centroids numbers. It is recommended to reproject the geopandas DataFrame. Also, if this is a not lat-long CRS, it is recommended to set metric to \'haversine\'')

    # Within-unit proximity: exp(-(alpha * area_i) ** beta) on the diagonal.
    np.fill_diagonal(c, val=np.exp(-(alpha * data.area)**(beta)))

    # NOTE(review): the division broadcasts along the last axis, i.e. entry
    # (i, j) is divided by unit j's weighted row total rather than unit i's.
    # This reproduces the original computation; confirm the intended
    # normalization axis against Morgan (1983).
    Pij = np.multiply(c, t) / np.sum(np.multiply(c, t), axis=1)

    # DDxPy = sum_i (x_i / X) * sum_j Pij * (y_j / t_j)
    DDxPy = (x / X * np.nansum(np.multiply(Pij, y / t), axis=1)).sum()

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return DDxPy, core_data
class DistanceDecayExposure:
    """
    Calculation of Distance Decay Exposure index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Attributes
    ----------
    statistic : float
                Distance Decay Exposure Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we calculate the distance decay exposure index (DDxPy)
    for Riverside County using 2010 census tract data. The group of interest
    is non-hispanic black people, the variable nhblk10 in the dataset.

    First, import the modules and read the data. This example uses the LTDB
    census database, of which the user must provide their own copy; a
    step-by-step download guide is available at
    https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    After extracting LTDB_Std_All_fullcount.zip, the filepath might look like:

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import DistanceDecayExposure
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Filter for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    Harmonize the key data types so the merge works, then keep only the
    columns that will be used:

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> spatial_exposure_index = DistanceDecayExposure(gdf, 'nhblk10', 'pop10')
    >>> spatial_exposure_index.statistic
    0.9605053172501217

    Notes
    -----
    It may be interpreted as the probability that the next person a group member meets anywhere in space is from the other group.

    Based on Morgan, Barrie S. "A distance-decay based interaction index to measure residential segregation." Area (1983): 211-217.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`morgan1983distance`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 alpha=0.6,
                 beta=0.5,
                 metric='euclidean'):
        # Keep only the columns the estimator needs and drop rows with NaNs.
        wanted = [group_pop_var, total_pop_var, data._geometry_column_name]
        cleaned = _nan_handle(data[wanted])
        # Delegate the actual computation to the module-level helper.
        self.statistic, self.core_data = _distance_decay_exposure(
            cleaned, group_pop_var, total_pop_var, alpha, beta, metric)
        self._function = _distance_decay_exposure
def _spatial_proximity(data,
                       group_pop_var,
                       total_pop_var,
                       alpha=0.6,
                       beta=0.5,
                       metric='euclidean'):
    """
    Calculation of Spatial Proximity index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Returns
    ----------
    statistic : float
                Spatial Proximity Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`massey1988dimensions`.
    """
    if metric not in ('euclidean', 'haversine'):
        # message grammar fixed ("must one of" -> "must be one of")
        raise ValueError('metric must be one of \'euclidean\', \'haversine\'')

    # isinstance is the idiomatic type check (the original compared
    # str(type(data)) against a hard-coded class-repr string).
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError(
            'group_pop_var and total_pop_var must be variables of data')

    # The checks allow zero, so the messages now say "non-negative"
    # (the original messages claimed "greater than zero").
    if alpha < 0:
        raise ValueError('alpha must be non-negative.')
    if beta < 0:
        raise ValueError('beta must be non-negative.')

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    if any(data.total_pop_var < data.group_pop_var):
        raise ValueError(
            'Group of interest population must equal or lower than the total population of the units.'
        )

    T = data.total_pop_var.sum()

    # xi: group of interest, yi: complement group, ti: total population.
    data = data.assign(xi=data.group_pop_var,
                       yi=data.total_pop_var - data.group_pop_var,
                       ti=data.total_pop_var)

    X = data.xi.sum()
    Y = data.yi.sum()

    # Centroid coordinates, latitude first (required by haversine_distances).
    coords = pd.DataFrame({
        'c_lats': np.array(data.centroid.y),
        'c_lons': np.array(data.centroid.x)
    })
    if metric == 'euclidean':
        dist = euclidean_distances(coords)
    else:  # 'haversine' -- only remaining possibility after validation
        dist = haversine_distances(coords)
    c = np.exp(-dist)

    if c.sum() < 10 ** (-15):
        raise ValueError('It not possible to determine accurately the exponential of the negative distances. This is probably due to the large magnitude of the centroids numbers. It is recommended to reproject the geopandas DataFrame. Also, if this is a not lat-long CRS, it is recommended to set metric to \'haversine\'')

    # Within-unit proximity: exp(-(alpha * area_i) ** beta) on the diagonal.
    np.fill_diagonal(c, val=np.exp(-(alpha * data.area)**(beta)))

    # Average proximities within and between groups (Massey & Denton 1988).
    Pxx = ((np.array(data.xi) * c).T * np.array(data.xi)).sum() / X**2
    Pyy = ((np.array(data.yi) * c).T * np.array(data.yi)).sum() / Y**2
    Ptt = ((np.array(data.ti) * c).T * np.array(data.ti)).sum() / T**2
    SP = (X * Pxx + Y * Pyy) / (T * Ptt)

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return SP, core_data
class SpatialProximity:
    """
    Calculation of Spatial Proximity index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Attributes
    ----------
    statistic : float
                Spatial Proximity Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we calculate the degree of spatial proximity (SP) for
    Riverside County using 2010 census tract data. The group of interest is
    non-hispanic black people, the variable nhblk10 in the dataset.

    First, import the modules and read the data. This example uses the LTDB
    census database, of which the user must provide their own copy; a
    step-by-step download guide is available at
    https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    After extracting LTDB_Std_All_fullcount.zip, the filepath might look like:

    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import SpatialProximity
    >>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Filter for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]

    Read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    Harmonize the key data types so the merge works, then keep only the
    columns that will be used:

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> spatial_prox_index = SpatialProximity(gdf, 'nhblk10', 'pop10')
    >>> spatial_prox_index.statistic
    1.002191883006537

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 alpha=0.6,
                 beta=0.5,
                 metric='euclidean'):
        # Keep only the columns the estimator needs and drop rows with NaNs.
        wanted = [group_pop_var, total_pop_var, data._geometry_column_name]
        cleaned = _nan_handle(data[wanted])
        # Delegate the actual computation to the module-level helper.
        self.statistic, self.core_data = _spatial_proximity(
            cleaned, group_pop_var, total_pop_var, alpha, beta, metric)
        self._function = _spatial_proximity
def _absolute_clustering(data,
                         group_pop_var,
                         total_pop_var,
                         alpha=0.6,
                         beta=0.5,
                         metric='euclidean'):
    """
    Calculation of Absolute Clustering index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Returns
    ----------
    statistic : float
                Absolute Clustering Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`massey1988dimensions`.
    """
    if metric not in ('euclidean', 'haversine'):
        # message grammar fixed ("must one of" -> "must be one of")
        raise ValueError('metric must be one of \'euclidean\', \'haversine\'')

    # isinstance is the idiomatic type check (the original compared
    # str(type(data)) against a hard-coded class-repr string).
    if not isinstance(data, gpd.GeoDataFrame):
        raise TypeError(
            'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
        )

    if 'geometry' not in data.columns:
        data['geometry'] = data[data._geometry_column_name]
        data = data.drop([data._geometry_column_name], axis=1)
        data = data.set_geometry('geometry')

    if not (isinstance(group_pop_var, str) and isinstance(total_pop_var, str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')

    if (group_pop_var not in data.columns) or (total_pop_var not in data.columns):
        raise ValueError(
            'group_pop_var and total_pop_var must be variables of data')

    # The checks allow zero, so the messages now say "non-negative"
    # (the original messages claimed "greater than zero").
    if alpha < 0:
        raise ValueError('alpha must be non-negative.')
    if beta < 0:
        raise ValueError('beta must be non-negative.')

    data = data.rename(columns={
        group_pop_var: 'group_pop_var',
        total_pop_var: 'total_pop_var'
    })

    if any(data.total_pop_var < data.group_pop_var):
        raise ValueError(
            'Group of interest population must equal or lower than the total population of the units.'
        )

    # xi: group of interest, yi: complement group.
    data = data.assign(xi=data.group_pop_var,
                       yi=data.total_pop_var - data.group_pop_var)

    X = data.xi.sum()
    x = np.array(data.xi)
    t = np.array(data.total_pop_var)
    n = len(data)

    # Centroid coordinates, latitude first (required by haversine_distances).
    coords = pd.DataFrame({
        'c_lats': np.array(data.centroid.y),
        'c_lons': np.array(data.centroid.x)
    })
    if metric == 'euclidean':
        dist = euclidean_distances(coords)
    else:  # 'haversine' -- only remaining possibility after validation
        dist = haversine_distances(coords)
    c = np.exp(-dist)

    if c.sum() < 10 ** (-15):
        raise ValueError('It not possible to determine accurately the exponential of the negative distances. This is probably due to the large magnitude of the centroids numbers. It is recommended to reproject the geopandas DataFrame. Also, if this is a not lat-long CRS, it is recommended to set metric to \'haversine\'')

    # Within-unit proximity: exp(-(alpha * area_i) ** beta) on the diagonal.
    np.fill_diagonal(c, val=np.exp(-(alpha * data.area)**(beta)))

    # ACL (Massey & Denton 1988): observed group-weighted proximity minus its
    # evenness expectation, scaled by the total-population analogue.
    expectation = (X / n**2) * c.sum()
    ACL = (((x / X) * (c * x).sum(axis=1)).sum() - expectation) / \
          (((x / X) * (c * t).sum(axis=1)).sum() - expectation)

    core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]

    return ACL, core_data
class AbsoluteClustering:
    """
    Calculation of Absolute Clustering index

    Parameters
    ----------
    data          : a geopandas DataFrame with a geometry column.
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    alpha         : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
    beta          : float
                    A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
    metric        : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
                    The metric used for the distance between spatial units.
                    If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.

    Attributes
    ----------
    statistic : float
                Absolute Clustering Index
    core_data : a geopandas DataFrame
                A geopandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    In this example, we will calculate the absolute clustering measure (ACL)
    for the Riverside County using the census tract data of 2010.
    The group of interest is non-hispanic black people which is the variable
    nhblk10 in the dataset.

    Firstly, we need to import the modules and read the data:

    >>> # This example uses all census data that the user must provide your own copy of the external database.
    >>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
    >>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
    >>> import pandas as pd
    >>> import geopandas as gpd
    >>> import segregation
    >>> from segregation.spatial import AbsoluteClustering
    >>> filepath = '~/data/std_2010_fullcount.csv'
    >>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")

    Then, we filter only for the desired county (in this case, Riverside County):

    >>> df = census_2010.loc[census_2010.county == "Riverside County"][['trtid10', 'pop10','nhblk10']]

    Then, we read the Riverside map data using geopandas (the county id is 06065):

    >>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
    >>> map_gpd = gpd.read_file(map_url)

    It is necessary to harmonize the data type of the dataset and the
    geopandas in order to work the merging procedure.
    Later, we extract only the columns that will be used.

    >>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
    >>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'trtid10')
    >>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]

    The value is estimated below.

    >>> absolute_clust_index = AbsoluteClustering(gdf, 'nhblk10', 'pop10')
    >>> absolute_clust_index.statistic
    0.20979814508119624

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.

    The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 alpha=0.6,
                 beta=0.5,
                 metric='euclidean'):
        # Keep only the columns the estimator needs and drop rows with NaNs.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])
        # Delegate the actual computation to the module-level helper.
        aux = _absolute_clustering(data, group_pop_var, total_pop_var, alpha,
                                   beta, metric)
        self.statistic = aux[0]
        self.core_data = aux[1]
        self._function = _absolute_clustering
def _relative_clustering(data,
group_pop_var,
total_pop_var,
alpha=0.6,
beta=0.5,
metric='euclidean'):
"""
Calculation of Relative Clustering index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
alpha : float
A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
beta : float
A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
metric : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
The metric used for the distance between spatial units.
If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.
Returns
----------
statistic : float
Relative Clustering Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.
Reference: :cite:`massey1988dimensions`.
"""
if not metric in ['euclidean', 'haversine']:
raise ValueError('metric must one of \'euclidean\', \'haversine\'')
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
if (alpha < 0):
raise ValueError('alpha must be greater than zero.')
if (beta < 0):
raise ValueError('beta must be greater than zero.')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
if any(data.total_pop_var < data.group_pop_var):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
data = data.assign(xi=data.group_pop_var,
yi=data.total_pop_var - data.group_pop_var)
X = data.xi.sum()
Y = data.yi.sum()
c_lons = np.array(data.centroid.x)
c_lats = np.array(data.centroid.y)
if (metric == 'euclidean'):
dist = euclidean_distances(
pd.DataFrame({
'c_lats': c_lats,
'c_lons': c_lons
}))
if (metric == 'haversine'):
dist = haversine_distances(
pd.DataFrame({
'c_lats': c_lats,
'c_lons': c_lons
})) # This needs to be latitude first!
c = np.exp(-dist)
if c.sum() < 10 ** (-15):
raise ValueError('It not possible to determine accurately the exponential of the negative distances. This is probably due to the large magnitude of the centroids numbers. It is recommended to reproject the geopandas DataFrame. Also, if this is a not lat-long CRS, it is recommended to set metric to \'haversine\'')
np.fill_diagonal(c, val = np.exp(-(alpha * data.area)**(beta)))
Pxx = ((np.array(data.xi) * c).T * np.array(data.xi)).sum() / X**2
Pyy = ((np.array(data.yi) * c).T * np.array(data.yi)).sum() / Y**2
RCL = Pxx / Pyy - 1
if np.isnan(RCL):
raise ValueError('It not possible to determine the distance between, at least, one pair of units. This is probably due to the magnitude of the number of the centroids. We recommend to reproject the geopandas DataFrame.')
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
return RCL, core_data
[docs]class RelativeClustering:
"""
Calculation of Relative Clustering index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
alpha : float
A parameter that estimates the extent of the proximity within the same unit. Default value is 0.6
beta : float
A parameter that estimates the extent of the proximity within the same unit. Default value is 0.5
metric : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
The metric used for the distance between spatial units.
If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.
Attributes
----------
statistic : float
Relative Clustering Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Examples
--------
In this example, we will calculate the relative clustering measure (RCL) for the Riverside County using the census tract data of 2010.
The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.
Firstly, we need to perform some import the modules and the respective function.
>>> import pandas as pd
>>> import geopandas as gpd
>>> import segregation
>>> from segregation.spatial import RelativeClustering
Secondly, we need to read the data:
>>> # This example uses all census data that the user must provide your own copy of the external database.
>>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
>>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
>>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
>>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")
Then, we filter only for the desired county (in this case, Riverside County):
>>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]
Then, we read the Riverside map data using geopandas (the county id is 06065):
>>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
>>> map_gpd = gpd.read_file(map_url)
It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
Later, we extract only the columns that will be used.
>>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
>>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
>>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]
The value is estimated below.
>>> relative_clust_index = RelativeClustering(gdf, 'nhblk10', 'pop10')
>>> relative_clust_index.statistic
0.12418089857347714
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
The pairwise distance between unit i and itself is (alpha * area_of_unit_i) ^ beta.
Reference: :cite:`massey1988dimensions`.
"""
[docs] def __init__(self,
data,
group_pop_var,
total_pop_var,
alpha=0.6,
beta=0.5,
metric='euclidean'):
data = _nan_handle(data[[group_pop_var, total_pop_var, data._geometry_column_name]])
aux = _relative_clustering(data, group_pop_var, total_pop_var, alpha,
beta, metric)
self.statistic = aux[0]
self.core_data = aux[1]
self._function = _relative_clustering
def _delta(data, group_pop_var, total_pop_var):
"""
Calculation of Delta index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Returns
----------
statistic : float
Delta Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
x = np.array(data.group_pop_var)
t = np.array(data.total_pop_var)
if any(t < x):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
area = np.array(data.area)
X = x.sum()
A = area.sum()
DEL = 1 / 2 * abs(x / X - area / A).sum()
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
return DEL, core_data
[docs]class Delta:
"""
Calculation of Delta index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Attributes
----------
statistic : float
Delta Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Examples
--------
In this example, we will calculate the delta index (D) for the Riverside County using the census tract data of 2010.
The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.
Firstly, we need to perform some import the modules and the respective function.
>>> import pandas as pd
>>> import geopandas as gpd
>>> import segregation
>>> from segregation.spatial import Delta
Secondly, we need to read the data:
>>> # This example uses all census data that the user must provide your own copy of the external database.
>>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
>>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
>>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
>>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")
Then, we filter only for the desired county (in this case, Riverside County):
>>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]
Then, we read the Riverside map data using geopandas (the county id is 06065):
>>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
>>> map_gpd = gpd.read_file(map_url)
It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
Later, we extract only the columns that will be used.
>>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
>>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
>>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]
The value is estimated below.
>>> delta_index = Delta(gdf, 'nhblk10', 'pop10')
>>> delta_index.statistic
0.8367330649317353
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
[docs] def __init__(self, data, group_pop_var, total_pop_var):
data = _nan_handle(data[[group_pop_var, total_pop_var, data._geometry_column_name]])
aux = _delta(data, group_pop_var, total_pop_var)
self.statistic = aux[0]
self.core_data = aux[1]
self._function = _delta
def _absolute_concentration(data, group_pop_var, total_pop_var):
"""
Calculation of Absolute Concentration index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Returns
----------
statistic : float
Absolute Concentration Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
x = np.array(data.group_pop_var)
t = np.array(data.total_pop_var)
if any(t < x):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
area = np.array(data.area)
X = x.sum()
T = t.sum()
# Create the indexes according to the area ordering
des_ind = (-area).argsort()
asc_ind = area.argsort()
# A discussion about the extraction of n1 and n2 can be found in https://github.com/pysal/segregation/issues/43
n1 = np.where(((np.cumsum(t[asc_ind]) / T) < X / T) == False)[0][0] + 1
n2_aux = np.where(((np.cumsum(t[des_ind]) / T) < X / T) == False)[0][0] + 1
n2 = len(data) - n2_aux
n = data.shape[0]
T1 = t[asc_ind][0:n1].sum()
T2 = t[asc_ind][n2:n].sum()
ACO = 1- ((((x[asc_ind] * area[asc_ind] / X).sum()) - ((t[asc_ind] * area[asc_ind] / T1)[0:n1].sum())) / \
(((t[asc_ind] * area[asc_ind] / T2)[n2:n].sum()) - ((t[asc_ind] * area[asc_ind]/T1)[0:n1].sum())))
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
return ACO, core_data
[docs]class AbsoluteConcentration:
"""
Calculation of Absolute Concentration index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Attributes
----------
statistic : float
Absolute Concentration Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Examples
--------
In this example, we will calculate the absolute concentration index (ACO) for the Riverside County using the census tract data of 2010.
The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.
Firstly, we need to perform some import the modules and the respective function.
>>> import pandas as pd
>>> import geopandas as gpd
>>> import segregation
>>> from segregation.spatial import AbsoluteConcentration
Secondly, we need to read the data:
>>> # This example uses all census data that the user must provide your own copy of the external database.
>>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
>>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
>>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
>>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")
Then, we filter only for the desired county (in this case, Riverside County):
>>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]
Then, we read the Riverside map data using geopandas (the county id is 06065):
>>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
>>> map_gpd = gpd.read_file(map_url)
It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
Later, we extract only the columns that will be used.
>>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
>>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
>>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]
The value is estimated below.
>>> absolute_concentration_index = AbsoluteConcentration(gdf, 'nhblk10', 'pop10')
>>> absolute_concentration_index.statistic
0.9577607171503524
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
[docs] def __init__(self, data, group_pop_var, total_pop_var):
data = _nan_handle(data[[group_pop_var, total_pop_var, data._geometry_column_name]])
aux = _absolute_concentration(data, group_pop_var, total_pop_var)
self.statistic = aux[0]
self.core_data = aux[1]
self._function = _absolute_concentration
def _relative_concentration(data, group_pop_var, total_pop_var):
"""
Calculation of Relative Concentration index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Returns
----------
statistic : float
Relative Concentration Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
x = np.array(data.group_pop_var)
t = np.array(data.total_pop_var)
if any(t < x):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
area = np.array(data.area)
y = t - x
X = x.sum()
Y = y.sum()
T = t.sum()
# Create the indexes according to the area ordering
des_ind = (-area).argsort()
asc_ind = area.argsort()
# A discussion about the extraction of n1 and n2 can be found in https://github.com/pysal/segregation/issues/43
n1 = np.where(((np.cumsum(t[asc_ind]) / T) < X / T) == False)[0][0] + 1
n2_aux = np.where(((np.cumsum(t[des_ind]) / T) < X / T) == False)[0][0] + 1
n2 = len(data) - n2_aux
n = data.shape[0]
T1 = t[asc_ind][0:n1].sum()
T2 = t[asc_ind][n2:n].sum()
RCO = ((((x[asc_ind] * area[asc_ind] / X).sum()) / ((y[asc_ind] * area[asc_ind] / Y).sum())) - 1) / \
((((t[asc_ind] * area[asc_ind])[0:n1].sum() / T1) / ((t[asc_ind] * area[asc_ind])[n2:n].sum() / T2)) - 1)
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
return RCO, core_data
[docs]class RelativeConcentration:
"""
Calculation of Relative Concentration index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
Attributes
----------
statistic : float
Relative Concentration Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
Examples
--------
In this example, we will calculate the relative concentration index (RCO) for the Riverside County using the census tract data of 2010.
The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.
Firstly, we need to perform some import the modules and the respective function.
>>> import pandas as pd
>>> import geopandas as gpd
>>> import segregation
>>> from segregation.spatial import RelativeConcentration
Secondly, we need to read the data:
>>> # This example uses all census data that the user must provide your own copy of the external database.
>>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
>>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
>>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
>>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")
Then, we filter only for the desired county (in this case, Riverside County):
>>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]
Then, we read the Riverside map data using geopandas (the county id is 06065):
>>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
>>> map_gpd = gpd.read_file(map_url)
It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
Later, we extract only the columns that will be used.
>>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
>>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
>>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]
The value is estimated below.
>>> relative_concentration_index = RelativeConcentration(gdf, 'nhblk10', 'pop10')
>>> relative_concentration_index.statistic
0.5204046784837685
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
Reference: :cite:`massey1988dimensions`.
"""
[docs] def __init__(self, data, group_pop_var, total_pop_var):
data = _nan_handle(data[[group_pop_var, total_pop_var, data._geometry_column_name]])
aux = _relative_concentration(data, group_pop_var, total_pop_var)
self.statistic = aux[0]
self.core_data = aux[1]
self._function = _relative_concentration
def _absolute_centralization(data,
group_pop_var,
total_pop_var,
center="mean",
metric='euclidean'):
"""
Calculation of Absolute Centralization index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
center : string, two-dimension values (tuple, list, array) or integer.
This defines what is considered to be the center of the spatial context under study.
If string, this can be set to:
"mean": the center longitude/latitude is the mean of longitudes/latitudes of all units.
"median": the center longitude/latitude is the median of longitudes/latitudes of all units.
"population_weighted_mean": the center longitude/latitude is the mean of longitudes/latitudes of all units weighted by the total population.
"largest_population": the center longitude/latitude is the centroid of the unit with largest total population. If there is a tie in the maximum population, the mean of all coordinates will be taken.
If tuple, list or array, this argument should be the coordinates of the desired center assuming longitude as first value and latitude second value. Therefore, in the form (longitude, latitude), if tuple, or [longitude, latitude] if list or numpy array.
If integer, the center will be the centroid of the polygon from data corresponding to the integer interpreted as index.
For example, if `center = 0` the centroid of the first row of data is used as center, if `center = 1` the second row will be used, and so on.
metric : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
The metric used for the distance between spatial units.
If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.
Returns
----------
statistic : float
Absolute Centralization Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
center_values : list
The center, in the form [longitude, latitude], values used for the calculation of the centralization distances.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
A discussion of defining the center in this function can be found in https://github.com/pysal/segregation/issues/18.
Reference: :cite:`massey1988dimensions`.
"""
if not metric in ['euclidean', 'haversine']:
raise ValueError('metric must one of \'euclidean\', \'haversine\'')
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
x = np.array(data.group_pop_var)
t = np.array(data.total_pop_var)
if any(t < x):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
area = np.array(data.area)
c_lons = np.array(data.centroid.x)
c_lats = np.array(data.centroid.y)
if isinstance(center, str):
if not center in [
'mean', 'median', 'population_weighted_mean',
'largest_population'
]:
raise ValueError(
'The center string must one of \'mean\', \'median\', \'population_weighted_mean\', \'largest_population\''
)
if (center == "mean"):
center_lon = c_lons.mean()
center_lat = c_lats.mean()
if (center == "median"):
center_lon = np.median(c_lons)
center_lat = np.median(c_lats)
if (center == "population_weighted_mean"):
center_lon = np.average(c_lons, weights=t)
center_lat = np.average(c_lats, weights=t)
if (center == "largest_population"):
center_lon = c_lons[np.where(t == t.max())].mean()
center_lat = c_lats[np.where(t == t.max())].mean()
if isinstance(center, tuple) or isinstance(center, list) or isinstance(
center, np.ndarray):
if np.array(center).shape != (2, ):
raise ValueError('The center tuple/list/array must have length 2.')
center_lon = center[0]
center_lat = center[1]
if isinstance(center, int):
if (center > len(data) - 1) or (center < 0):
raise ValueError('The center index must by in the range of data.')
center_lon = data.iloc[[center]].centroid.x.values[0]
center_lat = data.iloc[[center]].centroid.y.values[0]
X = x.sum()
A = area.sum()
dlon = c_lons - center_lon
dlat = c_lats - center_lat
if (metric == 'euclidean'):
center_dist = np.sqrt((dlon)**2 + (dlat)**2)
if (metric == 'haversine'):
center_dist = 2 * np.arcsin(np.sqrt(np.sin(dlat/2)**2 + np.cos(center_lat) * np.cos(c_lats) * np.sin(dlon/2)**2))
if np.isnan(center_dist).sum() > 0:
raise ValueError('It not possible to determine the center distance for, at least, one unit. This is probably due to the magnitude of the number of the centroids. We recommend to reproject the geopandas DataFrame.')
asc_ind = center_dist.argsort()
Xi = np.cumsum(x[asc_ind]) / X
Ai = np.cumsum(area[asc_ind]) / A
ACE = np.nansum(shift(Xi, 1, cval=np.NaN) * Ai) - \
np.nansum(Xi * shift(Ai, 1, cval=np.NaN))
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
center_values = [center_lon, center_lat]
return ACE, core_data, center_values
[docs]class AbsoluteCentralization:
"""
Calculation of Absolute Centralization index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
center : string, two-dimension values (tuple, list, array) or integer.
This defines what is considered to be the center of the spatial context under study.
If string, this can be set to:
"mean": the center longitude/latitude is the mean of longitudes/latitudes of all units.
"median": the center longitude/latitude is the median of longitudes/latitudes of all units.
"population_weighted_mean": the center longitude/latitude is the mean of longitudes/latitudes of all units weighted by the total population.
"largest_population": the center longitude/latitude is the centroid of the unit with largest total population. If there is a tie in the maximum population, the mean of all coordinates will be taken.
If tuple, list or array, this argument should be the coordinates of the desired center assuming longitude as first value and latitude second value. Therefore, in the form (longitude, latitude), if tuple, or [longitude, latitude] if list or numpy array.
If integer, the center will be the centroid of the polygon from data corresponding to the integer interpreted as index.
For example, if `center = 0` the centroid of the first row of data is used as center, if `center = 1` the second row will be used, and so on.
Attributes
----------
statistic : float
Absolute Centralization Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
center_values : list
The center, in the form [longitude, latitude], values used for the calculation of the centralization distances.
Examples
--------
In this example, we will calculate the absolute centralization index (ACE) for the Riverside County using the census tract data of 2010.
The group of interest is non-hispanic black people which is the variable nhblk10 in the dataset.
Firstly, we need to perform some import the modules and the respective function.
>>> import pandas as pd
>>> import geopandas as gpd
>>> import segregation
>>> from segregation.spatial import AbsoluteCentralization
Secondly, we need to read the data:
>>> # This example uses all census data that the user must provide your own copy of the external database.
>>> # A step-by-step procedure for downloading the data can be found here: https://github.com/spatialucr/geosnap/blob/master/examples/01_getting_started.ipynb
>>> # After the user download the LTDB_Std_All_fullcount.zip and extract the files, the filepath might be something like presented below.
>>> filepath = '~/data/LTDB_Std_2010_fullcount.csv'
>>> census_2010 = pd.read_csv(filepath, encoding = "ISO-8859-1", sep = ",")
Then, we filter only for the desired county (in this case, Riverside County):
>>> df = census_2010.loc[census_2010.county == "Riverside County"][['tractid', 'pop10','nhblk10']]
Then, we read the Riverside map data using geopandas (the county id is 06065):
>>> map_url = 'https://raw.githubusercontent.com/renanxcortes/inequality-segregation-supplementary-files/master/Tracts_grouped_by_County/06065.json'
>>> map_gpd = gpd.read_file(map_url)
It is necessary to harmonize the data type of the dataset and the geopandas in order to work the merging procedure.
Later, we extract only the columns that will be used.
>>> map_gpd['INTGEOID10'] = pd.to_numeric(map_gpd["GEOID10"])
>>> gdf_pre = map_gpd.merge(df, left_on = 'INTGEOID10', right_on = 'tractid')
>>> gdf = gdf_pre[['geometry', 'pop10', 'nhblk10']]
The value is estimated below.
>>> absolute_centralization_index = AbsoluteCentralization(gdf, 'nhblk10', 'pop10')
>>> absolute_centralization_index.statistic
0.6416113799795511
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
A discussion of defining the center in this function can be found in https://github.com/pysal/segregation/issues/18.
Reference: :cite:`massey1988dimensions`.
"""
[docs] def __init__(self,
data,
group_pop_var,
total_pop_var,
center="mean",
metric='euclidean'):
data = _nan_handle(data[[group_pop_var, total_pop_var, data._geometry_column_name]])
aux = _absolute_centralization(data, group_pop_var, total_pop_var,
center, metric)
self.statistic = aux[0]
self.core_data = aux[1]
self.center_values = aux[2]
self._function = _absolute_centralization
def _relative_centralization(data,
group_pop_var,
total_pop_var,
center="mean",
metric='euclidean'):
"""
Calculation of Relative Centralization index
Parameters
----------
data : a geopandas DataFrame with a geometry column.
group_pop_var : string
The name of variable in data that contains the population size of the group of interest
total_pop_var : string
The name of variable in data that contains the total population of the unit
center : string, two-dimension values (tuple, list, array) or integer.
This defines what is considered to be the center of the spatial context under study.
If string, this can be set to:
"mean": the center longitude/latitude is the mean of longitudes/latitudes of all units.
"median": the center longitude/latitude is the median of longitudes/latitudes of all units.
"population_weighted_mean": the center longitude/latitude is the mean of longitudes/latitudes of all units weighted by the total population.
"largest_population": the center longitude/latitude is the centroid of the unit with largest total population. If there is a tie in the maximum population, the mean of all coordinates will be taken.
If tuple, list or array, this argument should be the coordinates of the desired center assuming longitude as first value and latitude second value. Therefore, in the form (longitude, latitude), if tuple, or [longitude, latitude] if list or numpy array.
If integer, the center will be the centroid of the polygon from data corresponding to the integer interpreted as index.
For example, if `center = 0` the centroid of the first row of data is used as center, if `center = 1` the second row will be used, and so on.
metric : string. Can be 'euclidean' or 'haversine'. Default is 'euclidean'.
The metric used for the distance between spatial units.
If the projection of the CRS of the geopandas DataFrame field is in degrees, this should be set to 'haversine'.
Returns
----------
statistic : float
Relative Centralization Index
core_data : a geopandas DataFrame
A geopandas DataFrame that contains the columns used to perform the estimate.
center_values : list
The center, in the form [longitude, latitude], values used for the calculation of the centralization distances.
Notes
-----
Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315.
A discussion of defining the center in this function can be found in https://github.com/pysal/segregation/issues/18.
"""
if not metric in ['euclidean', 'haversine']:
raise ValueError('metric must one of \'euclidean\', \'haversine\'')
if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
raise TypeError(
'data is not a GeoDataFrame and, therefore, this index cannot be calculated.'
)
if ('geometry' not in data.columns):
data['geometry'] = data[data._geometry_column_name]
data = data.drop([data._geometry_column_name], axis=1)
data = data.set_geometry('geometry')
if ((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
raise TypeError('group_pop_var and total_pop_var must be strings')
if ((group_pop_var not in data.columns)
or (total_pop_var not in data.columns)):
raise ValueError(
'group_pop_var and total_pop_var must be variables of data')
data = data.rename(columns={
group_pop_var: 'group_pop_var',
total_pop_var: 'total_pop_var'
})
x = np.array(data.group_pop_var)
t = np.array(data.total_pop_var)
if any(t < x):
raise ValueError(
'Group of interest population must equal or lower than the total population of the units.'
)
y = t - x
c_lons = np.array(data.centroid.x)
c_lats = np.array(data.centroid.y)
if isinstance(center, str):
if not center in [
'mean', 'median', 'population_weighted_mean',
'largest_population'
]:
raise ValueError(
'The center string must one of \'mean\', \'median\', \'population_weighted_mean\', \'largest_population\''
)
if (center == "mean"):
center_lon = c_lons.mean()
center_lat = c_lats.mean()
if (center == "median"):
center_lon = np.median(c_lons)
center_lat = np.median(c_lats)
if (center == "population_weighted_mean"):
center_lon = np.average(c_lons, weights=t)
center_lat = np.average(c_lats, weights=t)
if (center == "largest_population"):
center_lon = c_lons[np.where(t == t.max())].mean()
center_lat = c_lats[np.where(t == t.max())].mean()
if isinstance(center, tuple) or isinstance(center, list) or isinstance(
center, np.ndarray):
if np.array(center).shape != (2, ):
raise ValueError('The center tuple/list/array must have length 2.')
center_lon = center[0]
center_lat = center[1]
if isinstance(center, int):
if (center > len(data) - 1) or (center < 0):
raise ValueError('The center index must by in the range of data.')
center_lon = data.iloc[[center]].centroid.x.values[0]
center_lat = data.iloc[[center]].centroid.y.values[0]
X = x.sum()
Y = y.sum()
dlon = c_lons - center_lon
dlat = c_lats - center_lat
if (metric == 'euclidean'):
center_dist = np.sqrt((dlon)**2 + (dlat)**2)
if (metric == 'haversine'):
center_dist = 2 * np.arcsin(
np.sqrt(
np.sin(dlat / 2)**2 +
np.cos(center_lat) * np.cos(c_lats) * np.sin(dlon / 2)**2))
if np.isnan(center_dist).sum() > 0:
raise ValueError('It not possible to determine the center distance for, at least, one unit. This is probably due to the magnitude of the number of the centroids. We recommend to reproject the geopandas DataFrame.')
asc_ind = center_dist.argsort()
Xi = np.cumsum(x[asc_ind]) / X
Yi = np.cumsum(y[asc_ind]) / Y
RCE = np.nansum(shift(Xi, 1, cval=np.NaN) * Yi) - \
np.nansum(Xi * shift(Yi, 1, cval=np.NaN))
core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
center_values = [center_lon, center_lat]
return RCE, core_data, center_values
class RelativeCentralization:
    """Relative Centralization index.

    Compares the spatial distribution of the group of interest around a
    chosen center with that of the complementary population.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        GeoDataFrame with a geometry column.
    group_pop_var : str
        Name of the column holding the population of the group of interest.
    total_pop_var : str
        Name of the column holding the total population of each unit.
    center : str, sequence of two values, or int, default "mean"
        Definition of the center of the study area. One of the strings
        "mean", "median", "population_weighted_mean" or "largest_population"
        (ties in the maximum population are averaged); a
        (longitude, latitude) tuple/list/array; or an integer row index of
        data whose centroid is taken as the center.
    metric : {'euclidean', 'haversine'}, default 'euclidean'
        Distance metric between spatial units. Use 'haversine' when the CRS
        of the GeoDataFrame is expressed in degrees.

    Attributes
    ----------
    statistic : float
        Relative Centralization Index.
    core_data : geopandas.GeoDataFrame
        Columns used to perform the estimate.
    center_values : list
        The center, as [longitude, latitude], used for the centralization
        distances.

    Examples
    --------
    With a GeoDataFrame ``gdf`` holding tract geometries, a total-population
    column ``pop10`` and a group column ``nhblk10``:

    >>> from segregation.spatial import RelativeCentralization
    >>> index = RelativeCentralization(gdf, 'nhblk10', 'pop10')
    >>> index.statistic  # doctest: +SKIP
    0.18550429720565376

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    A discussion of defining the center in this function can be found in
    https://github.com/pysal/segregation/issues/18.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self,
                 data,
                 group_pop_var,
                 total_pop_var,
                 center="mean",
                 metric='euclidean'):
        # Remove rows with missing values in the columns of interest.
        data = _nan_handle(
            data[[group_pop_var, total_pop_var, data._geometry_column_name]])
        # The helper returns a (statistic, core_data, center_values) triple.
        self.statistic, self.core_data, self.center_values = \
            _relative_centralization(data, group_pop_var, total_pop_var,
                                     center, metric)
        self._function = _relative_centralization
class SpatialInformationTheory(MultiInformationTheory):
    """Spatial Multigroup Information Theory Index.

    This class calculates the spatial version of the multigroup information
    theory index. The data are "spatialized" by converting each observation
    to a "local environment" by creating a weighted sum of the focal unit with
    its neighboring observations, where the neighborhood is defined by a
    libpysal weights matrix or a pandana Network instance.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        geodataframe with observations as rows
    groups : list
        list of columns on gdf representing population groups for which the SIT
        index should be calculated
    network : pandana.Network
        pandana.Network instance. This is likely created with `get_osm_network`
        or via helper functions from OSMnet or UrbanAccess.
    w : libpysal.W
        distance-based PySAL spatial weights matrix instance
    distance : int
        maximum distance to consider `accessible` (the default is 2000).
    decay : str
        decay type pandana should use "linear", "exp", or "flat"
        (which means no decay). The default is "linear".
    precompute : bool
        Whether the pandana.Network instance should precompute the range
        queries. This is true by default, but if you plan to calculate several
        indices using the same network, then you can set this parameter to
        `False` to avoid precomputing repeatedly inside the function.

    Raises
    ------
    ValueError
        If both `network` and `w` are supplied.
    """

    def __init__(self,
                 data,
                 groups,
                 network=None,
                 w=None,
                 decay='linear',
                 distance=2000,
                 precompute=True):
        if w and network:
            # The original code raised a bare string, which is itself a
            # TypeError in Python 3; raise a proper exception instead.
            raise ValueError(
                "must pass either a pandana network or a pysal weights object "
                "but not both")
        elif network:
            df = calc_access(data,
                             variables=groups,
                             network=network,
                             distance=distance,
                             decay=decay,
                             precompute=precompute)
            # calc_access prefixes its output columns with "acc_".
            groups = ["acc_" + group for group in groups]
        else:
            df = _build_local_environment(data, groups, w)
        super().__init__(df, groups)
class SpatialDivergence(MultiDivergence):
    """Spatial Multigroup Divergence Index.

    This class calculates the spatial version of the multigroup divergence
    index. The data are "spatialized" by converting each observation
    to a "local environment" by creating a weighted sum of the focal unit with
    its neighboring observations, where the neighborhood is defined by a
    libpysal weights matrix or a pandana Network instance.

    Parameters
    ----------
    data : geopandas.GeoDataFrame
        geodataframe with observations as rows
    groups : list
        list of columns on gdf representing population groups for which the
        divergence index should be calculated
    w : libpysal.W
        distance-based PySAL spatial weights matrix instance
    network : pandana.Network
        pandana.Network instance. This is likely created with `get_osm_network`
        or via helper functions from OSMnet or UrbanAccess.
    distance : int
        maximum distance to consider `accessible` (the default is 2000).
    decay : str
        decay type pandana should use "linear", "exp", or "flat"
        (which means no decay). The default is "linear".
    precompute : bool
        Whether the pandana.Network instance should precompute the range
        queries. This is true by default, but if you plan to calculate several
        indices using the same network, then you can set this parameter to
        `False` to avoid precomputing repeatedly inside the function.

    Raises
    ------
    ValueError
        If both `network` and `w` are supplied.
    """

    def __init__(self,
                 data,
                 groups,
                 network=None,
                 w=None,
                 decay='linear',
                 distance=2000,
                 precompute=True):
        if w and network:
            # The original code raised a bare string, which is itself a
            # TypeError in Python 3; raise a proper exception instead.
            raise ValueError(
                "must pass either a pandana network or a pysal weights object "
                "but not both")
        elif network:
            df = calc_access(data,
                             variables=groups,
                             network=network,
                             distance=distance,
                             decay=decay,
                             precompute=precompute)
            # calc_access prefixes its output columns with "acc_".
            groups = ["acc_" + group for group in groups]
        else:
            df = _build_local_environment(data, groups, w)
        super().__init__(df, groups)
def compute_segregation_profile(gdf,
                                groups=None,
                                distances=None,
                                network=None,
                                decay='linear',
                                function='triangular',
                                precompute=True):
    """Compute multiscalar segregation profile.

    This function calculates several Spatial Information Theory indices with
    increasing distance parameters.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        geodataframe with rows as observations and columns as population
        variables. Note that if using a network distance, the coordinate
        system for this gdf should be 4326. If using euclidian distance,
        this must be projected into planar coordinates like state plane or UTM.
    groups : list
        list of population-variable column names.
    distances : list
        list of floats representing bandwidth distances that define a local
        environment.
    network : pandana.Network (optional)
        A pandana.Network likely created with
        `segregation.network.get_osm_network`.
    decay : str (optional)
        decay type to be used in pandana accessibility calculation (the
        default is 'linear').
    function : str (optional)
        which weighting function should be passed to libpysal.weights.Kernel;
        must be one of: 'triangular','uniform','quadratic','quartic','gaussian'
    precompute : bool
        Whether the pandana.Network instance should precompute the range
        queries. This is true by default, but if you plan to calculate several
        segregation profiles using the same network, then you can set this
        parameter to `False` to avoid precomputing repeatedly inside the
        function.

    Returns
    -------
    dict
        dictionary with distances as keys and SIT statistics as values.
        The key ``0`` holds the aspatial MultiInformationTheory statistic.

    Raises
    ------
    ValueError
        If `distances` is not provided.

    Notes
    -----
    Based on Sean F. Reardon, Stephen A. Matthews, David O’Sullivan, Barrett A. Lee, Glenn Firebaugh, Chad R. Farrell, & Kendra Bischoff. (2008). The Geographic Scale of Metropolitan Racial Segregation. Demography, 45(3), 489–514. https://doi.org/10.1353/dem.0.0019.

    Reference: :cite:`Reardon2008`.
    """
    # Fail fast with a clear message instead of an opaque TypeError later.
    if distances is None:
        raise ValueError(
            "distances must be a list of bandwidth distances")

    gdf = gdf.copy()
    gdf[groups] = gdf[groups].astype(float)
    indices = {}
    # Distance 0 corresponds to the purely aspatial index.
    indices[0] = MultiInformationTheory(gdf, groups).statistic

    if network:
        # pandana networks operate on lon/lat, so reproject if necessary.
        if not gdf.crs.name == 'WGS 84':
            gdf = gdf.to_crs(epsg=4326)
        groups2 = ['acc_' + group for group in groups]
        if precompute:
            maxdist = max(distances)
            network.precompute(maxdist)
        for distance in distances:
            # builtin float: the np.float alias was removed in NumPy 1.24.
            distance = float(distance)
            access = calc_access(gdf,
                                 network,
                                 decay=decay,
                                 variables=groups,
                                 distance=distance,
                                 precompute=False)
            sit = MultiInformationTheory(access, groups2)
            indices[distance] = sit.statistic
    else:
        for distance in distances:
            w = Kernel.from_dataframe(gdf,
                                      bandwidth=distance,
                                      function=function)
            sit = SpatialInformationTheory(gdf, groups, w=w)
            indices[distance] = sit.statistic
    return indices
# Deprecation Calls
# Each legacy snake_case name is kept as a module-level alias that wraps the
# new CamelCase class in a DeprecationHelper, so importing or instantiating
# the old name still works but emits the deprecation message built by
# _dep_message(old_name, new_name).

msg = _dep_message("Spatial_Prox_Prof", "SpatialProxProf")
Spatial_Prox_Prof = DeprecationHelper(SpatialProxProf, message=msg)

msg = _dep_message("Spatial_Dissim", "SpatialDissim")
Spatial_Dissim = DeprecationHelper(SpatialDissim, message=msg)

msg = _dep_message("Boundary_Spatial_Dissim", "BoundarySpatialDissim")
Boundary_Spatial_Dissim = DeprecationHelper(BoundarySpatialDissim, message=msg)

msg = _dep_message("Perimeter_Area_Ratio_Spatial_Dissim",
                   "PerimeterAreaRatioSpatialDissim")
Perimeter_Area_Ratio_Spatial_Dissim = DeprecationHelper(
    PerimeterAreaRatioSpatialDissim, message=msg)

msg = _dep_message("Distance_Decay_Isolation", "DistanceDecayIsolation")
Distance_Decay_Isolation = DeprecationHelper(DistanceDecayIsolation,
                                             message=msg)

msg = _dep_message("Distance_Decay_Exposure", "DistanceDecayExposure")
Distance_Decay_Exposure = DeprecationHelper(DistanceDecayExposure, message=msg)

msg = _dep_message("Spatial_Proximity", "SpatialProximity")
Spatial_Proximity = DeprecationHelper(SpatialProximity, message=msg)

msg = _dep_message("Absolute_Clustering", "AbsoluteClustering")
Absolute_Clustering = DeprecationHelper(AbsoluteClustering, message=msg)

msg = _dep_message("Relative_Clustering", "RelativeClustering")
Relative_Clustering = DeprecationHelper(RelativeClustering, message=msg)

msg = _dep_message("Absolute_Concentration", "AbsoluteConcentration")
Absolute_Concentration = DeprecationHelper(AbsoluteConcentration, message=msg)

msg = _dep_message("Relative_Concentration", "RelativeConcentration")
Relative_Concentration = DeprecationHelper(RelativeConcentration, message=msg)

msg = _dep_message("Absolute_Centralization", "AbsoluteCentralization")
Absolute_Centralization = DeprecationHelper(AbsoluteCentralization,
                                            message=msg)

msg = _dep_message("Relative_Centralization", "RelativeCentralization")
Relative_Centralization = DeprecationHelper(RelativeCentralization, message=msg)