Source code for segregation.local.local_indexes

"""
Local based Segregation Metrics

Important: all classes that start with "Multi_" expects a specific type of input of multigroups since the index will be calculated using many groups.
On the other hand, other classes expects a single group for calculation of the metrics.
"""

__author__ = "Renan X. Cortes <renanc@ucr.edu>, Elijah Knaap <elijah.knaap@ucr.edu> and Sergio J. Rey <sergio.rey@ucr.edu>"

import numpy as np
import libpysal as lps

from segregation.spatial import RelativeCentralization

from segregation.util.util import _dep_message, DeprecationHelper

# Including old and new api in __all__ so users can use both

__all__ = [
    'Multi_Location_Quotient',
    'MultiLocationQuotient',
    
    'Multi_Local_Diversity',
    'MultiLocalDiversity',
    
    'Multi_Local_Entropy',
    'MultiLocalEntropy',
    
    'Multi_Local_Simpson_Interaction',
    'MultiLocalSimpsonInteraction',
    
    'Multi_Local_Simpson_Concentration',
    'MultiLocalSimpsonConcentration',
    
    'Local_Relative_Centralization',
    'LocalRelativeCentralization'
]

# The Deprecation calls of the classes are located in the end of this script #




# suppress numpy divide by zero warnings because it occurs a lot during the
# calculation of many indices
np.seterr(divide='ignore', invalid='ignore')

def _multi_location_quotient(data, groups):
    """
    Calculation of Location Quotient index for each group and unit

    Parameters
    ----------

    data   : a pandas DataFrame of n rows
    
    groups : list of strings of length k.
             The variables names in data of the groups of interest of the analysis.

    Returns
    -------

    statistics : np.array(n,k)
                 Location Quotient values for each group and unit.
                 Column k has the Location Quotient of position k in groups.
                
    core_data  : a pandas DataFrame
                 A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Isard, Walter. Methods of regional analysis. Vol. 4. Рипол Классик, 1967.
    
    Reference: :cite:`isard1967methods`.

    """
    
    core_data = data[groups]
    
    df = np.array(core_data)
    
    n = df.shape[0]
    K = df.shape[1]
    
    T = df.sum()
    
    ti = df.sum(axis = 1)
    Xk = df.sum(axis = 0)
    
    multi_LQ = (df / np.repeat(ti, K, axis = 0).reshape(n,K)) / (Xk / T)
    
    return multi_LQ, core_data


[docs]class MultiLocationQuotient: """ Calculation of Location Quotient index for each group and unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings of length k. The variables names in data of the groups of interest of the analysis. Attributes ---------- statistics : np.array(n,k) Location Quotient values for each group and unit. Column k has the Location Quotient of position k in groups. core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The groups of interest are White, Black, Asian and Hispanic population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import MultiLocationQuotient Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) >>> groups_list = ['WHITE_', 'BLACK_', 'ASIAN_','HISP_'] The value is estimated below. >>> index = MultiLocationQuotient(input_df, groups_list) >>> index.statistics[0:3,0:3] array([[1.36543221, 0.07478049, 0.16245651], [1.18002164, 0. , 0.14836683], [0.68072696, 0.03534425, 0. ]]) Important to note that column k has the Location Quotient (LQ) of position k in groups. Therefore, the LQ of the first unit of 'WHITE_' is 1.36543221. Notes ----- Based on Isard, Walter. Methods of regional analysis. Vol. 4. Рипол Классик, 1967. Reference: :cite:`isard1967methods`. """
[docs] def __init__(self, data, groups): aux = _multi_location_quotient(data, groups) self.statistics = aux[0] self.core_data = aux[1] self._function = _multi_location_quotient
def _multi_local_diversity(data, groups): """ Calculation of Local Diversity index for each group and unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings of length k. The variables names in data of the groups of interest of the analysis. Returns ------- statistics : np.array(n,k) Local Diversity values for each group and unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Theil, Henry. Statistical decomposition analysis; with applications in the social and administrative sciences. No. 04; HA33, T4.. 1972. Reference: :cite:`theil1972statistical`. """ core_data = data[groups] df = np.array(core_data) ti = df.sum(axis = 1) pik = df/ti[:,None] multi_LD = - np.nansum(pik * np.log(pik), axis = 1) return multi_LD, core_data
[docs]class MultiLocalDiversity: """ Calculation of Local Diversity index for each group and unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings of length k. The variables names in data of the groups of interest of the analysis. Attributes ---------- statistics : np.array(n,k) Local Diversity values for each group and unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The groups of interest are White, Black, Asian and Hispanic population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import MultiLocalDiversity Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) >>> groups_list = ['WHITE_', 'BLACK_', 'ASIAN_','HISP_'] The value is estimated below. >>> index = MultiLocalDiversity(input_df, groups_list) >>> index.statistics[0:10] # Values of first 10 units array([0.34332326, 0.56109229, 0.70563225, 0.29713472, 0.22386084, 0.29742517, 0.12322789, 0.11274579, 0.09402405, 0.25129616]) Notes ----- Based on Theil, Henry. Statistical decomposition analysis; with applications in the social and administrative sciences. No. 04; HA33, T4.. 1972. Reference: :cite:`theil1972statistical`. """
[docs] def __init__(self, data, groups): aux = _multi_local_diversity(data, groups) self.statistics = aux[0] self.core_data = aux[1] self._function = _multi_local_diversity
def _multi_local_entropy(data, groups): """ Calculation of Local Entropy index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Returns ------- statistics : np.array(n) Local Entropy values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Eq. 6 of pg. 139 (individual unit case) of Reardon, Sean F., and David O’Sullivan. "Measures of spatial segregation." Sociological methodology 34.1 (2004): 121-162. Reference: :cite:`reardon2004measures`. """ core_data = data[groups] df = np.array(core_data) K = df.shape[1] ti = df.sum(axis = 1) pik = df/ti[:,None] multi_LE = - np.nansum((pik * np.log(pik)) / np.log(K), axis = 1) return multi_LE, core_data
[docs]class MultiLocalEntropy: """ Calculation of Local Entropy index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Attributes ---------- statistics : np.array(n) Local Entropy values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The groups of interest are White, Black, Asian and Hispanic population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import MultiLocalEntropy Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) >>> groups_list = ['WHITE_', 'BLACK_', 'ASIAN_','HISP_'] The value is estimated below. >>> index = MultiLocalEntropy(input_df, groups_list) >>> index.statistics[0:10] # Values of first 10 units array([0.24765538, 0.40474253, 0.50900607, 0.21433739, 0.16148146, 0.21454691, 0.08889013, 0.08132889, 0.06782401, 0.18127186]) Notes ----- Based on Eq. 6 of pg. 139 (individual unit case) of Reardon, Sean F., and David O’Sullivan. "Measures of spatial segregation." Sociological methodology 34.1 (2004): 121-162. Reference: :cite:`reardon2004measures`. """
[docs] def __init__(self, data, groups): aux = _multi_local_entropy(data, groups) self.statistics = aux[0] self.core_data = aux[1] self._function = _multi_local_entropy
def _multi_local_simpson_interaction(data, groups): """ Calculation of Local Simpson Interaction index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Returns ------- statistics : np.array(n) Local Simpson Interaction values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on the local version of Equation 1 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Simpson's interaction index can be simply interpreted as the probability that two individuals chosen at random and independently from the population will be found to not belong to the same group. Higher values means lesser segregation. Simpson's Concentration + Simpson's Interaction = 1 Reference: :cite:`reardon2002measures`. """ core_data = data[groups] df = np.array(core_data) ti = df.sum(axis = 1) pik = df/ti[:,None] local_SI = np.nansum(pik * (1 - pik), axis = 1) return local_SI, core_data
[docs]class MultiLocalSimpsonInteraction: """ Calculation of Local Simpson Interaction index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Attributes ---------- statistics : np.array(n) Local Simpson Interaction values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The groups of interest are White, Black, Asian and Hispanic population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import MultiLocalSimpsonInteraction Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) >>> groups_list = ['WHITE_', 'BLACK_', 'ASIAN_','HISP_'] The value is estimated below. >>> index = MultiLocalSimpsonInteraction(input_df, groups_list) >>> index.statistics[0:10] # Values of first 10 units array([0.15435993, 0.33391595, 0.49909747, 0.1299449 , 0.09805056, 0.13128178, 0.04447356, 0.0398933 , 0.03723054, 0.11758548]) Notes ----- Based on the local version of Equation 1 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Simpson's interaction index can be simply interpreted as the probability that two individuals chosen at random and independently from the population will be found to not belong to the same group. Higher values means lesser segregation. Simpson's Concentration + Simpson's Interaction = 1 Reference: :cite:`reardon2002measures`. """
[docs] def __init__(self, data, groups): aux = _multi_local_simpson_interaction(data, groups) self.statistics = aux[0] self.core_data = aux[1] self._function = _multi_local_simpson_interaction
def _multi_local_simpson_concentration(data, groups): """ Calculation of Local Simpson concentration index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Returns ------- statistics : np.array(n) Local Simpson concentration values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on the local version of Equation 1 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Simpson's concentration index can be simply interpreted as the probability that two individuals chosen at random and independently from the population will be found to belong to the same group. Higher values means lesser segregation. Simpson's Concentration + Simpson's Interaction = 1 Reference: :cite:`reardon2002measures`. """ core_data = data[groups] df = np.array(core_data) ti = df.sum(axis = 1) pik = df/ti[:,None] local_SC = np.nansum(pik * pik, axis = 1) return local_SC, core_data
[docs]class MultiLocalSimpsonConcentration: """ Calculation of Local Simpson concentration index for each unit Parameters ---------- data : a pandas DataFrame of n rows groups : list of strings. The variables names in data of the groups of interest of the analysis. Attributes ---------- statistics : np.array(n) Local Simpson concentration values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The groups of interest are White, Black, Asian and Hispanic population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import MultiLocalSimpsonConcentration Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) >>> groups_list = ['WHITE_', 'BLACK_', 'ASIAN_','HISP_'] The value is estimated below. >>> index = MultiLocalSimpsonConcentration(input_df, groups_list) >>> index.statistics[0:10] # Values of first 10 units array([0.84564007, 0.66608405, 0.50090253, 0.8700551 , 0.90194944, 0.86871822, 0.95552644, 0.9601067 , 0.96276946, 0.88241452]) Notes ----- Based on the local version of Equation 1 of page 37 of Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67. Simpson's concentration index can be simply interpreted as the probability that two individuals chosen at random and independently from the population will be found to belong to the same group. Higher values means lesser segregation. Simpson's Concentration + Simpson's Interaction = 1 Reference: :cite:`reardon2002measures`. """
[docs] def __init__(self, data, groups): aux = _multi_local_simpson_concentration(data, groups) self.statistics = aux[0] self.core_data = aux[1] self._function = _multi_local_simpson_concentration
def _local_relative_centralization(data, group_pop_var, total_pop_var, k_neigh = 5): """ Calculation of Local Relative Centralization index for each unit Parameters ---------- data : a geopandas DataFrame with a geometry column. group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit k_neigh : integer greater than 0. Default is 5. Number of assumed neighbors for local context (it uses k-nearest algorithm method) Returns ------- statistics : np.array(n) Local Relative Centralization values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Folch, David C., and Sergio J. Rey. "The centralization index: A measure of local spatial segregation." Papers in Regional Science 95.3 (2016): 555-576. Reference: :cite:`folch2016centralization`. """ data = data.copy() core_data = data[[group_pop_var, total_pop_var, 'geometry']] c_lons = data.centroid.map(lambda p: p.x) c_lats = data.centroid.map(lambda p: p.y) points = list(zip(c_lons, c_lats)) kd = lps.cg.kdtree.KDTree(np.array(points)) wnnk = lps.weights.KNN(kd, k = k_neigh) local_RCEs = np.empty(len(data)) for i in range(len(data)): x = list(wnnk.neighbors.values())[i] x.append(list(wnnk.neighbors.keys())[i]) # Append in the end the current unit i inside the local context local_data = data.iloc[x,:].copy() # The center is given by the last position (i.e. the current unit i) local_RCE = RelativeCentralization(local_data, group_pop_var, total_pop_var, center = len(local_data) - 1) local_RCEs[i] = local_RCE.statistic return local_RCEs, core_data
[docs]class LocalRelativeCentralization: """ Calculation of Local Relative Centralization index for each unit Parameters ---------- data : a geopandas DataFrame with a geometry column. group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit k_neigh : integer greater than 0. Default is 5. Number of assumed neighbors for local context (it uses k-nearest algorithm method) Returns ------- statistics : np.array(n) Local Relative Centralization values for each unit core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Examples -------- In this example, we are going to use 2000 Census Tract Data for Sacramento MSA, CA. The group of interest is Black population. Firstly, we need to perform some import the modules and the respective function. >>> import libpysal >>> import geopandas as gpd >>> from segregation.local import LocalRelativeCentralization Then, we read the data and create an auxiliary list with only the necessary columns for fitting the index. >>> input_df = gpd.read_file(libpysal.examples.get_path("sacramentot2.shp")) The value is estimated below. >>> index = LocalRelativeCentralization(input_df, 'BLACK_', 'TOT_POP') >>> index.statistics[0:10] # Values of first 10 units array([ 0.03443055, -0.29063264, -0.19110976, 0.24978919, 0.01252249, 0.61152941, 0.78917647, 0.53129412, 0.04436346, -0.20216325]) Notes ----- Based on Folch, David C., and Sergio J. Rey. "The centralization index: A measure of local spatial segregation." Papers in Regional Science 95.3 (2016): 555-576. Reference: :cite:`folch2016centralization`. """
[docs] def __init__(self, data, group_pop_var, total_pop_var, k_neigh = 5): aux = _local_relative_centralization(data, group_pop_var, total_pop_var, k_neigh) self.statistics = aux[0] self.core_data = aux[1] self._function = _local_relative_centralization
# Deprecation Calls msg = _dep_message("Multi_Location_Quotient", "MultiLocationQuotient") Multi_Location_Quotient = DeprecationHelper(MultiLocationQuotient, message=msg) msg = _dep_message("Multi_Local_Diversity", "MultiLocalDiversity") Multi_Local_Diversity = DeprecationHelper(MultiLocalDiversity, message=msg) msg = _dep_message("Multi_Local_Entropy", "MultiLocalEntropy") Multi_Local_Entropy = DeprecationHelper(MultiLocalEntropy, message=msg) msg = _dep_message("Multi_Local_Simpson_Interaction", "MultiLocalSimpsonInteraction") Multi_Local_Simpson_Interaction = DeprecationHelper(MultiLocalSimpsonInteraction, message=msg) msg = _dep_message("Multi_Local_Simpson_Concentration", "MultiLocalSimpsonConcentration") Multi_Local_Simpson_Concentration = DeprecationHelper(MultiLocalSimpsonConcentration, message=msg) msg = _dep_message("Local_Relative_Centralization", "LocalRelativeCentralization") Local_Relative_Centralization = DeprecationHelper(LocalRelativeCentralization, message=msg)