Source code for segregation.aspatial.aspatial_indexes

"""
Aspatial based Segregation Metrics
"""

__author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>"

import numpy as np
import pandas as pd
import warnings
import geopandas as gpd

from scipy.stats import norm
from scipy.optimize import minimize

from segregation.util.util import _dep_message, DeprecationHelper, _nan_handle

# Including old and new api in __all__ so users can use both

# Public API: both the legacy (underscored) and current (CamelCase) names
# are exported so code written against either naming scheme keeps working.
__all__ = [
    'Dissim',
    'Gini_Seg', 'GiniSeg',
    'Entropy',
    'Isolation',
    'Exposure',
    'Atkinson',
    'Correlation_R', 'CorrelationR',
    'Con_Prof', 'ConProf',
    'Modified_Dissim', 'ModifiedDissim',
    'Modified_Gini_Seg', 'ModifiedGiniSeg',
    'Bias_Corrected_Dissim', 'BiasCorrectedDissim',
    'Density_Corrected_Dissim', 'DensityCorrectedDissim',
    'MinMax',
]

# The Deprecation calls of the classes are located in the end of this script #



def _min_max(data, group_pop_var, total_pop_var):
    """
    Calculation of the Aspatial version of SpatialMinMax

    Parameters
    ----------

    data          : a pandas DataFrame
    
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
                    
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Returns
    ----------

    statistic : float
                MinMax Index
                
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on O'Sullivan & Wong (2007). A Surface‐Based Approach to Measuring Spatial Segregation.
    Geographical Analysis 39 (2). https://doi.org/10.1111/j.1538-4632.2007.00699.x

    Reference: :cite:`osullivanwong2007surface`.
    
    We'd like to thank @AnttiHaerkoenen for this contribution!
    
    """
    
    if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)):
        raise TypeError('group_pop_var and total_pop_var must be strings')
    
    if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)):    
        raise ValueError('group_pop_var and total_pop_var must be variables of data')
        
    data = data.rename(columns={group_pop_var: 'group_pop_var', 
                                total_pop_var: 'total_pop_var'})
    
    if any(data.total_pop_var < data.group_pop_var):    
        raise ValueError('Group of interest population must equal or lower than the total population of the units.')
   
    data['group_2_pop_var'] = data['total_pop_var'] - data['group_pop_var']
    
    data['group_1_pop_var_norm'] = data['group_pop_var'] / data['group_pop_var'].sum()
    data['group_2_pop_var_norm'] = data['group_2_pop_var'] / data['group_2_pop_var'].sum()
    
    density_1 = data['group_1_pop_var_norm'].values
    density_2 = data['group_2_pop_var_norm'].values
    densities = np.vstack([
        density_1,
        density_2
    ])
    v_union = densities.max(axis=0).sum()
    v_intersect = densities.min(axis=0).sum()
    
    MM = 1 - v_intersect / v_union
    
    if not isinstance(data, gpd.GeoDataFrame):
        core_data = data[['group_pop_var', 'total_pop_var']]
    
    else:    
        core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
    
    return MM, core_data


class MinMax:
    """
    Calculation of the Aspatial version of SpatialMinMax

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                MinMax Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on O'Sullivan & Wong (2007). A Surface-Based Approach to Measuring
    Spatial Segregation. Geographical Analysis 39 (2).
    https://doi.org/10.1111/j.1538-4632.2007.00699.x

    Reference: :cite:`osullivanwong2007surface`.

    We'd like to thank @AnttiHaerkoenen for this contribution!
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Drop rows with missing values first; keep the geometry column
        # only when the input actually is a GeoDataFrame.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _min_max(data, group_pop_var, total_pop_var)
        self._function = _min_max
def _dissim(data, group_pop_var, total_pop_var): """ Calculation of Dissimilarity index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Dissimilarity Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') T = t.sum() P = x.sum() / T # If a unit has zero population, the group of interest frequency is zero pi = np.where(t == 0, 0, x / t) D = (((t * abs(pi - P)))/ (2 * T * P * (1 - P))).sum() if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return D, core_data
class Dissim:
    """
    Classic Dissimilarity Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                Dissimilarity Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> dissim_index = Dissim(df, 'nhblk10', 'pop10')
    >>> dissim_index.statistic

    The statistic is the share of the group of interest that would have to
    move for the region to reach evenness.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Strip NaN rows up front; retain the geometry column only for
        # GeoDataFrame inputs so core_data stays mappable.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _dissim(data, group_pop_var, total_pop_var)
        self._function = _dissim
def _gini_seg(data, group_pop_var, total_pop_var): """ Calculation of Gini Segregation index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Gini Segregation Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) if any(data.total_pop_var < data.group_pop_var): raise ValueError('Group of interest population must equal or lower than the total population of the units.') T = data.total_pop_var.sum() P = data.group_pop_var.sum() / T # If a unit has zero population, the group of interest frequency is zero data = data.assign(ti = data.total_pop_var, pi = np.where(data.total_pop_var == 0, 0, data.group_pop_var/data.total_pop_var)) num = (np.matmul(np.array(data.ti)[np.newaxis].T, np.array(data.ti)[np.newaxis]) * abs(np.array(data.pi)[np.newaxis].T - np.array(data.pi)[np.newaxis])).sum() den = (2 * T**2 * P * (1-P)) G = num / den if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return G, core_data
class GiniSeg:
    """
    Classic Gini Segregation Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                Gini Segregation Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> gini_seg_index = GiniSeg(df, 'nhblk10', 'pop10')
    >>> gini_seg_index.statistic

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Remove NaN rows before estimating; geometry is carried along
        # only when the input is a GeoDataFrame.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _gini_seg(data, group_pop_var, total_pop_var)
        self._function = _gini_seg
def _entropy(data, group_pop_var, total_pop_var): """ Calculation of Entropy index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Entropy Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') T = t.sum() P = x.sum() / T # If a unit has zero population, the group of interest frequency is zero pi = np.where(t == 0, 0, x / t) E = P * np.log(1 / P) + (1 - P) * np.log(1 / (1 - P)) Ei = pi * np.log(1 / pi) + (1 - pi) * np.log(1 / (1 - pi)) H = np.nansum(t * (E - Ei) / (E * T)) # If some pi is zero, numpy will treat as zero if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return H, core_data
class Entropy:
    """
    Classic Entropy Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                Entropy Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> entropy_index = Entropy(df, 'nhblk10', 'pop10')
    >>> entropy_index.statistic

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Clean NaN rows; keep geometry only for GeoDataFrame inputs.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _entropy(data, group_pop_var, total_pop_var)
        self._function = _entropy
def _isolation(data, group_pop_var, total_pop_var): """ Calculation of Isolation index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest (X) total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Isolation Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- The group of interest is labelled as group X. Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') X = x.sum() xPx = np.nansum((x / X) * (x / t)) if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return xPx, core_data
class Isolation:
    """
    Classic Isolation Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest (X)

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                Isolation Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> isolation_index = Isolation(df, 'nhblk10', 'pop10')
    >>> isolation_index.statistic

    The statistic is the probability that a randomly drawn member of group X
    shares a unit with another member of group X.

    Notes
    -----
    The group of interest is labelled as group X.

    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Clean NaN rows; keep geometry only for GeoDataFrame inputs.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _isolation(data, group_pop_var, total_pop_var)
        self._function = _isolation
def _exposure(data, group_pop_var, total_pop_var): """ Calculation of Exposure index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest (X) total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Exposure Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- The group of interest is labelled as group X, whilst Y is the complementary group. Groups X and Y are mutually excludent. Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') yi = t - x X = x.sum() xPy = np.nansum((x / X) * (yi / t)) if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return xPy, core_data
class Exposure:
    """
    Classic Exposure Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest (X)

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------

    statistic : float
                Exposure Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> exposure_index = Exposure(df, 'nhblk10', 'pop10')
    >>> exposure_index.statistic

    The statistic is the probability that a randomly drawn member of group X
    shares a unit with a member of the complementary group Y.

    Notes
    -----
    The group of interest is labelled as group X, whilst Y is the
    complementary group. Groups X and Y are mutually excludent.

    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):

        # Clean NaN rows; keep geometry only for GeoDataFrame inputs.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _exposure(data, group_pop_var, total_pop_var)
        self._function = _exposure
def _atkinson(data, group_pop_var, total_pop_var, b = 0.5): """ Calculation of Atkinson index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit b : float The shape parameter, between 0 and 1, that determines how to weight the increments to segregation contributed by different portions of the Lorenz curve. Returns ---------- statistic : float Atkinson Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if (not isinstance(b, float)): raise ValueError('The parameter b must be a float.') if ((b < 0) or (b > 1)): raise ValueError('The parameter b must be between 0 and 1.') if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') T = t.sum() P = x.sum() / T # If a unit has zero population, the group of interest frequency is zero pi = np.where(t == 0, 0, x / t) A = 1 - (P / (1-P)) * abs((((1 - pi) ** (1-b) * pi ** b * t) / (P * T)).sum()) ** (1 / (1 - b)) if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = 
data[['group_pop_var', 'total_pop_var', 'geometry']] return A, core_data
class Atkinson:
    """
    Classic Atkinson Index

    Parameters
    ----------

    data          : a pandas DataFrame

    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest

    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    b             : float
                    The shape parameter, between 0 and 1, that determines how to weight the increments to
                    segregation contributed by different portions of the Lorenz curve.

    Attributes
    ----------

    statistic : float
                Atkinson Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Examples
    --------
    With a tract-level DataFrame ``df`` holding a group count column and a
    total population column:

    >>> atkinson_index = Atkinson(df, 'nhblk10', 'pop10', b = 0.5)
    >>> atkinson_index.statistic

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of
    residential segregation." Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, b = 0.5):

        # Clean NaN rows; keep geometry only for GeoDataFrame inputs.
        if (str(type(data)) == '<class \'geopandas.geodataframe.GeoDataFrame\'>'):
            cols = [group_pop_var, total_pop_var, data._geometry_column_name]
        else:
            cols = [group_pop_var, total_pop_var]
        data = _nan_handle(data[cols])

        self.statistic, self.core_data = _atkinson(data, group_pop_var, total_pop_var, b)
        self._function = _atkinson
def _correlationr(data, group_pop_var, total_pop_var): """ Calculation of Correlation Ratio index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest (X) total_pop_var : string The name of variable in data that contains the total population of the unit Returns ---------- statistic : float Correlation Ratio Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation." Social forces 67.2 (1988): 281-315. Reference: :cite:`massey1988dimensions`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') X = x.sum() T = t.sum() P = X / T xPx = np.nansum((x / X) * (x / t)) V = (xPx - P) / (1 - P) if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return V, core_data
class CorrelationR:
    """
    Classic Correlation Ratio Index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest (X)
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit

    Attributes
    ----------
    statistic : float
                Correlation Ratio Index
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Massey, Douglas S., and Nancy A. Denton. "The dimensions of residential segregation."
    Social forces 67.2 (1988): 281-315.

    Reference: :cite:`massey1988dimensions`.
    """

    def __init__(self, data, group_pop_var, total_pop_var):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _correlationr(data, group_pop_var, total_pop_var)

        self.statistic = result[0]
        self.core_data = result[1]
        self._function = _correlationr
def _conprof(data, group_pop_var, total_pop_var, m = 1000): """ Calculation of Concentration Profile Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit m : int a numeric value indicating the number of thresholds to be used. Default value is 1000. A large value of m creates a smoother-looking graph and a more precise concentration profile value but slows down the calculation speed. Returns ---------- statistic : float Concentration Profile Index core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Hong, Seong-Yun, and Yukio Sadahiro. "Measuring geographic segregation: a graph-based approach." Journal of Geographical Systems 16.2 (2014): 211-231. Reference: :cite:`hong2014measuring`. """ if(type(m) is not int): raise TypeError('m must be a string.') if(m < 2): raise ValueError('m must be greater than 1.') if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) x = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < x): raise ValueError('Group of interest population must equal or lower than the total population of the units.') def calculate_vt(th): g_t_i = np.where(x / t >= th, 1, 0) v_t = (g_t_i * x).sum() / x.sum() return v_t grid = np.linspace(0, 1, m) curve = np.array(list(map(calculate_vt, grid))) threshold = x.sum() / t.sum() R = ((threshold - ((curve[grid < threshold]).sum() / m - (curve[grid >= threshold]).sum()/ m)) / (1 - 
threshold)) if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return R, grid, curve, core_data
class ConProf:
    """
    Concentration Profile Index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    m             : int
                    a numeric value indicating the number of thresholds to be used. A large value of m
                    creates a smoother-looking graph and a more precise concentration profile value but
                    slows down the calculation speed.

    Attributes
    ----------
    statistic : float
                Concentration Profile Index
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Hong, Seong-Yun, and Yukio Sadahiro. "Measuring geographic segregation: a graph-based
    approach." Journal of Geographical Systems 16.2 (2014): 211-231.

    Reference: :cite:`hong2014measuring`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, m=1000):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _conprof(data, group_pop_var, total_pop_var, m)

        self.statistic = result[0]
        self.grid = result[1]
        self.curve = result[2]
        self.core_data = result[3]
        self._function = _conprof

    def plot(self):
        """
        Plot the Concentration Profile
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            warnings.warn('This method relies on importing `matplotlib`')
        return plt.scatter(self.grid, self.curve, s=0.1)
def _modified_dissim(data, group_pop_var, total_pop_var, iterations=500):
    """
    Calculation of Modified Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    iterations    : int
                    The number of iterations the evaluate average classic dissimilarity under eveness.
                    Default value is 500.

    Returns
    ----------
    statistic : float
                Modified Dissimilarity Index (Dissimilarity from Carrington and Troske (1997))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples
    with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    Reference: :cite:`carrington1997measuring`.
    """
    if type(iterations) is not int:
        raise TypeError('iterations must be an integer')

    if iterations < 2:
        raise TypeError('iterations must be greater than 1.')

    observed = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    # core_data must be captured before the simulation loop, because
    # `assign` below repeatedly overwrites group_pop_var.
    if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
        core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
    else:
        core_data = data[['group_pop_var', 'total_pop_var']]

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # Null model: every unit shares the overall group proportion (evenness).
    null_share = x.sum() / t.sum()
    n_units = data.shape[0]

    simulated = np.empty(iterations)
    for i in range(iterations):
        freq_sim = np.random.binomial(n=np.array([t.tolist()]),
                                      p=np.array([[null_share] * n_units]),
                                      size=(1, n_units)).tolist()[0]
        data = data.assign(group_pop_var=freq_sim)
        simulated[i] = _dissim(data, 'group_pop_var', 'total_pop_var')[0]

    expected = simulated.mean()

    # Rescale the observed index relative to its simulated expectation under evenness.
    if observed >= expected:
        Dct = (observed - expected) / (1 - expected)
    else:
        Dct = (observed - expected) / expected

    return Dct, core_data
class ModifiedDissim:
    """
    Calculation of Modified Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    iterations    : int
                    The number of iterations the evaluate average classic dissimilarity under eveness.
                    Default value is 500.

    Attributes
    ----------
    statistic : float
                Modified Dissimilarity Index (Dissimilarity from Carrington and Troske (1997))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples
    with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    Reference: :cite:`carrington1997measuring`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, iterations=500):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _modified_dissim(data, group_pop_var, total_pop_var, iterations)

        self.statistic = result[0]
        self.core_data = result[1]
        self._function = _modified_dissim
def _modified_gini_seg(data, group_pop_var, total_pop_var, iterations=500):
    """
    Calculation of Modified Gini Segregation index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    iterations    : int
                    The number of iterations the evaluate average classic gini segregation under eveness.
                    Default value is 500.

    Returns
    ----------
    statistic : float
                Modified Gini Segregation Index (Gini from Carrington and Troske (1997))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples
    with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    Reference: :cite:`carrington1997measuring`.
    """
    if type(iterations) is not int:
        raise TypeError('iterations must be an integer')

    if iterations < 2:
        raise TypeError('iterations must be greater than 1.')

    observed = _gini_seg(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    # core_data must be captured before the simulation loop, because
    # `assign` below repeatedly overwrites group_pop_var.
    if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
        core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
    else:
        core_data = data[['group_pop_var', 'total_pop_var']]

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)

    # Null model: every unit shares the overall group proportion (evenness).
    null_share = x.sum() / t.sum()
    n_units = data.shape[0]

    simulated = np.empty(iterations)
    for i in range(iterations):
        freq_sim = np.random.binomial(n=np.array([t.tolist()]),
                                      p=np.array([[null_share] * n_units]),
                                      size=(1, n_units)).tolist()[0]
        data = data.assign(group_pop_var=freq_sim)
        simulated[i] = _gini_seg(data, 'group_pop_var', 'total_pop_var')[0]

    expected = simulated.mean()

    # Rescale the observed index relative to its simulated expectation under evenness.
    if observed >= expected:
        Gct = (observed - expected) / (1 - expected)
    else:
        Gct = (observed - expected) / expected

    return Gct, core_data
class ModifiedGiniSeg:
    """
    Calculation of Modified Gini Segregation index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    iterations    : int
                    The number of iterations the evaluate average classic gini segregation under eveness.
                    Default value is 500.

    Attributes
    ----------
    statistic : float
                Modified Gini Segregation Index (Gini from Carrington and Troske (1997))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Carrington, William J., and Kenneth R. Troske. "On measuring segregation in samples
    with small units." Journal of Business & Economic Statistics 15.4 (1997): 402-409.

    Reference: :cite:`carrington1997measuring`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, iterations=500):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _modified_gini_seg(data, group_pop_var, total_pop_var, iterations)

        self.statistic = result[0]
        self.core_data = result[1]
        self._function = _modified_gini_seg
def _bias_corrected_dissim(data, group_pop_var, total_pop_var, B=500):
    """
    Calculation of Bias Corrected Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    B             : int
                    The number of iterations to calculate Dissimilarity simulating randomness with
                    multinomial distributions. Default value is 500.

    Returns
    ----------
    statistic : float
                Dissimilarity with Bias-Correction (bias correction from Allen, Rebecca et al. (2015))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of
    segregation." The econometrics journal 18.1 (2015): 40-66.

    Reference: :cite:`allen2015more`.
    """
    if type(B) is not int:
        raise TypeError('B must be an integer')

    if B < 2:
        raise TypeError('B must be greater than 1.')

    D = _dissim(data, group_pop_var, total_pop_var)[0]

    data = data.rename(columns={group_pop_var: 'group_pop_var',
                                total_pop_var: 'total_pop_var'})

    x = np.array(data.group_pop_var)
    t = np.array(data.total_pop_var)
    other_group_pop = t - x

    # Group 0 (minority): resample unit counts from a multinomial fitted to observed shares.
    sim_minority = np.random.multinomial(x.sum(), x / x.sum(), size=B)

    # Group 1 (complement): same resampling scheme for everyone else.
    sim_rest = np.random.multinomial(other_group_pop.sum(),
                                     other_group_pop / other_group_pop.sum(),
                                     size=B)

    bootstrap = np.empty(B)
    for i in range(B):
        df_aux = pd.DataFrame.from_dict({
            'simul_group': sim_minority[i].tolist(),
            'simul_tot': (sim_minority[i] + sim_rest[i]).tolist(),
        })
        bootstrap[i] = _dissim(df_aux, 'simul_group', 'simul_tot')[0]

    # Bias correction: Dbc = 2*D - E[D_boot]. Dbc is expected to be lower
    # than D, because D is upward biased.
    Dbc = 2 * D - bootstrap.mean()

    if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
        core_data = data[['group_pop_var', 'total_pop_var', 'geometry']]
    else:
        core_data = data[['group_pop_var', 'total_pop_var']]

    return Dbc, core_data
class BiasCorrectedDissim:
    """
    Calculation of Bias Corrected Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    B             : int
                    The number of iterations to calculate Dissimilarity simulating randomness with
                    multinomial distributions. Default value is 500.

    Attributes
    ----------
    statistic : float
                Dissimilarity with Bias-Correction (bias correction from Allen, Rebecca et al. (2015))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of
    segregation." The econometrics journal 18.1 (2015): 40-66.

    Reference: :cite:`allen2015more`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, B=500):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _bias_corrected_dissim(data, group_pop_var, total_pop_var, B)

        self.statistic = result[0]
        self.core_data = result[1]
        self._function = _bias_corrected_dissim
def _density_corrected_dissim(data, group_pop_var, total_pop_var, xtol = 1e-5): """ Calculation of Density Corrected Dissimilarity index Parameters ---------- data : a pandas DataFrame group_pop_var : string The name of variable in data that contains the population size of the group of interest total_pop_var : string The name of variable in data that contains the total population of the unit xtol : float The degree of tolerance in the optimization process of returning optimal theta_j Returns ---------- statistic : float Dissimilarity with Density-Correction (density correction from Allen, Rebecca et al. (2015)) core_data : a pandas DataFrame A pandas DataFrame that contains the columns used to perform the estimate. Notes ----- Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of segregation." The econometrics journal 18.1 (2015): 40-66. Reference: :cite:`allen2015more`. """ if((type(group_pop_var) is not str) or (type(total_pop_var) is not str)): raise TypeError('group_pop_var and total_pop_var must be strings') if ((group_pop_var not in data.columns) or (total_pop_var not in data.columns)): raise ValueError('group_pop_var and total_pop_var must be variables of data') data = data.rename(columns={group_pop_var: 'group_pop_var', total_pop_var: 'total_pop_var'}) g = np.array(data.group_pop_var) t = np.array(data.total_pop_var) if any(t < g): raise ValueError('Group of interest population must equal or lower than the total population of the units.') other_group_pop = t - g # Group 0: minority group p0_i = g / g.sum() n0 = g.sum() # Group 1: complement group p1_i = other_group_pop / other_group_pop.sum() n1 = other_group_pop.sum() sigma_hat_j = np.sqrt(((p1_i * (1 - p1_i)) / n1) + ((p0_i * (1 - p0_i)) / n0)) theta_hat_j = abs(p1_i - p0_i) / sigma_hat_j # Constructing function that returns $n(\hat{\theta}_j)$ def return_optimal_theta(theta_j): def fold_norm(x): y = (-1) * (norm.pdf(x - theta_j) + norm.pdf(x + theta_j)) return y 
initial_guesses = np.array(0) res = minimize(fold_norm, initial_guesses, method='nelder-mead', options = {'xtol': xtol}) return res.final_simplex[0][1][0] optimal_thetas = pd.Series(data = theta_hat_j).apply(return_optimal_theta) Ddc = np.multiply(sigma_hat_j, optimal_thetas).sum() / 2 if (str(type(data)) != '<class \'geopandas.geodataframe.GeoDataFrame\'>'): core_data = data[['group_pop_var', 'total_pop_var']] else: core_data = data[['group_pop_var', 'total_pop_var', 'geometry']] return Ddc, core_data
class DensityCorrectedDissim:
    """
    Calculation of Density Corrected Dissimilarity index

    Parameters
    ----------
    data          : a pandas DataFrame
    group_pop_var : string
                    The name of variable in data that contains the population size of the group of interest
    total_pop_var : string
                    The name of variable in data that contains the total population of the unit
    xtol          : float
                    The degree of tolerance in the optimization process of returning optimal theta_j

    Attributes
    ----------
    statistic : float
                Dissimilarity with Density-Correction (density correction from Allen, Rebecca et al. (2015))
    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Allen, Rebecca, et al. "More reliable inference for the dissimilarity index of
    segregation." The econometrics journal 18.1 (2015): 40-66.

    Reference: :cite:`allen2015more`.
    """

    def __init__(self, data, group_pop_var, total_pop_var, xtol=1e-5):
        # Drop rows with missing values; keep the geometry column for GeoDataFrames.
        keep = [group_pop_var, total_pop_var]
        if str(type(data)) == "<class 'geopandas.geodataframe.GeoDataFrame'>":
            keep.append(data._geometry_column_name)
        data = _nan_handle(data[keep])

        result = _density_corrected_dissim(data, group_pop_var, total_pop_var, xtol)

        self.statistic = result[0]
        self.core_data = result[1]
        self._function = _density_corrected_dissim
# Deprecation Calls msg = _dep_message("Gini_Seg", "GiniSeg") Gini_Seg = DeprecationHelper(GiniSeg, message=msg) msg = _dep_message("Correlation_R", "CorrelationR") Correlation_R = DeprecationHelper(CorrelationR, message=msg) msg = _dep_message("Con_Prof", "ConProf") Con_Prof = DeprecationHelper(ConProf, message=msg) msg = _dep_message("Modified_Dissim", "ModifiedDissim") Modified_Dissim = DeprecationHelper(ModifiedDissim, message=msg) msg = _dep_message("Modified_Gini_Seg", "ModifiedGiniSeg") Modified_Gini_Seg = DeprecationHelper(ModifiedGiniSeg, message=msg) msg = _dep_message("Bias_Corrected_Dissim", "BiasCorrectedDissim") Bias_Corrected_Dissim = DeprecationHelper(BiasCorrectedDissim, message=msg) msg = _dep_message("Density_Corrected_Dissim", "DensityCorrectedDissim") Density_Corrected_Dissim = DeprecationHelper(DensityCorrectedDissim, message=msg)