In [64]:
import numpy as np
import pandas as pd
import os

from sklearn import svm
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score
from sklearn import preprocessing
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression

from exponential_dp import exponential_mechanism, save_dict

## Load and process dataset

In [66]:
# Root folder that holds every exported variant of the datasets.
dataset_dir = "../data_out/"

# Dataset and feature set this notebook operates on.
dataset_name, feature = "MPIIDPEye", "Pami_Features_52"

# Input folder with the cleaned data: <dataset_dir><dataset_name>_clean
dataset_path = "{}{}_clean".format(dataset_dir, dataset_name)

In [67]:
# Collect the files to load from the cleaned dataset folder:
#   - keep only csv files
#   - keep only files that have the feature string within their name
# os.listdir returns entries in arbitrary order, so the result is sorted to
# make the load order (and with it the row order of everything built from
# these files) reproducible from run to run.
feature_files = sorted(
    f for f in os.listdir(dataset_path)
    if f.endswith('csv') and feature in f
)

## Load Dataset

In [68]:
#keys are image ids, values are a list of np arrays (one array per loaded file)
#each array has the subject id in column 0 followed by the feature columns
data_dict = {}

#loop over files, loading csv files and adding them into the dictionary
for f in feature_files:
    file_path = os.path.join(dataset_path,f)

    #parse by splitting on '_': field 1 carries the subject id and field 2 the
    #image id, each behind a one-character prefix that is stripped off
    f_split = f.split('_')
    curr_sub = int(f_split[1][1:])
    curr_img = int(f_split[2][1:])

    file_data = np.genfromtxt(file_path, delimiter=',')

    #nothing could be parsed from this file, so there is nothing to add
    if file_data.size == 0:
        continue

    #a one-row file is returned as a 1-D array; promote it to 2-D *before* any
    #axis=1 operation, otherwise the column delete below raises on such files
    file_data = np.atleast_2d(file_data)

    #columns to discard as zeros per Inken
    if dataset_name == "MPIIDPEye":
        file_data = np.delete(file_data,[30,36,42,48],axis=1)

    #kept at notebook level; num_features is read by later cells
    num_rows = file_data.shape[0]
    num_features = file_data.shape[1]

    #add curr_sub to the data first column for later segmentation
    file_data = np.insert(file_data,0,curr_sub,axis=1)

    #remove any rows that contain a nan
    file_data = file_data[~np.isnan(file_data).any(axis=1)]

    data_dict.setdefault(curr_img,[]).append(file_data)
    

0

## Get deltas from feature ranges

In [69]:
#running per-feature extrema over the whole dataset, shape (1,num_features)
#the max starts at -inf (not 0) so that a feature whose values are all
#negative still gets its true maximum, and therefore its true range
feature_max = np.full((1,num_features),-np.inf)
feature_min = np.full((1,num_features),np.inf)

for img in data_dict.keys():
    #list of np arrays from each subject for this image concatenated into one array
    curr_img_data = np.concatenate(data_dict[img])

    #drop the subject id column, leaving only the features
    curr_feature_data = curr_img_data[:,1:]

    #an image with no rows has no extrema to contribute (and np.max/np.min
    #would raise on an empty reduction)
    if curr_feature_data.shape[0] == 0:
        continue

    #set the maxs and mins
    feature_max = np.maximum(feature_max,np.max(curr_feature_data,axis=0))
    feature_min = np.minimum(feature_min,np.min(curr_feature_data,axis=0))

#per-feature range; passed as the deltas argument of exponential_mechanism
deltas = feature_max - feature_min

array([[ 1.0000e-02,  6.2832e-03,  4.6080e+01,  3.5723e+01,  1.5002e+02,
         4.2571e-03, -1.9826e+00,  1.0000e+00, -3.3808e+05, -2.4185e+05,
         0.0000e+00,  4.2571e-01, -4.2172e+00,  1.0000e+00,  8.2276e-05,
         8.7266e-05,  5.2104e-03,  1.0946e-03,  2.4588e+02, -2.0593e+02,
        -2.2614e+02,  0.0000e+00,  1.0077e-02, -7.1980e+00,  1.0000e+00,
        -4.5213e+01, -4.5213e+01,  0.0000e+00,  1.2341e-02, -8.4824e+00,
         1.0000e+00, -1.6994e+04, -8.6054e+03,  0.0000e+00,  1.0077e+00,
        -6.0836e+00,  1.0000e+00, -4.5213e+03, -4.5213e+03,  0.0000e+00,
         9.0330e-01, -7.2392e+00,  1.0000e+00,  2.0000e-02]])

## Apply Exponential Mechanism to whole dataset

In [70]:
#keep track of t_mins and t_maxs before padding is applied, for estimating how much padded data was added
#(one entry is appended per image for every epsilon value)
t_mins = []
t_maxs = []

start_time = 0
end_time   = 0

#parameter values
w = 1
epsilon_params = [100,80,60,40,20,10,5,2,1]

for epsilon in epsilon_params:
    private_data_dict = {}

    #the same epsilon is used for every feature
    epsilons = np.repeat(epsilon,num_features)

    #get start time
    #start_time = time.time()

    #Loop through each img
    for img in data_dict.keys():
        #list of np arrays from each subject for this image concatenated into one array
        curr_img_data = np.concatenate(data_dict[img])

        #separate out the sub_id col
        sub_ids       = curr_img_data[:,0]
        curr_img_data = curr_img_data[:,1:]

        #no rows are left for this image, so there is nothing to pad or privatize
        if sub_ids.size == 0:
            continue

        #unique subjects for this image and how many rows each one has;
        #t_min / t_max are the shortest / longest per-subject row counts
        unique_sub_ids, sub_counts = np.unique(sub_ids,return_counts=True)
        t_min = int(sub_counts.min())
        t_max = int(sub_counts.max())

        t_mins.append(t_min)
        t_maxs.append(t_max)

        #pad every subject out to t_max rows by repeating its last row
        for sub_id in unique_sub_ids:
            #positions of this subject's rows; recomputed on every pass because
            #the insertions made for earlier subjects shift later rows
            sub_rows = np.flatnonzero(sub_ids == sub_id)

            num_repeats = t_max - sub_rows.size
            if num_repeats == 0:
                continue

            #get last row of this subject and the copies to add
            last_row    = curr_img_data[sub_rows[-1],:]
            repeat_rows = np.repeat([last_row],num_repeats,axis=0)
            repeat_ids  = np.repeat([sub_id],num_repeats,axis=0)

            #insert directly after the subject's last row. The row *index* is
            #needed here: indexing the boolean mask with [-1] yields True/False,
            #which would place the padding at row 1 or 2 instead
            insert_at = sub_rows[-1] + 1
            curr_img_data = np.insert(curr_img_data,insert_at,repeat_rows,axis=0)
            sub_ids       = np.insert(sub_ids,insert_at,repeat_ids,axis=0)

        #apply exponential mechanism
        #NOTE(review): presumably expects every subject to contribute t_max
        #rows - confirm against exponential_dp
        private_data = exponential_mechanism(curr_img_data,deltas,epsilons,w,t_max)

        #add the sub_ids back in as first column for later export
        private_data = np.insert(private_data,0,sub_ids,axis=1)

        #put result into a new dictionary w/ private data
        private_data_dict[img] = private_data

    #save out csv for this param
    out_dataset_path = dataset_dir+dataset_name+"_exponential_" + str(epsilon)
    save_dict(out_dataset_path,feature,private_data_dict)

    #get end time and print
    #end_time =  time.time()
    #print(end_time-start_time)
    

3.2969446182250977
2.2082934379577637
1.6634676456451416
1.9933619499206543
2.0753355026245117
1.7304766178131104
1.8174188137054443
2.473207712173462
2.0693368911743164
