import numpy as np
import matplotlib.pyplot as pl
import tensorflow as tf
import tensorflow.keras.optimizers as optimizers
from model import *
from input import NetworkInput
from data import createFeaturesDescription
from config import *
from tensorflow.keras.models import Model
from datetime import datetime
from IPython.display import Audio
import soundfile as sf
import os
import json
import csv
import itertools
from tensorboard.plugins import projector
from sklearn import cluster
from scipy.cluster.hierarchy import dendrogram
run_with_papermill=False
batch_size=16 #16 #512
stride=2
use_F0=False
F0_binary_values=False
use_deltas=False
hidden_size=128
num_layers=2
weights_filename='./logdir/2020-11-20_18-39-24-2_150_multiple_dropout/weights/weights_2020-11-20_18-39-24.h5'
weights_name='2_150_multiple_dropout'
useExScores=False
useRamus=False
use1dScores=False
useBalancedDataSet=False
useTest=False
balanced_dataset_folder='balanced_20_1'
evaluate_model=False
save_activations=False #save activations to json files #mode ALL_CELLS or SELECTED_CELLS defined later
save_embeddings=True #save embeddings as checkpoint file (for tensorboard)
do_similarity_analyses=True
compute_confusion_matrix=False
embeddings_Hellinger=True #embeddings based on sqrt of output
force_dropout=True #for embeddings, similarity analyses
force_dropout_conf_matrix=False #force dropout for confusion matrix
#For the example outputs below, set these values manually
keep_prob=1
keep_prob_recurrent=1
keep_prob_dense_layer=1
#note: other useful parameters appear later in the code, e.g. max_batchs for the analyses
max_examples_language=120 #max examples per language for t-SNE embeddings
do_pairwise_analysis=False #pairwise analysis within each batch
# Parameters (cell injected by papermill)
run_with_papermill = True
hidden_size = 180
use_deltas = True
use_F0 = True
F0_binary_values = False
useBalancedDataSet = False
keep_prob = 0.8
keep_prob_recurrent = 1
keep_prob_dense_layer = 1
weights_filename = "./logdir/2020-12-02_12-27-00-2_180_w_deltas_dropout_20_w_F0/weights/weights_2020-12-02_12-27-00.h5"
weights_name = "weights_2_180_F0"
compute_confusion_matrix = True
save_embeddings = False
do_similarity_analyses = False
useTest = True
if not(run_with_papermill): #manual settings
force_dropout=True
keep_prob=0.7
keep_prob_dense_layer=0.9
keep_prob_recurrent=0.9
useBalancedDataSet=True
#useRamus=True
use_deltas=True
hidden_size=150
evaluate_model=False
save_embeddings=False
save_activations=True
do_similarity_analyses=False
compute_confusion_matrix=False
weights_config={
'num_steps':32,
'features_description':createFeaturesDescription(F0=use_F0),
'stride':stride
}
load_weights=True
#old models (old inputs)
#1D models
#weights_filename="./models/weights0420/weights20epochs.h5"
#weights_filename, weights_name="./models/weights0420b/weights_2020-04-07_17-55-05.h5", "weights_2020-04-07"
#weights_config['features_description']=createFeaturesDescription(HRmsValue=False, F0=False)
#2D models
#weights_filename, weights_name='./models/weights0715-2d-60Hz/weights_2020-07-15_12-15-21.h5', 'weights0715-2d-60Hz'
#weights_filename, weights_name='./models/weights0717-2d-60Hz-64/weights_2020-07-17_17-35-40.h5', 'weights0717-2d-60Hz-64'
#weights_config['features_description']=createFeaturesDescription(F0=False)
#weights_config['stride']=1
#weights_filename, weights_name="./models/weights0710-3d/weights_2020-07-03_17-40-31.h5", "weights0710-3d"
num_steps=weights_config["num_steps"]
features_description=weights_config['features_description']
stride=weights_config['stride']
if not(force_dropout or force_dropout_conf_matrix): #disable dropout for these analyses if it is not forced anywhere
keep_prob_dense_layer=1
keep_prob=1
config=Config(batch_size, num_steps, hidden_size=hidden_size,
num_layers=num_layers,
keep_prob=keep_prob, keep_prob_recurrent=keep_prob_recurrent,
keep_prob_dense_layer=keep_prob_dense_layer)
config=completedConfig(config) #take default params for unspecified params
WARNING:root:No cell type specified in config: using LSTM
languages = ["Danish", "Dutch", "English", "Finnish",
"French", "German", "Hungarian", "Italian",
"Japanese", "Korean", "Mandarin", "Polish",
"Portuguese", "Russian", "Spanish",
"Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
"Basque", "Catalan"] #NB: check that the order of elements is consistent with model
#Remove languages with not enough data
languages.remove("Czech")
languages.remove("Romanian")
#languages_dataset=languages+['Romanian'] #None -> autodetect, languages-> same as model (defined above)
languages_dataset=None
#scores folder (default: "./Scores")
assert sum([useExScores, useRamus, use1dScores, useBalancedDataSet, useTest])==1, "choose a unique dataset"
if useExScores:
scores_folder='./ex_Scores'
elif useRamus:
scores_folder='./Scores_Ramus'
elif use1dScores:
scores_folder='./Scores_1d'
elif useTest or useBalancedDataSet:
scores_folder='./Scores'
max_files_evaluation= 2024 #np.inf
# FIRST VERSION
#languages = ['Danish', 'Russian', 'Mandarin', 'Finnish', 'Dutch', 'English', 'Hungarian', 'Swedish',
# 'Italian', 'French', 'Japanese', 'German', 'Portuguese', 'Polish', 'Spanish', 'Korean']
sets ={}
set_folds=[0]
if useExScores or useRamus:
sets_folds={"test":[0]}
elif useBalancedDataSet:
sets_folds={"test":[0]} #subfolder defined later
else:
sets_folds = {"train" : [0, 1, 2],
"test":[3,4],
"test1" : [3],
"test2" : [4]}
initial_sample_length=3*2**14 if useRamus else 10*2**14
TFRecords_batch_size=1 if useRamus else 16
set_name='train'
if set_name in sets_folds:
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder,
subfolder=subfolders,
stride=stride, verbose=True, for_evaluation=True,
languages=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #TRAINING SET BUT FOR EVALUATION
set_name='test'
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder, for_evaluation=True,
subfolder=subfolders,
stride=stride, verbose=True,
languages=languages, languages_model=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #autodetect languages
'''
sets_folds = {"train" : [0, 1, 2],
"test1" : [3],
"test2" : [4]}
sets ={}
sets_folds={"train":sets_folds["train"]}
for set_name, set_folds in sets_folds.items():
print("{} : folds {}".format(set_name, set_folds))
sets[set_name] = NetworkInput(config, folder='./Scores',
subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],
stride=stride, verbose=True,
languages=languages, name=set_name)
'''
WARNING ; some languages in the dataset are not considered by the model
DATASET train - for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
    sampling frequency of inputs : 31.25 Hz
    sample length : 320 (initial sample length : 163840, step : 256, stride : 2)
    sample duration : 10.24 s
    batch size : 16
    num slices by example: 10 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21): 0: Danish, 1: Dutch, 2: English, 3: Finnish, 4: French, 5: German, 6: Hungarian, 7: Italian, 8: Japanese, 9: Korean, 10: Mandarin, 11: Polish, 12: Portuguese, 13: Russian, 14: Spanish, 15: Swedish, 16: Turkish, 17: Estonian, 18: Arabic, 19: Basque, 20: Catalan
(Sub)folders: ['fold_0/', 'fold_1/', 'fold_2/']
Total number of examples - train - : 410880 (25680 batchs)
Per language : Danish : 5216 (1.27 %), Dutch : 8400 (2.04 %), English : 61936 (15.07 %), Finnish : 7248 (1.76 %), French : 52096 (12.68 %), German : 69824 (16.99 %), Hungarian : 2352 (0.57 %), Italian : 28064 (6.83 %), Japanese : 2720 (0.66 %), Korean : 10192 (2.48 %), Mandarin : 11952 (2.91 %), Polish : 5456 (1.33 %), Portuguese : 11888 (2.89 %), Russian : 12048 (2.93 %), Spanish : 40864 (9.95 %), Swedish : 8976 (2.18 %), Turkish : 3920 (0.95 %), Estonian : 2752 (0.67 %), Arabic : 4160 (1.01 %), Basque : 14864 (3.62 %), Catalan : 45952 (11.18 %)
input depth (nb features) : 3 x2 (using deltas) = 6

WARNING ; some languages in the dataset are not considered by the model
DATASET test - for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
    sampling frequency of inputs : 31.25 Hz
    sample length : 320 (initial sample length : 163840, step : 256, stride : 2)
    sample duration : 10.24 s
    batch size : 16
    num slices by example: 10 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21): 0: Danish, 1: Dutch, 2: English, 3: Finnish, 4: French, 5: German, 6: Hungarian, 7: Italian, 8: Japanese, 9: Korean, 10: Mandarin, 11: Polish, 12: Portuguese, 13: Russian, 14: Spanish, 15: Swedish, 16: Turkish, 17: Estonian, 18: Arabic, 19: Basque, 20: Catalan
(Sub)folders: ['fold_3/', 'fold_4/']
Total number of examples - test - : 34336 (2146 batchs)
Per language : Danish : 832 (2.42 %), Dutch : 2240 (6.52 %), English : 2576 (7.50 %), Finnish : 1072 (3.12 %), French : 2720 (7.92 %), German : 2544 (7.41 %), Hungarian : 672 (1.96 %), Italian : 2352 (6.85 %), Japanese : 1136 (3.31 %), Korean : 704 (2.05 %), Mandarin : 2048 (5.96 %), Polish : 1376 (4.01 %), Portuguese : 2224 (6.48 %), Russian : 2832 (8.25 %), Spanish : 2720 (7.92 %), Swedish : 1344 (3.91 %), Turkish : 1456 (4.24 %), Estonian : 768 (2.24 %), Arabic : 1360 (3.96 %), Basque : 736 (2.14 %), Catalan : 624 (1.82 %)
input depth (nb features) : 3 x2 (using deltas) = 6
inds_lang_test=list(np.flatnonzero(sets['test'].frequencies))
filter_lang_test=np.array(sets['test'].frequencies)>0
languages_test= [languages[i] for i in inds_lang_test]
networkInput=sets["test"]
model=build_model(config, networkInput, return_state=True) #return_state will be useful to retrieve cell states
model.summary()
Model: "model" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(16, 32, 6)] 0 __________________________________________________________________________________________________ lstm_1 (LSTM) [(16, 32, 180), (16, 134640 input_1[0][0] __________________________________________________________________________________________________ lstm_2 (LSTM) [(16, 32, 180), (16, 259920 lstm_1[0][0] __________________________________________________________________________________________________ dropout (Dropout) (16, 32, 180) 0 lstm_2[0][0] __________________________________________________________________________________________________ time_distributed (TimeDistribut (16, 32, 21) 3801 dropout[0][0] __________________________________________________________________________________________________ input_2 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ input_3 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ softmax (Softmax) (16, 32, 21) 0 time_distributed[0][0] ================================================================================================== Total params: 398,361 Trainable params: 398,361 Non-trainable params: 0 __________________________________________________________________________________________________
if load_weights:
model.load_weights(weights_filename)
#METRICS
#acc_end_seq=AccuracyStateless(networkInput, includeSampleWeights=False)
acc_slices=[AccuracyStateless(networkInput, ind_batch_compute=k) for k in range(networkInput.num_slices_by_example)]
top3_slices=[TopKAccuracyStateless(networkInput, k=3, ind_batch_compute=j) for j in range(networkInput.num_slices_by_example)]
metricsList=[#accuracy_on_last_step, top_k_accuracy_on_last_step_partial(k=3)
KL_div_on_last_step, cross_entropy_on_last_step]
metricsList+=acc_slices
metricsList+=top3_slices
KLLoss=tf.keras.losses.KLDivergence()
model.compile(loss=KLLoss, metrics=metricsList)
if evaluate_model:#EVALUATION
true_nb_batches=networkInput.nbr_batchs*networkInput.num_slices_by_example
max_nb_batches=max_files_evaluation/config.batch_size*networkInput.num_slices_by_example
nb_steps=np.minimum(max_nb_batches, true_nb_batches)
forgetStates=Forget_states_callback(networkInput, model, verbose=False)
callbacksList=[forgetStates]
metrics_end=model.evaluate(networkInput.sliced_batch, verbose=1, steps=nb_steps,callbacks=callbacksList)
def print_top5(st, y, y_):
ind0=np.argmax(y)
ind=np.argsort(-y_)
print(f"{st}\nlanguage: {languages[ind0]}")
st=" "
for k in range(7):
st+=f"{k+1}: {languages[ind[k]]}, "
st+='\n'
#for k, lang in enumerate(languages):
# print(lang)
# print(y_[k].numpy())
print(st)
def gen_yy_(return_all_outputs=False):
'''return_all_outputs: if True, returns all the outputs associated with each example (at every slice and every step)'''
model.reset_states()
batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example)
#option 1, use y_=model(x, training=False)
#option 2 (first axis has size batch_size x steps)
#predictions=model.predict(batch, steps = networkInput.num_slices_by_example)
outputs=[]
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) #, training=False
outputs.append(y_)
y=y.numpy()[:,-1]
y_=y_[:,-1]
res=[]
for k in range(networkInput.config.batch_size):
if return_all_outputs:
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], [outputs_2[k] for outputs_2 in outputs] ))
else:
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], y_[k]))
return res
for i in range(min(5, 10//batch_size*5+1)):
batch_yy_ = gen_yy_()
for k in range( min(5, batch_size)):
print_top5(*batch_yy_[k])
Creating 'look-up tables' for filenames
fold_3_WLI_file_13_arb-834dbc7cccc2d734cc1622db730ce12b_10
language: Arabic
    1: Arabic, 2: Spanish, 3: Polish, 4: French, 5: Italian, 6: Swedish, 7: Finnish,

fold_3_CommonVoice_261bca862597269_261bca862597269_slice34
language: Portuguese
    1: Portuguese, 2: Italian, 3: Russian, 4: French, 5: Spanish, 6: German, 7: Turkish,

fold_4_librivox_reader10013_reader10013_9951_sec7_3
language: Spanish
    1: Japanese, 2: Spanish, 3: Russian, 4: Portuguese, 5: Korean, 6: Dutch, 7: Italian,

fold_4_CommonVoice_18140a4d5e88199_18140a4d5e88199_slice62
language: Dutch
    1: German, 2: Dutch, 3: English, 4: Catalan, 5: Russian, 6: Italian, 7: Spanish,

fold_3_WLI_file_19_cmn-9f04e2093c526615594e690d8e4bd3e6_80
language: Mandarin
    1: Danish, 2: Swedish, 3: Arabic, 4: English, 5: Korean, 6: Portuguese, 7: Spanish,
Test audio
def cell_st(cell_type):
if cell_type==LSTM_CELL:
return "lstm"
elif cell_type==GRU_CELL:
return "gru"
nb_batchs_iter=networkInput.nbr_batchs if save_activations else 1 #number of batches (before slicing) to iterate over
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
dic_list=[{} for i in range(nb_batchs_iter*config.batch_size)]
i_batch=0
#specific activations
selected_cells={'lstm_2': {
'cell_states': [3, 4, 92, 115, 116, 121],
'outputs': [3, 4, 92, 115, 116, 121]
},
'lstm_1':{
'outputs': [],
'cell_states': []
}
}
#mode, save all cells or selected cells
SELECTED_CELLS=1
ALL_CELLS=0
mode=ALL_CELLS
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
for l in range(config.num_layers): #NB: very inefficient because the network activations are computed several times
layerName=f'{cell_st(config.cell_type)}_{l+1}'
if l==0: #also add output scores
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
filename=filenames[i][0].numpy().decode('utf-8')
if useRamus: #HACK
filename="_".join(filename.split("_")[2:])
dic_list[ind_batch]['filename']=filename
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
dic_list[ind_batch]['label']=languages[ind0]
dic_list[ind_batch]['predicted']=languages[ind]
dic_list[ind_batch]['activations']={}
if mode==ALL_CELLS:
dic_list[ind_batch]['scores']={}
for j, lang in enumerate(languages):
dic_list[ind_batch]['scores'][lang]=str(y_[i][j])
modelBis = Model(inputs=model.input, outputs=model.get_layer(layerName).output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
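            # the LSTM layers are built with return_sequences=True and return_state=True (cf. build_model above),
            # so the sub-model returns (full output sequence, last hidden state h, last cell state c)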
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
dic_list[ind_batch]['activations'][layerName]={}
if mode==SELECTED_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']={}
for j in selected_cells[layerName]['outputs']:
dic_list[ind_batch]['activations'][layerName]['outputs'][str(j)]=str(act_h.numpy()[i][j])
dic_list[ind_batch]['activations'][layerName]['cell_states']={}
for j in selected_cells[layerName]['cell_states']:
dic_list[ind_batch]['activations'][layerName]['cell_states'][str(j)]=str(act_c.numpy()[i][j])
elif mode==ALL_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']=[str(x) for x in act_h.numpy()[i]]
dic_list[ind_batch]['activations'][layerName]['cell_states']=[str(x) for x in act_c.numpy()[i]]
i_batch+=1
#save activations to files
if save_activations:
for i in range(nb_batchs_iter*batch_size):
dic=dic_list[i]
filename=dic['filename']
mode_text = 'ALL' if mode==ALL_CELLS else 'SELECTED'
jsonFolder=f'./activations/{scores_folder}/{weights_name}_{mode_text}/'
os.makedirs(jsonFolder, exist_ok=True)
jsonFilename=f'{jsonFolder}{filename}.json'
with open(jsonFilename, 'w') as f:
json.dump(dic, f, indent=4)
if force_dropout:
dropout_flag=f'dropout_{int(round(100*(1-keep_prob_dense_layer)))}'
else:
dropout_flag='no_dropout'
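#note: with the square root of the softmax outputs, Euclidean distances between embeddings are
#(up to a factor 1/sqrt(2)) Hellinger distances between the predicted language distributions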
if embeddings_Hellinger:
Hell_flag='_Hellinger'
else:
Hell_flag=''
embFolder=f'./embeddings/{scores_folder}/{weights_name}/{dropout_flag}{Hell_flag}/'
TRUE_LANGUAGE=0
PREDICTED_LANGUAGE=1
mode_label = PREDICTED_LANGUAGE #both label files are written either way, but this choice sets which label the per-language cap on examples applies to
max_nb_batchs_iter=np.inf
if save_embeddings:
if not os.path.exists(embFolder):
os.makedirs(embFolder)
proj_config = projector.ProjectorConfig()
#proj_config.model_checkpoint_path = embeddings_ckpt_name
#TODO diff tensors with diff. datasets?
embeddings = proj_config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embeddings.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
if mode_label == TRUE_LANGUAGE:
embeddings.metadata_path = 'labels_true.tsv'
elif mode_label == PREDICTED_LANGUAGE:
embeddings.metadata_path = 'labels_predicted.tsv'
projector.visualize_embeddings(embFolder, proj_config)
# save checkpoint/metadata
if save_embeddings:
st_info=''
nb_batchs_iter=min(networkInput.nbr_batchs, max_nb_batchs_iter)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
languages_true_list=[]
languages_predicted_list=[]
count_lang=dict([(lang, 0) for lang in languages])
scores=[]
mean_act=np.zeros(len(languages)) #for normalization purposes if needed
i_batch=0
while(i_batch<nb_batchs_iter):
above_thr=[count_lang[lang]>=max_examples_language for lang in languages]
if all(above_thr):
break
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
#mean_act=np.sum(y_, axis=0)
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
if mode_label == TRUE_LANGUAGE:
if count_lang[lang0]>=max_examples_language:
continue
count_lang[lang0] += 1
if count_lang[lang0] == max_examples_language:
st_info+=f'{lang0}: max examples reached at seq {ind_batch}\n'
print(f'{lang0}: max examples reached at seq {ind_batch}')
elif mode_label == PREDICTED_LANGUAGE:
if count_lang[lang]>=max_examples_language:
continue
count_lang[lang] += 1
if count_lang[lang] == max_examples_language:
st_info+=f'{lang}: max examples reached at seq {ind_batch}\n'
print(f'{lang}: max examples reached at seq {ind_batch}')
languages_true_list.append(lang0)
languages_predicted_list.append(lang)
if embeddings_Hellinger:
scores.append(np.sqrt(y_[i]))
else:
scores.append(y_[i])
i_batch+=1
#mean_act/=i_batch
#mean_act_copy=np.copy(mean_act)
scores_arr=np.stack(scores)
#checkpoint
checkpoint = tf.train.Checkpoint(embedding=tf.Variable(scores_arr))
checkpoint.save(os.path.join(embFolder, "embeddings.ckpt"))
#metadata
with open(os.path.join(embFolder, 'labels_true.tsv'), "w") as f:
for lang in languages_true_list:
f.write("{}\n".format(lang))
with open(os.path.join(embFolder, 'labels_predicted.tsv'), "w") as f:
for lang in languages_predicted_list:
f.write("{}\n".format(lang))
#save all data to csv
with open(os.path.join(embFolder, 'data.csv'), "w") as f:
csvWriter=csv.writer(f, delimiter='\t')
csvWriter.writerow(['label_true', 'label_predicted']+languages)
for i in range(len(languages_true_list)):
row=[languages_true_list[i], languages_predicted_list[i]]
row+=list(scores[i])
csvWriter.writerow(row)
with open(f'{embFolder}/info.txt', 'w') as f:
f.write(st_info)
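Once the checkpoint, projector config and label files are written, the embeddings can be explored interactively in TensorBoard's Projector tab by pointing TensorBoard at the embeddings folder (e.g. `tensorboard --logdir ./embeddings/...`), which picks up the projector config and metadata saved above.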
max_batchs=10000//batch_size #limit analysis to a certain number of batches #np.inf if no limitation
Correlation matrix / histogram of activations
if do_similarity_analyses:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
    corr_matrix=np.zeros((len(languages), len(languages))) #uncentered correlation (cosine similarity) of output activations
hist_act=np.zeros((nb_batchs_iter*batch_size, len(languages))) #non normalized activation scores
hist_act_true_lang=[]
hist_act_predicted_lang=[]
if do_pairwise_analysis:
pairwise_corr=np.zeros((len(languages), len(languages)))
pairwise_count=np.zeros((len(languages), len(languages)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
hist_act_true_lang.append(lang0)
hist_act_predicted_lang.append(lang)
corr_matrix+= np.sum(np.expand_dims(y_, 1)*np.expand_dims(y_, 2), axis=0)
#for i in range(batch_size):
# corr_matrix+=np.outer(y_[i], y_[i])
hist_act[i_batch*batch_size:(i_batch+1)*batch_size]=y_
if do_pairwise_analysis:
modelBis = Model(inputs=model.input, outputs=model.get_layer('lstm_2').output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind0=np.argmax(y[i])
for j in range(i+1, batch_size):
ind0bis=np.argmax(y[j])
ii=ind0
jj=ind0bis
corr=np.sum(act_h[i]*act_h[j])
pairwise_corr[ii][jj]+=corr
pairwise_corr[jj][ii]+=corr
pairwise_count[ii][jj]+=1
pairwise_count[jj][ii]+=1
i_batch+=1
corr_matrix/=i_batch
dev=np.sqrt(corr_matrix.diagonal())
corr_matrix/=np.outer(dev, dev) #normalization by deviations
hist_act/=np.sum(hist_act, axis=0)
if do_pairwise_analysis:
pairwise_corr/=(pairwise_count+1e-4)
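For reference, after the normalization above, `corr_matrix[i][j]` is the uncentered correlation (cosine similarity) of the output activations of languages $i$ and $j$ across the analysed examples (last time step of each slice):

$$\mathrm{corr}_{ij} \;=\; \frac{\sum_n y_n(i)\, y_n(j)}{\sqrt{\sum_n y_n(i)^2}\;\sqrt{\sum_n y_n(j)^2}},$$

where $y_n$ denotes the softmax output for example $n$.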
#proximity measures based on activation histograms
if do_similarity_analyses:
prox_d_kl=np.zeros((len(languages), len(languages)))
prox_d_jensen=np.zeros((len(languages), len(languages)))
prox_d_hell=np.zeros((len(languages), len(languages)))
prox_d_bhat=np.zeros((len(languages), len(languages)))
for i in range(len(languages)):
for j in range(len(languages)):
p_i=hist_act[:, i]+1e-8
p_j=hist_act[:, j]+1e-8
p_mean=(p_i+p_j)/2
prox_d_kl[i][j]=np.sum(p_i*np.log2(p_i/p_j))
prox_d_kl[j][i]=np.sum(p_j*np.log2(p_j/p_i))
prox_d_jensen[i][j]=0.5*(np.sum(p_i*np.log2(p_i/p_mean))+np.sum(p_j*np.log2(p_j/p_mean)))
prox_d_jensen[j][i]=prox_d_jensen[i][j]
prox_d_hell[i][j]=np.sqrt(np.sum((np.sqrt(p_i)-np.sqrt(p_j))**2))
prox_d_hell[j][i]=prox_d_hell[i][j]
prox_d_bhat[i][j]=-2*np.log2(np.sum(np.sqrt(p_i*p_j)))
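Each column of `hist_act` (one language) has been normalized to sum to one and is treated as a distribution over the analysed examples; $p_i$ below denotes the smoothed column for language $i$. The four dissimilarity measures computed above are, as implemented (base-2 logarithms, and no $1/\sqrt{2}$ factor in the Hellinger-type distance):

$$D_{KL}(p_i \,\|\, p_j) = \sum_x p_i(x)\log_2\frac{p_i(x)}{p_j(x)}, \qquad D_{JS}(p_i, p_j) = \tfrac12\Big(D_{KL}(p_i \,\|\, m) + D_{KL}(p_j \,\|\, m)\Big),\ m=\tfrac{p_i+p_j}{2},$$

$$D_{H}(p_i, p_j) = \sqrt{\sum_x \big(\sqrt{p_i(x)}-\sqrt{p_j(x)}\big)^2}, \qquad D_{B}(p_i, p_j) = -2\log_2\sum_x \sqrt{p_i(x)\,p_j(x)}.$$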
def plot_distance_matrix(dm, classes, normalize=False, title='Distance matrix', cmap=pl.cm.Blues, vmin=0, vmax=4, invert_colors=False):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
dm = dm.astype('float') / (0.00001+dm.sum(axis=1)[:, np.newaxis])
dm = np.round(dm*100, decimals=2)
#print(cm)
#pl.imshow(-np.log2(dm), interpolation='nearest', cmap=cmap, vmax=-3,vmin=-5)
#pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmax=-8,vmin=-17)
if not(invert_colors):
pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmin=-vmax, vmax=-vmin)
else:
pl.imshow(dm, interpolation='nearest', cmap=cmap, vmin=vmin, vmax=vmax)
pl.title(title)
#pl.colorbar()
tick_marks = np.arange(len(classes))
pl.xticks(tick_marks, classes, rotation=45)
pl.yticks(tick_marks, classes)
thresh = dm.max()*(1-2*invert_colors) / 4.
for i, j in itertools.product(range(dm.shape[0]), range(dm.shape[1])):
pl.text(j, i, int(dm[i,j]*100)*1./100,
horizontalalignment="center",
color="white" if dm[i, j]*(1-2*invert_colors) < thresh else "black")
#pl.tight_layout()
pl.ylabel('Label1 (reference)')
pl.xlabel('Label2')
def permut_mat(mat, permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]):
conf_matrix=mat
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
return languages_permut, conf_matrix_permut
if do_similarity_analyses:
#pl.figure(figsize=(10, 10))
#plot_distance_matrix(corr_matrix, languages, title='Correlation matrix', vmin=0.05, vmax=0.5)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_hell_permut=permut_mat(prox_d_hell)
plot_distance_matrix(prox_d_hell_permut, languages_permut, title='Dissimilarity matrix (Hellinger distance)', vmin=0.8, vmax=1.3)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_kl_permut=permut_mat(prox_d_kl)
plot_distance_matrix(prox_d_kl_permut, languages_permut, title='Dissimilarity matrix (KL div)', vmin=3, vmax=15)
pl.figure(figsize=(10, 10))
plot_distance_matrix( (prox_d_kl_permut-prox_d_kl_permut.T)/(prox_d_kl_permut+1e-3)*100, languages_permut, title='KL div, diff transpose', vmin=-20, vmax=20, invert_colors=True)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_bhat_permut=permut_mat(prox_d_bhat)
plot_distance_matrix(prox_d_bhat_permut, languages_permut, title='Dissimilarity matrix (Bhattacharyya distance)', vmin=0, vmax=8)
Multidimensional scaling (metric MDS; a non-metric variant is commented out below)
See http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
import itertools
from sklearn import cluster
from sklearn import manifold
from sklearn.metrics import pairwise_distances
mds = manifold.MDS(n_components=2, metric=True,
verbose=0, dissimilarity='precomputed', n_init=10)
#NON METRIC
#mds = manifold.MDS(n_components=2, metric=False,
# n_init=30, max_iter=300,
# verbose=0, eps=0.001, dissimilarity='precomputed')
if do_similarity_analyses:
dist_measure_str = "Hellinger distance"
dm=prox_d_hell
dist_measure_str = "(symm.) KL divergence"
dm=prox_d_kl
languages2=languages
'''
print('not shown: Hungarian, Finnish, (and Polish?)')
#HACK delete hungarian
dm=np.delete(dm, 6, axis=0)
dm=np.delete(dm, 6, axis=1)
languages2=languages[0:6]+languages[7::]
#HACK delete Finnish
dm=np.delete(dm, 3, axis=0)
dm=np.delete(dm, 3, axis=1)
languages2=languages2[0:3]+languages2[4::]
#HACK delete Polish
dm=np.delete(dm, 9, axis=0)
dm=np.delete(dm, 9, axis=1)
languages2=languages2[0:9]+languages2[10::]
'''
coord_pts = mds.fit_transform((dm.T+dm)/2) #symmetrize if necessary
delta = 0.01
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(languages2):
ax.annotate(txt, coord_pts[i]+(delta, delta))
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
stress=np.sqrt(mds.stress_/(np.sum(dm**2)/2))
dm2=pairwise_distances(coord_pts)
stress2=np.sqrt(np.sum((dm2-dm)**2)/np.sum(dm**2))
    print(f'stress (normalized, from mds.stress_) : {stress}')
    print(f'stress (recomputed from the 2-D coordinates) : {stress2}')
for i in range(len(languages2)):
act_diss=dm[i]
diff=(dm2[i]-dm[i])
lang=languages2[i]
print(f'{lang.rjust(10)}\t actual dissimilarity: {np.sum(act_diss):.3f} \t difference: {np.sum(np.abs(diff)):.3f} \t percent diff.: {np.sum(np.abs(diff))/np.sum(act_diss)*100:.2f} %')
'''
coord_pts = mds.fit_transform((dm_selected_modified.T+dm_selected_modified)/2)
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(selected_languages):
ax.annotate(txt, coord_pts[i]+(delta, delta))
dist_measure_str = "KL divergence" if dist_measure == D_KL else "Hellinger distance"
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
'''
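The second stress value printed above is Kruskal's stress-1 computed directly from the 2-D configuration,

$$\text{stress} = \sqrt{\frac{\sum_{i,j}\big(\hat d_{ij} - d_{ij}\big)^2}{\sum_{i,j} d_{ij}^2}},$$

where $d_{ij}$ are the input dissimilarities and $\hat d_{ij}$ the Euclidean distances between the MDS coordinates; the first value is the same quantity estimated from `mds.stress_`.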
Note the 'hole' in the MDS map: Japanese, Mandarin, Korean, Polish
if do_similarity_analyses:
pl.imshow(-dm2)
pl.colorbar()
pl.figure()
pl.imshow(-dm)
pl.colorbar()
Some clustering
delta=10.
n_clusters = 6
spec_clustering = cluster.SpectralClustering(n_clusters=n_clusters,
affinity="precomputed")
if do_similarity_analyses:
dm=(prox_d_kl+prox_d_kl.T)/2
similarity_matrix=np.exp(- dm**2 / (2. * delta ** 2))
clusters = spec_clustering.fit_predict(similarity_matrix)
print("All languages : ")
for i in range(n_clusters):
cluster_labels = [languages[j] for j in np.where(clusters == i)[0]]
print("> cluster {} : {} \n".format(i, cluster_labels))
plt=pl
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_,
counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
if do_similarity_analyses:
# Ward based on hist_act
# setting distance_threshold=0 ensures we compute the full tree.
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None) #affinity='precomputed'
#XXX hist_act or sqrt hist_act ??
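    #note: with sqrt(hist_act), the Euclidean distances used by Ward linkage between language profiles
    #match (up to a constant) the Hellinger-type distance computed above, hence the axis label below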
model_clustering = model_clustering.fit(np.sqrt(hist_act.T))
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Ward + Hellinger distance")
plt.show()
#DM #try different linkage methods
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=(prox_d_kl+prox_d_kl.T)/2
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on symmetrized D_KL ")
plt.show()
#Jensen-Shannon + complete linkage
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_jensen
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Jensen-Shannon divergence ")
plt.show()
# Bhattacharyya distance
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_bhat
#dm+=16*prox_d_hell
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Bhattacharyya distance ")
plt.show()
max_batchs=np.inf #no limit; set to a finite value to restrict the number of batches
if compute_confusion_matrix:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
#lang_true=[]
#lang_predicted=[]
conf_matrix=np.zeros((len(languages), len(languages)))
conf_matrix_filtered=np.zeros((len(languages_test), len(languages_test)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout_conf_matrix) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for j in range(batch_size):
ind0=np.argmax(y[j])
ind=np.argmax(y_[j])
lang0=languages[ind0]
lang=languages[ind]
#lang_true.append(lang0)
#lang_predicted.append(lang)
conf_matrix[ind0][ind]+=1
ind0bis=np.argmax(y[j][inds_lang_test])
indbis=np.argmax(y_[j][inds_lang_test])
conf_matrix_filtered[ind0bis][indbis]+=1
i_batch+=1
if compute_confusion_matrix:
#plot_distance_matrix(conf_matrix, languages, vmin=0, vmax=20, normalize=True, invert_colors=True)
permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
if useRamus:
plot_distance_matrix(conf_matrix_filtered, languages_test, vmin=0, vmax=5, invert_colors=True,title='Confusion matrix')
permut=[0,1,2,5,3,6,7, 4]
conf_matrix_filtered_permut=np.zeros_like(conf_matrix_filtered)
for i, ind in enumerate(permut):
conf_matrix_filtered_permut[i]=conf_matrix_filtered[ind][permut]
languages_test_permut=[languages_test[ind] for ind in permut]
pl.figure(figsize=(6,6))
pl.ylim([7.5, -0.5])
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , vmin=0, vmax=5, invert_colors=True, title='Confusion matrix')
pl.figure(figsize=(6,6))
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , normalize=True, vmin=0, vmax=50, invert_colors=True, title='Confusion matrix')
pl.ylim([7.5, -0.5])
else:
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut , normalize=True, vmin=0, vmax=10, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut ,normalize=True, vmin=0, vmax=20,
cmap=pl.cm.Purples, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
#pl.savefig('conf_matrix.svg')