import numpy as np
import matplotlib.pyplot as pl
import tensorflow as tf
import tensorflow.keras.optimizers as optimizers
from model import *
from input import NetworkInput
from data import createFeaturesDescription
from config import *
from tensorflow.keras.models import Model
from datetime import datetime
from IPython.display import Audio
import soundfile as sf
import os
import json
import csv
import itertools
from tensorboard.plugins import projector
from sklearn import cluster
from scipy.cluster.hierarchy import dendrogram
run_with_papermill=False
batch_size=16 #16 #512
stride=2
use_F0=False
F0_binary_values=False
use_deltas=False
hidden_size=128
num_layers=2
weights_filename='./logdir/2020-11-20_18-39-24-2_150_multiple_dropout/weights/weights_2020-11-20_18-39-24.h5'
weights_name='2_150_multiple_dropout'
useExScores=False
useRamus=False
use1dScores=False
useBalancedDataSet=False
balanced_dataset_folder='balanced_20_1'
evaluate_model=False
save_activations=False #save activations to json files #mode ALL_CELLS or SELECTED_CELLS defined later
save_embeddings=True #save embeddings as checkpoint file (for tensorboard)
do_similarity_analyses=True
compute_confusion_matrix=False
embeddings_Hellinger=True #embeddings based on sqrt of output
force_dropout=True #for embeddings, similarity analyses
force_dropout_conf_matrix=False #force dropout for confusion matrix
#For the example outputs, set these manually
keep_prob=1
keep_prob_recurrent=1
keep_prob_dense_layer=1
#note: other useful parameters appear later in the code, e.g. max_examples_language and max_batchs for the analyses
do_pairwise_analysis=False #pairwise analysis inside each batch
# Parameters
run_with_papermill = True
hidden_size = 150
use_deltas = True
use_F0 = True
F0_binary_values = True
useRamus = True
batch_size = 1
evaluate_model = True
save_activations = True
save_embeddings = False
compute_confusion_matrix = True
weights_filename = "./logdir/2021-08-19_14-53-14-2_150_mult_dropout_voiced_unvoiced_bis/weights/weights_2021-08-19_14-53-14.h5"
weights_name = "weights_2_150_voiced_unvoiced_bis"
if not(run_with_papermill): #manual settings
force_dropout=True
keep_prob=0.7
keep_prob_dense_layer=0.9
keep_prob_recurrent=0.9
useBalancedDataSet=True
#useRamus=True
use_deltas=True
hidden_size=150
evaluate_model=False
save_embeddings=False
save_activations=True
do_similarity_analyses=False
compute_confusion_matrix=False
weights_config={
'num_steps':32,
'features_description':createFeaturesDescription(F0=use_F0),
'stride':stride
}
load_weights=True
#old models (old inputs)
#1D models
#weights_filename="./models/weights0420/weights20epochs.h5"
#weights_filename, weights_name="./models/weights0420b/weights_2020-04-07_17-55-05.h5", "weights_2020-04-07"
#weights_config['features_description']=createFeaturesDescription(HRmsValue=False, F0=False)
#2D models
#weights_filename, weights_name='./models/weights0715-2d-60Hz/weights_2020-07-15_12-15-21.h5', 'weights0715-2d-60Hz'
#weights_filename, weights_name='./models/weights0717-2d-60Hz-64/weights_2020-07-17_17-35-40.h5', 'weights0717-2d-60Hz-64'
#weights_config['features_description']=createFeaturesDescription(F0=False)
#weights_config['stride']=1
#weights_filename, weights_name="./models/weights0710-3d/weights_2020-07-03_17-40-31.h5", "weights0710-3d"
num_steps=weights_config["num_steps"]
features_description=weights_config['features_description']
stride=weights_config['stride']
if not(force_dropout or force_dropout_conf_matrix): #no dropout forced anywhere: reset keep probabilities
keep_prob_dense_layer=1
keep_prob=1
config=Config(batch_size, num_steps, hidden_size=hidden_size,
num_layers=num_layers,
keep_prob=keep_prob, keep_prob_recurrent=keep_prob_recurrent,
keep_prob_dense_layer=keep_prob_dense_layer)
config=completedConfig(config) #take default params for unspecified params
WARNING:root:No cell type specified in config: using LSTM
languages = ["Danish", "Dutch", "English", "Finnish",
"French", "German", "Hungarian", "Italian",
"Japanese", "Korean", "Mandarin", "Polish",
"Portuguese", "Russian", "Spanish",
"Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
"Basque", "Catalan"] #NB: check that the order of elements is consistent with model
#Remove languages for which there is not enough data
languages.remove("Czech")
languages.remove("Romanian")
#languages_dataset=languages+['Romanian'] #None -> autodetect, languages-> same as model (defined above)
languages_dataset=None
#Scores folder #default: "./Scores"
assert sum([useExScores, useRamus, use1dScores, useBalancedDataSet])==1, "choose a unique dataset" #exactly one dataset flag must be set
if useExScores:
scores_folder='./ex_Scores'
elif useRamus:
scores_folder='./Scores_Ramus'
elif use1dScores:
scores_folder='./Scores_1d'
else:
scores_folder='./Scores'
max_files_evaluation= 2024 #np.inf
# FIRST VERSION
#languages = ['Danish', 'Russian', 'Mandarin', 'Finnish', 'Dutch', 'English', 'Hungarian', 'Swedish',
# 'Italian', 'French', 'Japanese', 'German', 'Portuguese', 'Polish', 'Spanish', 'Korean']
sets ={}
set_folds=[0]
if useExScores or useRamus:
sets_folds={"test":[0]}
elif useBalancedDataSet:
sets_folds={"test":[0]} #subfolder defined later
else:
sets_folds = {"train" : [0, 1, 2],
"test":[3,4],
"test1" : [3],
"test2" : [4]}
initial_sample_length=3*2**14 if useRamus else 10*2**14
TFRecords_batch_size=1 if useRamus else 16
set_name='train'
if set_name in sets_folds:
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder,
subfolder=subfolders,
stride=stride, verbose=True, for_evaluation=True,
languages=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #TRAINING SET BUT FOR EVALUATION
set_name='test'
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder, for_evaluation=True,
subfolder=subfolders,
stride=stride, verbose=True,
languages=languages, languages_model=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #autodetect languages
'''
sets_folds = {"train" : [0, 1, 2],
"test1" : [3],
"test2" : [4]}
sets ={}
sets_folds={"train":sets_folds["train"]}
for set_name, set_folds in sets_folds.items():
print("{} : folds {}".format(set_name, set_folds))
sets[set_name] = NetworkInput(config, folder='./Scores',
subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],
stride=stride, verbose=True,
languages=languages, name=set_name)
'''
WARNING ; some languages in the dataset are not considered by the model
DATASET test
for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
  sampling frequency of inputs : 31.25 Hz
  sample length : 96 (initial sample length : 49152, step : 256, stride : 2)
  sample duration : 3.07 s
  batch size : 1
  num slices by example: 3 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21)
  0: Danish   1: Dutch   2: English   3: Finnish   4: French   5: German   6: Hungarian   7: Italian
  8: Japanese   9: Korean   10: Mandarin   11: Polish   12: Portuguese   13: Russian   14: Spanish
  15: Swedish   16: Turkish   17: Estonian   18: Arabic   19: Basque   20: Catalan
(Sub)folders: ['']
Total number of examples - test - : 153 (153 batchs)
Per language :
  Danish : 0 (0.00 %)       Dutch : 19 (12.42 %)      English : 15 (9.80 %)     Finnish : 0 (0.00 %)
  French : 20 (13.07 %)     German : 0 (0.00 %)       Hungarian : 0 (0.00 %)    Italian : 20 (13.07 %)
  Japanese : 20 (13.07 %)   Korean : 0 (0.00 %)       Mandarin : 0 (0.00 %)     Polish : 20 (13.07 %)
  Portuguese : 0 (0.00 %)   Russian : 0 (0.00 %)      Spanish : 19 (12.42 %)    Swedish : 0 (0.00 %)
  Turkish : 0 (0.00 %)      Estonian : 0 (0.00 %)     Arabic : 0 (0.00 %)       Basque : 0 (0.00 %)
  Catalan : 20 (13.07 %)
F0 takes only 2 values (0:unvoiced/1:voiced)
input depth (nb features) : 3 x2 (using deltas) = 6
'\nsets_folds = {"train" : [0, 1, 2],\n "test1" : [3],\n "test2" : [4]}\n \n \nsets ={}\n\nsets_folds={"train":sets_folds["train"]}\n\nfor set_name, set_folds in sets_folds.items():\n print("{} : folds {}".format(set_name, set_folds))\n sets[set_name] = NetworkInput(config, folder=\'./Scores\', \n subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],\n stride=stride, verbose=True, \n languages=languages, name=set_name)\n'
inds_lang_test=list(np.flatnonzero(sets['test'].frequencies))
filter_lang_test=np.array(sets['test'].frequencies)>0
languages_test= [languages[i] for i in inds_lang_test]
networkInput=sets["test"]
model=build_model(config, networkInput, return_state=True) #return_state will be useful to retrieve cell states
model.summary()
Model: "model" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(1, 32, 6)] 0 __________________________________________________________________________________________________ lstm_1 (LSTM) [(1, 32, 150), (1, 1 94200 input_1[0][0] __________________________________________________________________________________________________ lstm_2 (LSTM) [(1, 32, 150), (1, 1 180600 lstm_1[0][0] __________________________________________________________________________________________________ dropout (Dropout) (1, 32, 150) 0 lstm_2[0][0] __________________________________________________________________________________________________ time_distributed (TimeDistribut (1, 32, 21) 3171 dropout[0][0] __________________________________________________________________________________________________ input_2 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ input_3 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ softmax (Softmax) (1, 32, 21) 0 time_distributed[0][0] ================================================================================================== Total params: 277,971 Trainable params: 277,971 Non-trainable params: 0 __________________________________________________________________________________________________
if load_weights:
model.load_weights(weights_filename)
#METRICS
#acc_end_seq=AccuracyStateless(networkInput, includeSampleWeights=False)
acc_slices=[AccuracyStateless(networkInput, ind_batch_compute=k) for k in range(networkInput.num_slices_by_example)]
top3_slices=[TopKAccuracyStateless(networkInput, k=3, ind_batch_compute=j) for j in range(networkInput.num_slices_by_example)]
metricsList=[#accuracy_on_last_step, top_k_accuracy_on_last_step_partial(k=3)
KL_div_on_last_step, cross_entropy_on_last_step]
metricsList+=acc_slices
metricsList+=top3_slices
KLLoss=tf.keras.losses.KLDivergence()
model.compile(loss=KLLoss, metrics=metricsList)
if evaluate_model:#EVALUATION
true_nb_batches=networkInput.nbr_batchs*networkInput.num_slices_by_example
max_nb_batches=max_files_evaluation/config.batch_size*networkInput.num_slices_by_example
    nb_steps=int(np.minimum(max_nb_batches, true_nb_batches)) #Keras expects an integer number of steps
forgetStates=Forget_states_callback(networkInput, model, verbose=False)
callbacksList=[forgetStates]
metrics_end=model.evaluate(networkInput.sliced_batch, verbose=1, steps=nb_steps,callbacks=callbacksList)
Creating 'look-up tables' for filenames
459/459 [==============================] - 8s 17ms/step - loss: 3.0106 - KL_div_on_last_step: 2.8731 - cross_entropy_on_last_step: 2.8731 - accuracy_slice_0: 0.0327 - accuracy_slice_1: 0.1176 - accuracy_at_end_of_sequences: 0.1765 - top_3_accuracy_slice_0: 0.1895 - top_3_accuracy_slice_1: 0.3595 - top_3_accuracy_at_end_of_sequences: 0.4248
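Note that KL_div_on_last_step and cross_entropy_on_last_step coincide above, as expected when the targets are one-hot (the KL divergence then reduces to the cross entropy). The list returned by model.evaluate is easier to read when paired with the metric names; a minimal sketch, assuming the evaluation cell above has just run so that model and metrics_end are in scope:
if evaluate_model:
    #model.metrics_names is ordered like the values returned by model.evaluate
    named_metrics = dict(zip(model.metrics_names, metrics_end))
    for metric_name, value in named_metrics.items():
        print(f'{metric_name}: {value:.4f}')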
def print_top5(st, y, y_): #despite its name, prints the 7 most probable languages
ind0=np.argmax(y)
ind=np.argsort(-y_)
print(f"{st}\nlanguage: {languages[ind0]}")
st=" "
for k in range(7):
st+=f"{k+1}: {languages[ind[k]]}, "
st+='\n'
#for k, lang in enumerate(languages):
# print(lang)
# print(y_[k].numpy())
print(st)
def gen_yy_():
model.reset_states()
batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example)
#option 1, use y_=model(x, training=False)
#option 2 (first axis has size batch_size x steps)
#predictions=model.predict(batch, steps = networkInput.num_slices_by_example)
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) #, training=False
y=y.numpy()[:,-1]
y_=y_[:,-1]
res=[]
for k in range(networkInput.config.batch_size):
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], y_[k]))
return res
for i in range(min(5, 10//batch_size*5+1)):
batch_yy_ = gen_yy_()
for k in range( min(5, batch_size)):
print_top5(*batch_yy_[k])
_undefined_French_frl1164_fixed_normalise
language: French
  1: Korean, 2: Mandarin, 3: Turkish, 4: Dutch, 5: English, 6: Russian, 7: German,

_undefined_English_ENL1194_normalise
language: English
  1: Russian, 2: Italian, 3: Basque, 4: Turkish, 5: Arabic, 6: Dutch, 7: French,

_undefined_Spanish_esp2191_normalise
language: Spanish
  1: Spanish, 2: Catalan, 3: Russian, 4: Estonian, 5: Basque, 6: Portuguese, 7: Italian,

_undefined_Italian_itl3191_fixed_normalise
language: Italian
  1: Catalan, 2: Spanish, 3: Russian, 4: Basque, 5: Italian, 6: Estonian, 7: Turkish,

_undefined_Dutch_dul2192_normalise
language: Dutch
  1: Dutch, 2: German, 3: English, 4: Turkish, 5: Russian, 6: Polish, 7: French,
test audio
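A minimal sketch for listening to one of the recordings whose filename is printed above, assuming the original waveforms are available locally; the './wav/' folder and the '.wav' extension used here are placeholders, not paths defined elsewhere in this notebook:
from IPython.display import display
example_wav = './wav/_undefined_French_frl1164_fixed_normalise.wav' #hypothetical path, adjust to the local audio layout
if os.path.exists(example_wav):
    audio_data, sample_rate = sf.read(example_wav) #soundfile returns (samples, sampling rate)
    display(Audio(audio_data, rate=sample_rate)) #inline audio player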
def cell_st(cell_type):
if cell_type==LSTM_CELL:
return "lstm"
elif cell_type==GRU_CELL:
return "gru"
nb_batchs_iter=networkInput.nbr_batchs if save_activations else 1 #number of (unsliced) batches to iterate over
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
dic_list=[{} for i in range(nb_batchs_iter*config.batch_size)]
i_batch=0
#specific activations
selected_cells={'lstm_2': {
'cell_states': [3, 4, 92, 115, 116, 121],
'outputs': [3, 4, 92, 115, 116, 121]
},
'lstm_1':{
'outputs': [],
'cell_states': []
}
}
#mode, save all cells or selected cells
SELECTED_CELLS=1
ALL_CELLS=0
mode=ALL_CELLS
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
for l in range(config.num_layers): #NB: very inefficient because the network activations are computed several times
layerName=f'{cell_st(config.cell_type)}_{l+1}'
if l==0: #also add output scores
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
filename=filenames[i][0].numpy().decode('utf-8')
if useRamus: #HACK
filename="_".join(filename.split("_")[2:])
dic_list[ind_batch]['filename']=filename
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
dic_list[ind_batch]['label']=languages[ind0]
dic_list[ind_batch]['predicted']=languages[ind]
dic_list[ind_batch]['activations']={}
if mode==ALL_CELLS:
dic_list[ind_batch]['scores']={}
for j, lang in enumerate(languages):
dic_list[ind_batch]['scores'][lang]=str(y_[i][j])
modelBis = Model(inputs=model.input, outputs=model.get_layer(layerName).output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
dic_list[ind_batch]['activations'][layerName]={}
if mode==SELECTED_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']={}
for j in selected_cells[layerName]['outputs']:
dic_list[ind_batch]['activations'][layerName]['outputs'][str(j)]=str(act_h.numpy()[i][j])
dic_list[ind_batch]['activations'][layerName]['cell_states']={}
for j in selected_cells[layerName]['cell_states']:
dic_list[ind_batch]['activations'][layerName]['cell_states'][str(j)]=str(act_c.numpy()[i][j])
elif mode==ALL_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']=[str(x) for x in act_h.numpy()[i]]
dic_list[ind_batch]['activations'][layerName]['cell_states']=[str(x) for x in act_c.numpy()[i]]
i_batch+=1
#save activations to files
if save_activations:
for i in range(nb_batchs_iter*batch_size):
dic=dic_list[i]
filename=dic['filename']
mode_text = 'ALL' if mode==ALL_CELLS else 'SELECTED'
jsonFolder=f'./activations/{scores_folder}/{weights_name}_{mode_text}/'
os.makedirs(jsonFolder, exist_ok=True)
jsonFilename=f'{jsonFolder}{filename}.json'
with open(jsonFilename, 'w') as f:
json.dump(dic, f, indent=4)
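As a quick check of what was written, a minimal sketch that reloads the first activation file; it assumes save_activations is on and reuses jsonFolder and dic_list from the cell above:
if save_activations:
    example_json = f"{jsonFolder}{dic_list[0]['filename']}.json"
    with open(example_json) as f:
        saved = json.load(f)
    #each file stores the filename, true/predicted language, the activations per layer (and the per-language scores in ALL_CELLS mode)
    print(saved['filename'], '| label:', saved['label'], '| predicted:', saved['predicted'])
    print('layers saved:', list(saved['activations'].keys()))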
if force_dropout:
dropout_flag=f'dropout_{int(round(100*(1-keep_prob_dense_layer)))}'
else:
dropout_flag='no_dropout'
if embeddings_Hellinger:
Hell_flag='_Hellinger'
else:
Hell_flag=''
embFolder=f'./embeddings/{scores_folder}/{weights_name}/{dropout_flag}{Hell_flag}/'
TRUE_LANGUAGE=0
PREDICTED_LANGUAGE=1
mode_label = PREDICTED_LANGUAGE #both label files are written either way; the mode only governs the per-language cap on the number of examples
max_examples_language=120
max_nb_batchs_iter=np.inf
if save_embeddings:
if not os.path.exists(embFolder):
os.makedirs(embFolder)
proj_config = projector.ProjectorConfig()
#proj_config.model_checkpoint_path = embeddings_ckpt_name
#TODO diff tensors with diff. datasets?
embeddings = proj_config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embeddings.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
if mode_label == TRUE_LANGUAGE:
embeddings.metadata_path = 'labels_true.tsv'
elif mode_label == PREDICTED_LANGUAGE:
embeddings.metadata_path = 'labels_predicted.tsv'
projector.visualize_embeddings(embFolder, proj_config)
# save checkpoint/metadata
if save_embeddings:
st_info=''
nb_batchs_iter=min(networkInput.nbr_batchs, max_nb_batchs_iter)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
languages_true_list=[]
languages_predicted_list=[]
count_lang=dict([(lang, 0) for lang in languages])
scores=[]
mean_act=np.zeros(len(languages)) #for normalization purposes if needed
i_batch=0
while(i_batch<nb_batchs_iter):
above_thr=[count_lang[lang]>=max_examples_language for lang in languages]
if all(above_thr):
break
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
#mean_act=np.sum(y_, axis=0)
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
if mode_label == TRUE_LANGUAGE:
if count_lang[lang0]>=max_examples_language:
continue
count_lang[lang0] += 1
if count_lang[lang0] == max_examples_language:
st_info+=f'{lang0}: max examples reached at seq {ind_batch}\n'
print(f'{lang0}: max examples reached at seq {ind_batch}')
elif mode_label == PREDICTED_LANGUAGE:
if count_lang[lang]>=max_examples_language:
continue
count_lang[lang] += 1
if count_lang[lang] == max_examples_language:
st_info+=f'{lang}: max examples reached at seq {ind_batch}\n'
print(f'{lang}: max examples reached at seq {ind_batch}')
languages_true_list.append(lang0)
languages_predicted_list.append(lang)
if embeddings_Hellinger:
scores.append(np.sqrt(y_[i]))
else:
scores.append(y_[i])
i_batch+=1
#mean_act/=i_batch
#mean_act_copy=np.copy(mean_act)
scores_arr=np.stack(scores)
#checkpoint
checkpoint = tf.train.Checkpoint(embedding=tf.Variable(scores_arr))
checkpoint.save(os.path.join(embFolder, "embeddings.ckpt"))
#metadata
with open(os.path.join(embFolder, 'labels_true.tsv'), "w") as f:
for lang in languages_true_list:
f.write("{}\n".format(lang))
with open(os.path.join(embFolder, 'labels_predicted.tsv'), "w") as f:
for lang in languages_predicted_list:
f.write("{}\n".format(lang))
#save all data to csv
with open(os.path.join(embFolder, 'data.csv'), "w") as f:
csvWriter=csv.writer(f, delimiter='\t')
csvWriter.writerow(['label_true', 'label_predicted']+languages)
for i in range(len(languages_true_list)):
row=[languages_true_list[i], languages_predicted_list[i]]
row+=list(scores[i])
csvWriter.writerow(row)
with open(f'{embFolder}/info.txt', 'w') as f:
f.write(st_info)
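Once written, the projector files can be browsed by pointing TensorBoard at the embeddings directory (tensorboard --logdir followed by embFolder). The sketch below, assuming save_embeddings was run, reloads data.csv and summarizes the scores per true language:
if save_embeddings:
    #reload data.csv: columns are label_true, label_predicted, then one score per language
    with open(os.path.join(embFolder, 'data.csv')) as f:
        rows = list(csv.reader(f, delimiter='\t'))
    data_rows = rows[1:] #skip the header written above
    for lang in sorted(set(r[0] for r in data_rows)):
        lang_scores = np.array([[float(v) for v in r[2:]] for r in data_rows if r[0] == lang])
        print(f'{lang}: {len(lang_scores)} examples, mean of the largest score component: {lang_scores.mean(axis=0).max():.3f}')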
max_batchs=10000//batch_size #limit analysis to a certain number of batches #np.inf if no limitation
Correlation matrix / histogram of activations
if do_similarity_analyses:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
corr_matrix=np.zeros((len(languages), len(languages))) #r coefficient
hist_act=np.zeros((nb_batchs_iter*batch_size, len(languages))) #non normalized activation scores
hist_act_true_lang=[]
hist_act_predicted_lang=[]
if do_pairwise_analysis:
pairwise_corr=np.zeros((len(languages), len(languages)))
pairwise_count=np.zeros((len(languages), len(languages)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
hist_act_true_lang.append(lang0)
hist_act_predicted_lang.append(lang)
corr_matrix+= np.sum(np.expand_dims(y_, 1)*np.expand_dims(y_, 2), axis=0)
#for i in range(batch_size):
# corr_matrix+=np.outer(y_[i], y_[i])
hist_act[i_batch*batch_size:(i_batch+1)*batch_size]=y_
if do_pairwise_analysis:
modelBis = Model(inputs=model.input, outputs=model.get_layer('lstm_2').output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind0=np.argmax(y[i])
for j in range(i+1, batch_size):
ind0bis=np.argmax(y[j])
ii=ind0
jj=ind0bis
corr=np.sum(act_h[i]*act_h[j])
pairwise_corr[ii][jj]+=corr
pairwise_corr[jj][ii]+=corr
pairwise_count[ii][jj]+=1
pairwise_count[jj][ii]+=1
i_batch+=1
corr_matrix/=i_batch
dev=np.sqrt(corr_matrix.diagonal())
corr_matrix/=np.outer(dev, dev) #normalization by deviations
hist_act/=np.sum(hist_act, axis=0)
if do_pairwise_analysis:
pairwise_corr/=(pairwise_count+1e-4)
#proximity measures based on activation histograms
if do_similarity_analyses:
prox_d_kl=np.zeros((len(languages), len(languages)))
prox_d_hell=np.zeros((len(languages), len(languages)))
prox_d_bhat=np.zeros((len(languages), len(languages)))
for i in range(len(languages)):
for j in range(len(languages)):
p_i=hist_act[:, i]+1e-8
p_j=hist_act[:, j]+1e-8
prox_d_kl[i][j]=np.sum(p_i*np.log2(p_i/p_j))
prox_d_kl[j][i]=np.sum(p_j*np.log2(p_j/p_i))
prox_d_hell[i][j]=np.sqrt(np.sum((np.sqrt(p_i)-np.sqrt(p_j))**2))
prox_d_hell[j][i]=prox_d_hell[i][j]
prox_d_bhat[i][j]=-2*np.log2(np.sum(np.sqrt(p_i*p_j)))
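The Hellinger and Bhattacharyya numbers above are linked analytically: for (near-)normalized columns, the squared Hellinger distance equals 2 - 2*BC, where BC = sum(sqrt(p_i*p_j)) is the Bhattacharyya coefficient and prox_d_bhat stores -2*log2(BC). A minimal sanity-check sketch on one pair of languages:
if do_similarity_analyses:
    i, j = 0, 1 #any pair of language indices
    bc = 2.0**(-prox_d_bhat[i][j]/2) #recover the Bhattacharyya coefficient from the distance
    print('squared Hellinger distance :', prox_d_hell[i][j]**2)
    print('2 - 2*BC                   :', 2 - 2*bc)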
def plot_distance_matrix(dm, classes, normalize=False, title='Distance matrix', cmap=pl.cm.Blues, vmin=0, vmax=4, invert_colors=False):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
dm = dm.astype('float') / (0.00001+dm.sum(axis=1)[:, np.newaxis])
dm = np.round(dm*100, decimals=2)
#print(cm)
#pl.imshow(-np.log2(dm), interpolation='nearest', cmap=cmap, vmax=-3,vmin=-5)
#pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmax=-8,vmin=-17)
if not(invert_colors):
pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmin=-vmax, vmax=-vmin)
else:
pl.imshow(dm, interpolation='nearest', cmap=cmap, vmin=vmin, vmax=vmax)
pl.title(title)
#pl.colorbar()
tick_marks = np.arange(len(classes))
pl.xticks(tick_marks, classes, rotation=45)
pl.yticks(tick_marks, classes)
thresh = dm.max()*(1-2*invert_colors) / 2.
for i, j in itertools.product(range(dm.shape[0]), range(dm.shape[1])):
pl.text(j, i, int(dm[i,j]*100)*1./100,
horizontalalignment="center",
color="white" if dm[i, j]*(1-2*invert_colors) < thresh else "black")
#pl.tight_layout()
pl.ylabel('Label1 (reference)')
pl.xlabel('Label2')
def permut_mat(mat, permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]):
conf_matrix=mat
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
return languages_permut, conf_matrix_permut
if do_similarity_analyses:
#pl.figure(figsize=(10, 10))
#plot_distance_matrix(corr_matrix, languages, title='Correlation matrix', vmin=0.05, vmax=0.5)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_hell_permut=permut_mat(prox_d_hell)
plot_distance_matrix(prox_d_hell_permut, languages_permut, title='Dissimilarity matrix (Hellinger distance)', vmin=0.8, vmax=1.3)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_kl_permut=permut_mat(prox_d_kl)
plot_distance_matrix(prox_d_kl_permut, languages_permut, title='Dissimilarity matrix (KL div)', vmin=3, vmax=15)
pl.figure(figsize=(10, 10))
plot_distance_matrix( (prox_d_kl_permut-prox_d_kl_permut.T)/(prox_d_kl_permut+1e-3)*100, languages_permut, title='KL div, diff transpose', vmin=-20, vmax=20, invert_colors=True)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_bhat_permut=permut_mat(prox_d_bhat)
plot_distance_matrix(prox_d_bhat_permut, languages_permut, title='Dissimilarity matrix (Bhattacharyya distance)', vmin=0, vmax=8)
Multidimensional scaling (metric MDS; a nonmetric variant is left commented out below)
See http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
import itertools
from sklearn import cluster
from sklearn import manifold
from sklearn.metrics import pairwise_distances
mds = manifold.MDS(n_components=2, metric=True,
verbose=0, dissimilarity='precomputed', n_init=10)
#NON METRIC
#mds = manifold.MDS(n_components=2, metric=False,
# n_init=30, max_iter=300,
# verbose=0, eps=0.001, dissimilarity='precomputed')
dist_measure_str = "Hellinger distance"
dm=prox_d_hell
dist_measure_str = "(symm.) KL divergence"
dm=prox_d_kl
languages2=languages
'''
print('not shown: Hungarian, Finnish, (and Polish?)')
#HACK delete hungarian
dm=np.delete(dm, 6, axis=0)
dm=np.delete(dm, 6, axis=1)
languages2=languages[0:6]+languages[7::]
#HACK delete Finnish
dm=np.delete(dm, 3, axis=0)
dm=np.delete(dm, 3, axis=1)
languages2=languages2[0:3]+languages2[4::]
#HACK delete Polish
dm=np.delete(dm, 9, axis=0)
dm=np.delete(dm, 9, axis=1)
languages2=languages2[0:9]+languages2[10::]
'''
coord_pts = mds.fit_transform((dm.T+dm)/2) #symmetrize if necessary
delta = 0.01
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(languages2):
ax.annotate(txt, coord_pts[i]+(delta, delta))
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
stress=np.sqrt(mds.stress_/(np.sum(dm**2)/2))
dm2=pairwise_distances(coord_pts)
stress2=np.sqrt(np.sum((dm2-dm)**2)/np.sum(dm**2))
print(f'stress (normalized mds.stress_) : {stress}')
print(f'stress (recomputed from the 2-D map) : {stress2}')
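#A common rule of thumb for Kruskal-type stress reads values below ~0.1 as a fair 2-D representation
#and below ~0.05 as good, so the values printed above suggest the planar map is only a rough approximation.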
for i in range(len(languages2)):
act_diss=dm[i]
diff=(dm2[i]-dm[i])
lang=languages2[i]
print(f'{lang.rjust(10)}\t actual dissimilarity: {np.sum(act_diss):.3f} \t difference: {np.sum(np.abs(diff)):.3f} \t percent diff.: {np.sum(np.abs(diff))/np.sum(act_diss)*100:.2f} %')
'''
coord_pts = mds.fit_transform((dm_selected_modified.T+dm_selected_modified)/2)
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(selected_languages):
ax.annotate(txt, coord_pts[i]+(delta, delta))
dist_measure_str = "KL divergence" if dist_measure == D_KL else "Hellinger distance"
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
'''
stress (normalized mds.stress_) : 0.164924403026518
stress (recomputed from the 2-D map) : 0.19061521671913212
    Danish   actual dissimilarity: 55.578   difference: 13.557   percent diff.: 24.39 %
     Dutch   actual dissimilarity: 75.111   difference: 11.570   percent diff.: 15.40 %
   English   actual dissimilarity: 53.429   difference: 8.920    percent diff.: 16.70 %
   Finnish   actual dissimilarity: 85.116   difference: 12.165   percent diff.: 14.29 %
    French   actual dissimilarity: 44.158   difference: 6.488    percent diff.: 14.69 %
    German   actual dissimilarity: 75.540   difference: 11.286   percent diff.: 14.94 %
 Hungarian   actual dissimilarity: 58.934   difference: 9.734    percent diff.: 16.52 %
   Italian   actual dissimilarity: 45.416   difference: 5.161    percent diff.: 11.36 %
  Japanese   actual dissimilarity: 51.611   difference: 8.269    percent diff.: 16.02 %
    Korean   actual dissimilarity: 76.616   difference: 13.818   percent diff.: 18.03 %
  Mandarin   actual dissimilarity: 82.105   difference: 11.829   percent diff.: 14.41 %
    Polish   actual dissimilarity: 74.312   difference: 8.101    percent diff.: 10.90 %
Portuguese   actual dissimilarity: 40.621   difference: 6.419    percent diff.: 15.80 %
   Russian   actual dissimilarity: 51.513   difference: 5.084    percent diff.: 9.87 %
   Spanish   actual dissimilarity: 56.640   difference: 8.367    percent diff.: 14.77 %
   Swedish   actual dissimilarity: 46.584   difference: 9.886    percent diff.: 21.22 %
   Turkish   actual dissimilarity: 46.590   difference: 6.543    percent diff.: 14.04 %
  Estonian   actual dissimilarity: 62.225   difference: 12.563   percent diff.: 20.19 %
    Arabic   actual dissimilarity: 44.752   difference: 7.406    percent diff.: 16.55 %
    Basque   actual dissimilarity: 46.494   difference: 5.228    percent diff.: 11.24 %
   Catalan   actual dissimilarity: 64.099   difference: 10.686   percent diff.: 16.67 %
Note the 'hole' in the MDS map: Japanese, Mandarin, Korean, Polish
pl.imshow(-dm2)
pl.colorbar()
pl.figure()
pl.imshow(-dm)
pl.colorbar()
Some clustering
delta=10.
n_clusters = 6
spec_clustering = cluster.SpectralClustering(n_clusters=n_clusters,
affinity="precomputed")
if do_similarity_analyses:
dm=(prox_d_kl+prox_d_kl.T)/2
similarity_matrix=np.exp(- dm**2 / (2. * delta ** 2))
clusters = spec_clustering.fit_predict(similarity_matrix)
print("All languages : ")
for i in range(n_clusters):
cluster_labels = [languages[j] for j in np.where(clusters == i)[0]]
print("> cluster {} : {} \n".format(i, cluster_labels))
All languages :
> cluster 0 : ['French', 'Hungarian', 'Italian', 'Portuguese', 'Russian', 'Spanish', 'Swedish', 'Turkish', 'Catalan']
> cluster 1 : ['Danish', 'Finnish', 'Estonian']
> cluster 2 : ['Korean']
> cluster 3 : ['Polish']
> cluster 4 : ['Japanese', 'Mandarin', 'Arabic', 'Basque']
> cluster 5 : ['Dutch', 'English', 'German']
plt=pl #alias so the dendrogram cells below can use the conventional plt name
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_,
counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
if do_similarity_analyses:
# Ward based on hist_act
# setting distance_threshold=0 ensures we compute the full tree.
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None) #affinity='precomputed'
#XXX hist_act or sqrt hist_act ??
model_clustering = model_clustering.fit(np.sqrt(hist_act.T))
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Ward + Hellinger distance")
plt.show()
#DM #try different linkage methods
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=(prox_d_kl+prox_d_kl.T)/2
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on symmetrized D_KL ")
plt.show()
# Bhattacharyya distance
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_bhat
#dm+=16*prox_d_hell
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Bhattacharyya distance ")
plt.show()
max_batchs=np.inf #no limit on the number of batches
if compute_confusion_matrix:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
#lang_true=[]
#lang_predicted=[]
conf_matrix=np.zeros((len(languages), len(languages)))
conf_matrix_filtered=np.zeros((len(languages_test), len(languages_test)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout_conf_matrix) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for j in range(batch_size):
ind0=np.argmax(y[j])
ind=np.argmax(y_[j])
lang0=languages[ind0]
lang=languages[ind]
#lang_true.append(lang0)
#lang_predicted.append(lang)
conf_matrix[ind0][ind]+=1
ind0bis=np.argmax(y[j][inds_lang_test])
indbis=np.argmax(y_[j][inds_lang_test])
conf_matrix_filtered[ind0bis][indbis]+=1
i_batch+=1
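Before plotting, the confusion matrix already yields the end-of-sequence accuracy; a minimal sketch restricted, via conf_matrix_filtered, to the languages actually present in this test set:
if compute_confusion_matrix:
    total = conf_matrix_filtered.sum()
    if total > 0:
        #overall accuracy = correctly classified sequences / total sequences
        print(f'accuracy (languages present in the test set): {np.trace(conf_matrix_filtered)/total:.3f}')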
if compute_confusion_matrix:
#plot_distance_matrix(conf_matrix, languages, vmin=0, vmax=20, normalize=True, invert_colors=True)
permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
if useRamus:
plot_distance_matrix(conf_matrix_filtered, languages_test, vmin=0, vmax=5, invert_colors=True,title='Confusion matrix')
permut=[0,1,2,5,3,6,7, 4]
conf_matrix_filtered_permut=np.zeros_like(conf_matrix_filtered)
for i, ind in enumerate(permut):
conf_matrix_filtered_permut[i]=conf_matrix_filtered[ind][permut]
languages_test_permut=[languages_test[ind] for ind in permut]
pl.figure(figsize=(6,6))
pl.ylim([7.5, -0.5])
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , vmin=0, vmax=5, invert_colors=True, title='Confusion matrix')
pl.figure(figsize=(6,6))
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , normalize=True, vmin=0, vmax=50, invert_colors=True, title='Confusion matrix')
pl.ylim([7.5, -0.5])
else:
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut , normalize=True, vmin=0, vmax=10, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])