import numpy as np
import matplotlib.pyplot as pl
import tensorflow as tf
import tensorflow.keras.optimizers as optimizers
from model import *
from input import NetworkInput
from data import createFeaturesDescription
from config import *
from tensorflow.keras.models import Model
from datetime import datetime
from IPython.display import Audio
import soundfile as sf
import os
import json
import csv
import itertools
from tensorboard.plugins import projector
from sklearn import cluster
from scipy.cluster.hierarchy import dendrogram
run_with_papermill=False
batch_size=16 #16 #512
stride=2
use_F0=False
F0_binary_values=False
use_deltas=False
hidden_size=128
num_layers=2
weights_filename='./logdir/2020-11-20_18-39-24-2_150_multiple_dropout/weights/weights_2020-11-20_18-39-24.h5'
weights_name='2_150_multiple_dropout'
useExScores=False
useRamus=False
use1dScores=False
useBalancedDataSet=False
balanced_dataset_folder='balanced_20_1'
evaluate_model=False
save_activations=False #save activations to json files #mode ALL_CELLS or SELECTED_CELLS defined later
save_embeddings=True #save embeddings as checkpoint file (for tensorboard)
do_similarity_analyses=True
compute_confusion_matrix=False
embeddings_Hellinger=True #embeddings based on sqrt of output
force_dropout=True #for embeddings, similarity analyses
force_dropout_conf_matrix=False #force dropout for confusion matrix
#For the examples of outputs, set manually
keep_prob=1
keep_prob_recurrent=1
keep_prob_dense_layer=1
#note: other useful parameters appear later in the code, e.g. max_examples_language, max_batchs for the analyses
do_pairwise_analysis=False #pairwise analysis within each batch
# Parameters (cell with run-specific overrides, injected when the notebook is run with papermill)
run_with_papermill = True
hidden_size = 150
use_deltas = True
use_F0 = True
F0_binary_values = True
useBalancedDataSet = False #on train set defined later
keep_prob = 0.9
keep_prob_recurrent = 0.9
keep_prob_dense_layer = 0.8
weights_filename = "./logdir/2021-08-19_14-53-14-2_150_mult_dropout_voiced_unvoiced_bis/weights/weights_2021-08-19_14-53-14.h5"
weights_name = "weights_2_150_voiced_unvoiced_bis"
compute_confusion_matrix = True
do_similarity_analyses=False
save_embeddings=False
evaluate_model=True
if not(run_with_papermill): #manual settings
force_dropout=True
keep_prob=0.7
keep_prob_dense_layer=0.9
keep_prob_recurrent=0.9
useBalancedDataSet=True
#useRamus=True
use_deltas=True
hidden_size=150
evaluate_model=False
save_embeddings=False
save_activations=True
do_similarity_analyses=False
compute_confusion_matrix=False
weights_config={
'num_steps':32,
'features_description':createFeaturesDescription(F0=use_F0),
'stride':stride
}
load_weights=True
#old models (old inputs)
#1D models
#weights_filename="./models/weights0420/weights20epochs.h5"
#weights_filename, weights_name="./models/weights0420b/weights_2020-04-07_17-55-05.h5", "weights_2020-04-07"
#weights_config['features_description']=createFeaturesDescription(HRmsValue=False, F0=False)
#2D models
#weights_filename, weights_name='./models/weights0715-2d-60Hz/weights_2020-07-15_12-15-21.h5', 'weights0715-2d-60Hz'
#weights_filename, weights_name='./models/weights0717-2d-60Hz-64/weights_2020-07-17_17-35-40.h5', 'weights0717-2d-60Hz-64'
#weights_config['features_description']=createFeaturesDescription(F0=False)
#weights_config['stride']=1
#weights_filename, weights_name="./models/weights0710-3d/weights_2020-07-03_17-40-31.h5", "weights0710-3d"
num_steps=weights_config["num_steps"]
features_description=weights_config['features_description']
stride=weights_config['stride']
if not(force_dropout or force_dropout_conf_matrix): #no dropout forced anywhere: disable dropout at inference
keep_prob_dense_layer=1
keep_prob=1
config=Config(batch_size, num_steps, hidden_size=hidden_size,
num_layers=num_layers,
keep_prob=keep_prob, keep_prob_recurrent=keep_prob_recurrent,
keep_prob_dense_layer=keep_prob_dense_layer)
config=completedConfig(config) #take default params for unspecified params
WARNING:root:No cell type specified in config: using LSTM
languages = ["Danish", "Dutch", "English", "Finnish",
"French", "German", "Hungarian", "Italian",
"Japanese", "Korean", "Mandarin", "Polish",
"Portuguese", "Russian", "Spanish",
"Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
"Basque", "Catalan"] #NB: check that the order of elements is consistent with model
#Remove languages with not enough data
languages.remove("Czech")
languages.remove("Romanian")
#languages_dataset=languages+['Romanian'] #None -> autodetect, languages-> same as model (defined above)
languages_dataset=None
#scores Folder #default: "./Scores"
#assert useExScores^useRamus^use1dScores^useBalancedDataSet , "choose a unique dataset"
if useExScores:
scores_folder='./ex_Scores'
elif useRamus:
scores_folder='./Scores_Ramus'
elif use1dScores:
scores_folder='./Scores_1d'
else:
scores_folder='./Scores'
max_files_evaluation= 2024 #np.inf
# FIRST VERSION
#languages = ['Danish', 'Russian', 'Mandarin', 'Finnish', 'Dutch', 'English', 'Hungarian', 'Swedish',
# 'Italian', 'French', 'Japanese', 'German', 'Portuguese', 'Polish', 'Spanish', 'Korean']
sets ={}
set_folds=[0]
if useExScores or useRamus:
sets_folds={"test":[0]}
elif useBalancedDataSet:
sets_folds={"test":[0]} #subfolder defined later
else:
sets_folds = {"train" : [0, 1, 2],
"test":[3,4],
"test1" : [3],
"test2" : [4]}
initial_sample_length=3*2**14 if useRamus else 10*2**14
TFRecords_batch_size=1 if useRamus else 16
set_name='train'
if set_name in sets_folds:
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder,
subfolder=subfolders,
stride=stride, verbose=True, for_evaluation=True,
languages=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #TRAINING SET BUT FOR EVALUATION
set_name='test'
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder, for_evaluation=True,
subfolder=subfolders,
stride=stride, verbose=True,
languages=languages, languages_model=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #autodetect languages
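Before running the analyses, it can help to peek at one sliced batch. The sketch below is a quick sanity check (not part of the original pipeline); it assumes, as the loops further down do, that `sliced_batch` yields `(x, y, w)` tuples whose third input component holds the filenames.
#Sanity-check sketch: inspect one sliced batch of the test set
for x, y, w in sets['test'].sliced_batch.take(1):
    print('number of model inputs :', len(x))
    print('targets shape          :', y.shape)
    print('first filename         :', x[2][0][0].numpy().decode('utf-8'))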
'''
sets_folds = {"train" : [0, 1, 2],
"test1" : [3],
"test2" : [4]}
sets ={}
sets_folds={"train":sets_folds["train"]}
for set_name, set_folds in sets_folds.items():
print("{} : folds {}".format(set_name, set_folds))
sets[set_name] = NetworkInput(config, folder='./Scores',
subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],
stride=stride, verbose=True,
languages=languages, name=set_name)
'''
WARNING ; some languages in the dataset are not considered by the model
DATASET train
for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
    sampling frequency of inputs : 31.25 Hz
    sample length : 320 (initial sample length : 163840, step : 256, stride : 2)
    sample duration : 10.24 s
    batch size : 16
    num slices by example: 10 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21)
    0: Danish  1: Dutch  2: English  3: Finnish  4: French  5: German  6: Hungarian  7: Italian  8: Japanese  9: Korean  10: Mandarin  11: Polish  12: Portuguese  13: Russian  14: Spanish  15: Swedish  16: Turkish  17: Estonian  18: Arabic  19: Basque  20: Catalan
(Sub)folders: ['fold_0/', 'fold_1/', 'fold_2/']
Total number of examples - train - : 410880 (25680 batchs)
Per language :
    Danish : 5216 (1.27 %)
    Dutch : 8400 (2.04 %)
    English : 61936 (15.07 %)
    Finnish : 7248 (1.76 %)
    French : 52096 (12.68 %)
    German : 69824 (16.99 %)
    Hungarian : 2352 (0.57 %)
    Italian : 28064 (6.83 %)
    Japanese : 2720 (0.66 %)
    Korean : 10192 (2.48 %)
    Mandarin : 11952 (2.91 %)
    Polish : 5456 (1.33 %)
    Portuguese : 11888 (2.89 %)
    Russian : 12048 (2.93 %)
    Spanish : 40864 (9.95 %)
    Swedish : 8976 (2.18 %)
    Turkish : 3920 (0.95 %)
    Estonian : 2752 (0.67 %)
    Arabic : 4160 (1.01 %)
    Basque : 14864 (3.62 %)
    Catalan : 45952 (11.18 %)
F0 takes only 2 values (0:unvoiced/1:voiced)
input depth (nb features) : 3 x2 (using deltas) = 6

WARNING ; some languages in the dataset are not considered by the model
DATASET test
for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
    sampling frequency of inputs : 31.25 Hz
    sample length : 320 (initial sample length : 163840, step : 256, stride : 2)
    sample duration : 10.24 s
    batch size : 16
    num slices by example: 10 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21)
    0: Danish  1: Dutch  2: English  3: Finnish  4: French  5: German  6: Hungarian  7: Italian  8: Japanese  9: Korean  10: Mandarin  11: Polish  12: Portuguese  13: Russian  14: Spanish  15: Swedish  16: Turkish  17: Estonian  18: Arabic  19: Basque  20: Catalan
(Sub)folders: ['fold_3/', 'fold_4/']
Total number of examples - test - : 34336 (2146 batchs)
Per language :
    Danish : 832 (2.42 %)
    Dutch : 2240 (6.52 %)
    English : 2576 (7.50 %)
    Finnish : 1072 (3.12 %)
    French : 2720 (7.92 %)
    German : 2544 (7.41 %)
    Hungarian : 672 (1.96 %)
    Italian : 2352 (6.85 %)
    Japanese : 1136 (3.31 %)
    Korean : 704 (2.05 %)
    Mandarin : 2048 (5.96 %)
    Polish : 1376 (4.01 %)
    Portuguese : 2224 (6.48 %)
    Russian : 2832 (8.25 %)
    Spanish : 2720 (7.92 %)
    Swedish : 1344 (3.91 %)
    Turkish : 1456 (4.24 %)
    Estonian : 768 (2.24 %)
    Arabic : 1360 (3.96 %)
    Basque : 736 (2.14 %)
    Catalan : 624 (1.82 %)
F0 takes only 2 values (0:unvoiced/1:voiced)
input depth (nb features) : 3 x2 (using deltas) = 6
'\nsets_folds = {"train" : [0, 1, 2],\n "test1" : [3],\n "test2" : [4]}\n \n \nsets ={}\n\nsets_folds={"train":sets_folds["train"]}\n\nfor set_name, set_folds in sets_folds.items():\n print("{} : folds {}".format(set_name, set_folds))\n sets[set_name] = NetworkInput(config, folder=\'./Scores\', \n subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],\n stride=stride, verbose=True, \n languages=languages, name=set_name)\n'
inds_lang_test=list(np.flatnonzero(sets['test'].frequencies))
filter_lang_test=np.array(sets['test'].frequencies)>0
languages_test= [languages[i] for i in inds_lang_test]
networkInput=sets["train"]
model=build_model(config, networkInput, return_state=True) #return_state will be useful to retrieve cell states
model.summary()
Model: "model" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(16, 32, 6)] 0 __________________________________________________________________________________________________ lstm_1 (LSTM) [(16, 32, 150), (16, 94200 input_1[0][0] __________________________________________________________________________________________________ lstm_2 (LSTM) [(16, 32, 150), (16, 180600 lstm_1[0][0] __________________________________________________________________________________________________ dropout (Dropout) (16, 32, 150) 0 lstm_2[0][0] __________________________________________________________________________________________________ time_distributed (TimeDistribut (16, 32, 21) 3171 dropout[0][0] __________________________________________________________________________________________________ input_2 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ input_3 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ softmax (Softmax) (16, 32, 21) 0 time_distributed[0][0] ================================================================================================== Total params: 277,971 Trainable params: 277,971 Non-trainable params: 0 __________________________________________________________________________________________________
if load_weights:
model.load_weights(weights_filename)
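A small consistency check, not in the original notebook: the softmax dimension should match the language list defined above, otherwise the label indices used below are misaligned.
#Sketch: the model's output size must match the language list
assert model.output_shape[-1] == len(languages), \
    f"model outputs {model.output_shape[-1]} classes but {len(languages)} languages are listed"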
#METRICS
#acc_end_seq=AccuracyStateless(networkInput, includeSampleWeights=False)
acc_slices=[AccuracyStateless(networkInput, ind_batch_compute=k) for k in range(networkInput.num_slices_by_example)]
top3_slices=[TopKAccuracyStateless(networkInput, k=3, ind_batch_compute=j) for j in range(networkInput.num_slices_by_example)]
metricsList=[#accuracy_on_last_step, top_k_accuracy_on_last_step_partial(k=3)
KL_div_on_last_step, cross_entropy_on_last_step]
metricsList+=acc_slices
metricsList+=top3_slices
KLLoss=tf.keras.losses.KLDivergence()
model.compile(loss=KLLoss, metrics=metricsList)
if evaluate_model: #EVALUATION
    true_nb_batches=networkInput.nbr_batchs*networkInput.num_slices_by_example
    max_nb_batches=max_files_evaluation/config.batch_size*networkInput.num_slices_by_example
    nb_steps=int(np.minimum(max_nb_batches, true_nb_batches)) #model.evaluate expects an integer step count
forgetStates=Forget_states_callback(networkInput, model, verbose=False)
callbacksList=[forgetStates]
metrics_end=model.evaluate(networkInput.sliced_batch, verbose=1, steps=nb_steps,callbacks=callbacksList)
Creating 'look-up tables' for filenames
1265/1265 [==============================] - 62s 49ms/step
 - loss: 1.9783 - KL_div_on_last_step: 1.9051 - cross_entropy_on_last_step: 1.9052
 - accuracy_slice_0: 0.1344 - accuracy_slice_1: 0.2173 - accuracy_slice_2: 0.3036 - accuracy_slice_3: 0.3482 - accuracy_slice_4: 0.4008
 - accuracy_slice_5: 0.4330 - accuracy_slice_6: 0.4618 - accuracy_slice_7: 0.4777 - accuracy_slice_8: 0.4926 - accuracy_at_end_of_sequences: 0.5050
 - top_3_accuracy_slice_0: 0.3467 - top_3_accuracy_slice_1: 0.4950 - top_3_accuracy_slice_2: 0.5868 - top_3_accuracy_slice_3: 0.6448 - top_3_accuracy_slice_4: 0.6920
 - top_3_accuracy_slice_5: 0.7178 - top_3_accuracy_slice_6: 0.7426 - top_3_accuracy_slice_7: 0.7614 - top_3_accuracy_slice_8: 0.7812 - top_3_accuracy_at_end_of_sequences: 0.8021
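The per-slice metrics above show how the prediction improves as more of the 10.24 s excerpt is heard. The plotting sketch below assumes `metrics_end` follows the order used in `metricsList` (loss, then KL divergence and cross-entropy on the last step, then the per-slice accuracies, then the per-slice top-3 accuracies).
if evaluate_model:
    #Sketch: accuracy as a function of the number of slices already processed
    n_slices = networkInput.num_slices_by_example
    acc_by_slice = metrics_end[3:3+n_slices]              #after loss, KL div and cross-entropy
    top3_by_slice = metrics_end[3+n_slices:3+2*n_slices]
    pl.figure()
    pl.plot(range(1, n_slices+1), acc_by_slice, 'o-', label='top-1 accuracy')
    pl.plot(range(1, n_slices+1), top3_by_slice, 's-', label='top-3 accuracy')
    pl.xlabel(f'number of slices seen ({config.num_steps} steps each)')
    pl.ylabel('accuracy')
    pl.legend()
    pl.show()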
def print_top5(st, y, y_):
ind0=np.argmax(y)
ind=np.argsort(-y_)
print(f"{st}\nlanguage: {languages[ind0]}")
st=" "
for k in range(7):
st+=f"{k+1}: {languages[ind[k]]}, "
st+='\n'
#for k, lang in enumerate(languages):
# print(lang)
# print(y_[k].numpy())
print(st)
def gen_yy_():
model.reset_states()
batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example)
#option 1, use y_=model(x, training=False)
#option 2 (first axis has size batch_size x steps)
#predictions=model.predict(batch, steps = networkInput.num_slices_by_example)
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) #, training=False
y=y.numpy()[:,-1]
y_=y_[:,-1]
res=[]
for k in range(networkInput.config.batch_size):
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], y_[k]))
return res
for i in range(min(5, 10//batch_size*5+1)): #enough batches to show about 10 examples, capped at 5 batches
batch_yy_ = gen_yy_()
for k in range( min(10, batch_size)):
print_top5(*batch_yy_[k])
fold_0_tatoeba_spk_0_tatoeba_deu_spk0_set0_424
language: German
 1: German, 2: English, 3: Dutch, 4: Russian, 5: Polish, 6: Portuguese, 7: Swedish,
fold_0_librivox_reader3704_reader3704_2973_sec1_49
language: Finnish
 1: Finnish, 2: Italian, 3: Spanish, 4: Swedish, 5: Japanese, 6: Russian, 7: Basque,
fold_0_CommonVoice_a9ac06a1b7fc1c0_a9ac06a1b7fc1c0_slice13
language: English
 1: English, 2: German, 3: Dutch, 4: Turkish, 5: Swedish, 6: French, 7: Portuguese,
fold_0_CommonVoice_83e27a6122071cd_83e27a6122071cd_slice12
language: German
 1: Turkish, 2: German, 3: Russian, 4: French, 5: Dutch, 6: Italian, 7: Japanese,
fold_0_CommonVoice_e34706a57faeb1a_e34706a57faeb1a_slice12
language: French
 1: French, 2: Swedish, 3: English, 4: Japanese, 5: Arabic, 6: Portuguese, 7: Mandarin,
fold_0_WLI_file_29_fin-a054f06becdc93089b3983ba41c786db_60
language: Finnish
 1: Finnish, 2: Spanish, 3: Italian, 4: Swedish, 5: Russian, 6: Catalan, 7: Danish,
fold_0_CommonVoice_3dfd6574c8de634_3dfd6574c8de634_slice20
language: English
 1: English, 2: Dutch, 3: German, 4: Catalan, 5: Turkish, 6: French, 7: Italian,
fold_2_CommonVoice_0ae84240d635ebb_0ae84240d635ebb_slice4
language: English
 1: Basque, 2: Turkish, 3: Italian, 4: Spanish, 5: Catalan, 6: Arabic, 7: Russian,
fold_0_WLI_file_41_kor-62ce4e9981f3fe39897c58771f83881b_81
language: Korean
 1: Korean, 2: Danish, 3: Mandarin, 4: Turkish, 5: Swedish, 6: Finnish, 7: Italian,
fold_0_CommonVoice_626e9eb4bf3a8ab_626e9eb4bf3a8ab_slice16
language: Catalan
 1: Spanish, 2: Catalan, 3: Turkish, 4: Basque, 5: Italian, 6: English, 7: Arabic,
Test audio
def cell_st(cell_type):
if cell_type==LSTM_CELL:
return "lstm"
elif cell_type==GRU_CELL:
return "gru"
nb_batchs_iter=networkInput.nbr_batchs if save_activations else 1 #number of batches (before slicing) to iterate over
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
dic_list=[{} for i in range(nb_batchs_iter*config.batch_size)]
i_batch=0
#specific activations
selected_cells={'lstm_2': {
'cell_states': [3, 4, 92, 115, 116, 121],
'outputs': [3, 4, 92, 115, 116, 121]
},
'lstm_1':{
'outputs': [],
'cell_states': []
}
}
#mode, save all cells or selected cells
SELECTED_CELLS=1
ALL_CELLS=0
mode=ALL_CELLS
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
for l in range(config.num_layers): #NB: very inefficient because the network activations are computed several times
layerName=f'{cell_st(config.cell_type)}_{l+1}'
if l==0: #also add output scores
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
filename=filenames[i][0].numpy().decode('utf-8')
if useRamus: #HACK
filename="_".join(filename.split("_")[2:])
dic_list[ind_batch]['filename']=filename
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
dic_list[ind_batch]['label']=languages[ind0]
dic_list[ind_batch]['predicted']=languages[ind]
dic_list[ind_batch]['activations']={}
if mode==ALL_CELLS:
dic_list[ind_batch]['scores']={}
for j, lang in enumerate(languages):
dic_list[ind_batch]['scores'][lang]=str(y_[i][j])
modelBis = Model(inputs=model.input, outputs=model.get_layer(layerName).output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
dic_list[ind_batch]['activations'][layerName]={}
if mode==SELECTED_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']={}
for j in selected_cells[layerName]['outputs']:
dic_list[ind_batch]['activations'][layerName]['outputs'][str(j)]=str(act_h.numpy()[i][j])
dic_list[ind_batch]['activations'][layerName]['cell_states']={}
for j in selected_cells[layerName]['cell_states']:
dic_list[ind_batch]['activations'][layerName]['cell_states'][str(j)]=str(act_c.numpy()[i][j])
elif mode==ALL_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']=[str(x) for x in act_h.numpy()[i]]
dic_list[ind_batch]['activations'][layerName]['cell_states']=[str(x) for x in act_c.numpy()[i]]
i_batch+=1
#save activations to files
if save_activations:
for i in range(nb_batchs_iter*batch_size):
dic=dic_list[i]
filename=dic['filename']
mode_text = 'ALL' if mode==ALL_CELLS else 'SELECTED'
jsonFolder=f'./activations/{scores_folder}/{weights_name}_{mode_text}/'
os.makedirs(jsonFolder, exist_ok=True)
jsonFilename=f'{jsonFolder}{filename}.json'
with open(jsonFilename, 'w') as f:
json.dump(dic, f, indent=4)
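To check what was written, one of the JSON files can be reloaded; the sketch below simply mirrors the structure built above (filename, label, predicted language and per-layer activations).
if save_activations:
    #Sketch: reload the last file written above and summarize its content
    with open(jsonFilename) as f:
        dic_check = json.load(f)
    print(dic_check['filename'], '| label:', dic_check['label'], '| predicted:', dic_check['predicted'])
    for layer_name, act in dic_check['activations'].items():
        print(f"{layer_name}: {len(act['outputs'])} outputs, {len(act['cell_states'])} cell states")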
if force_dropout:
dropout_flag=f'dropout_{int(round(100*(1-keep_prob_dense_layer)))}'
else:
dropout_flag='no_dropout'
if embeddings_Hellinger:
Hell_flag='_Hellinger'
else:
Hell_flag=''
embFolder=f'./embeddings/{scores_folder}/{weights_name}/{dropout_flag}{Hell_flag}/'
TRUE_LANGUAGE=0
PREDICTED_LANGUAGE=1
mode_label = PREDICTED_LANGUAGE #both label files are written either way; the choice only affects the max-examples-per-language strategy
max_examples_language=120
max_nb_batchs_iter=np.inf
if save_embeddings:
    os.makedirs(embFolder, exist_ok=True)
proj_config = projector.ProjectorConfig()
#proj_config.model_checkpoint_path = embeddings_ckpt_name
#TODO diff tensors with diff. datasets?
embeddings = proj_config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embeddings.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
if mode_label == TRUE_LANGUAGE:
embeddings.metadata_path = 'labels_true.tsv'
elif mode_label == PREDICTED_LANGUAGE:
embeddings.metadata_path = 'labels_predicted.tsv'
projector.visualize_embeddings(embFolder, proj_config)
# save checkpoint/metadata
if save_embeddings:
st_info=''
nb_batchs_iter=min(networkInput.nbr_batchs, max_nb_batchs_iter)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
languages_true_list=[]
languages_predicted_list=[]
count_lang=dict([(lang, 0) for lang in languages])
scores=[]
mean_act=np.zeros(len(languages)) #for normalization purposes if needed
i_batch=0
while(i_batch<nb_batchs_iter):
above_thr=[count_lang[lang]>=max_examples_language for lang in languages]
if all(above_thr):
break
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
#mean_act=np.sum(y_, axis=0)
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
if mode_label == TRUE_LANGUAGE:
if count_lang[lang0]>=max_examples_language:
continue
count_lang[lang0] += 1
if count_lang[lang0] == max_examples_language:
st_info+=f'{lang0}: max examples reached at seq {ind_batch}\n'
print(f'{lang0}: max examples reached at seq {ind_batch}')
elif mode_label == PREDICTED_LANGUAGE:
if count_lang[lang]>=max_examples_language:
continue
count_lang[lang] += 1
if count_lang[lang] == max_examples_language:
st_info+=f'{lang}: max examples reached at seq {ind_batch}\n'
print(f'{lang}: max examples reached at seq {ind_batch}')
languages_true_list.append(lang0)
languages_predicted_list.append(lang)
if embeddings_Hellinger:
scores.append(np.sqrt(y_[i]))
else:
scores.append(y_[i])
i_batch+=1
#mean_act/=i_batch
#mean_act_copy=np.copy(mean_act)
scores_arr=np.stack(scores)
#checkpoint
checkpoint = tf.train.Checkpoint(embedding=tf.Variable(scores_arr))
checkpoint.save(os.path.join(embFolder, "embeddings.ckpt"))
#metadata
with open(os.path.join(embFolder, 'labels_true.tsv'), "w") as f:
for lang in languages_true_list:
f.write("{}\n".format(lang))
with open(os.path.join(embFolder, 'labels_predicted.tsv'), "w") as f:
for lang in languages_predicted_list:
f.write("{}\n".format(lang))
#save all data to csv
with open(os.path.join(embFolder, 'data.csv'), "w") as f:
csvWriter=csv.writer(f, delimiter='\t')
csvWriter.writerow(['label_true', 'label_predicted']+languages)
for i in range(len(languages_true_list)):
row=[languages_true_list[i], languages_predicted_list[i]]
row+=list(scores[i])
csvWriter.writerow(row)
with open(f'{embFolder}/info.txt', 'w') as f:
f.write(st_info)
Russian: max examples reached at seq 1320
Finnish: max examples reached at seq 1491
Hungarian: max examples reached at seq 1829
Arabic: max examples reached at seq 1867
Japanese: max examples reached at seq 1887
Mandarin: max examples reached at seq 1965
Polish: max examples reached at seq 2010
Basque: max examples reached at seq 2170
Danish: max examples reached at seq 2274
French: max examples reached at seq 2351
Swedish: max examples reached at seq 2385
Dutch: max examples reached at seq 2460
Turkish: max examples reached at seq 2728
Korean: max examples reached at seq 2890
Spanish: max examples reached at seq 3353
English: max examples reached at seq 3741
Italian: max examples reached at seq 4305
Estonian: max examples reached at seq 4416
Catalan: max examples reached at seq 4467
German: max examples reached at seq 4758
Portuguese: max examples reached at seq 6579
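The saved checkpoint can be explored with TensorBoard's projector by pointing `tensorboard --logdir` at `embFolder`. As a quick offline check, the sketch below computes a 2-D PCA of the score embeddings kept in `scores_arr` (sklearn's PCA is assumed to be available; it is not used elsewhere in this notebook).
if save_embeddings:
    #Sketch: quick 2-D PCA view of the saved score embeddings
    from sklearn.decomposition import PCA
    emb_2d = PCA(n_components=2).fit_transform(scores_arr)
    pl.figure(figsize=(8, 8))
    pl.scatter(emb_2d[:, 0], emb_2d[:, 1], s=4, alpha=0.5)
    pl.title('PCA of {} score embeddings'.format('Hellinger' if embeddings_Hellinger else 'raw'))
    pl.show()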
max_batchs=10000//batch_size #limit analysis to a certain number of batches #np.inf if no limitation
Correlation matrix / histogram of activations
if do_similarity_analyses:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
corr_matrix=np.zeros((len(languages), len(languages))) #r coefficient
hist_act=np.zeros((nb_batchs_iter*batch_size, len(languages))) #non normalized activation scores
hist_act_true_lang=[]
hist_act_predicted_lang=[]
if do_pairwise_analysis:
pairwise_corr=np.zeros((len(languages), len(languages)))
pairwise_count=np.zeros((len(languages), len(languages)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
hist_act_true_lang.append(lang0)
hist_act_predicted_lang.append(lang)
corr_matrix+= np.sum(np.expand_dims(y_, 1)*np.expand_dims(y_, 2), axis=0)
#for i in range(batch_size):
# corr_matrix+=np.outer(y_[i], y_[i])
hist_act[i_batch*batch_size:(i_batch+1)*batch_size]=y_
if do_pairwise_analysis:
modelBis = Model(inputs=model.input, outputs=model.get_layer('lstm_2').output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind0=np.argmax(y[i])
for j in range(i+1, batch_size):
ind0bis=np.argmax(y[j])
ii=ind0
jj=ind0bis
corr=np.sum(act_h[i]*act_h[j])
pairwise_corr[ii][jj]+=corr
pairwise_corr[jj][ii]+=corr
pairwise_count[ii][jj]+=1
pairwise_count[jj][ii]+=1
i_batch+=1
corr_matrix/=i_batch
dev=np.sqrt(corr_matrix.diagonal())
corr_matrix/=np.outer(dev, dev) #normalization by deviations
hist_act/=np.sum(hist_act, axis=0)
if do_pairwise_analysis:
pairwise_corr/=(pairwise_count+1e-4)
#proximity measures based on activation histograms
if do_similarity_analyses:
prox_d_kl=np.zeros((len(languages), len(languages)))
prox_d_hell=np.zeros((len(languages), len(languages)))
prox_d_bhat=np.zeros((len(languages), len(languages)))
for i in range(len(languages)):
for j in range(len(languages)):
p_i=hist_act[:, i]+1e-8
p_j=hist_act[:, j]+1e-8
prox_d_kl[i][j]=np.sum(p_i*np.log2(p_i/p_j))
prox_d_kl[j][i]=np.sum(p_j*np.log2(p_j/p_i))
prox_d_hell[i][j]=np.sqrt(np.sum((np.sqrt(p_i)-np.sqrt(p_j))**2))
prox_d_hell[j][i]=prox_d_hell[i][j]
prox_d_bhat[i][j]=-2*np.log2(np.sum(np.sqrt(p_i*p_j)))
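For reference, writing $p_i$ for the column of `hist_act` associated with language $i$ (each column is normalized to sum to 1), the three dissimilarities computed above are $D_{KL}(p_i \| p_j) = \sum_x p_i(x) \log_2 \frac{p_i(x)}{p_j(x)}$, $d_H(p_i, p_j) = \sqrt{\sum_x (\sqrt{p_i(x)} - \sqrt{p_j(x)})^2}$ (i.e. $\sqrt{2}$ times the usual Hellinger distance) and $d_B(p_i, p_j) = -2\log_2 \sum_x \sqrt{p_i(x)\,p_j(x)}$ (a base-2, doubled variant of the Bhattacharyya distance); the small constant 1e-8 only avoids divisions by zero and $\log 0$.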
def plot_distance_matrix(dm, classes, normalize=False, title='Distance matrix', cmap=pl.cm.Blues, vmin=0, vmax=4, invert_colors=False):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
dm = dm.astype('float') / (0.00001+dm.sum(axis=1)[:, np.newaxis])
dm = np.round(dm*100, decimals=2)
#print(cm)
#pl.imshow(-np.log2(dm), interpolation='nearest', cmap=cmap, vmax=-3,vmin=-5)
#pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmax=-8,vmin=-17)
if not(invert_colors):
pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmin=-vmax, vmax=-vmin)
else:
pl.imshow(dm, interpolation='nearest', cmap=cmap, vmin=vmin, vmax=vmax)
pl.title(title)
#pl.colorbar()
tick_marks = np.arange(len(classes))
pl.xticks(tick_marks, classes, rotation=45)
pl.yticks(tick_marks, classes)
thresh = dm.max()*(1-2*invert_colors) / 2.
for i, j in itertools.product(range(dm.shape[0]), range(dm.shape[1])):
pl.text(j, i, int(dm[i,j]*100)*1./100,
horizontalalignment="center",
color="white" if dm[i, j]*(1-2*invert_colors) < thresh else "black")
#pl.tight_layout()
pl.ylabel('Label1 (reference)')
pl.xlabel('Label2')
def permut_mat(mat, permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]):
conf_matrix=mat
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
return languages_permut, conf_matrix_permut
if do_similarity_analyses:
#pl.figure(figsize=(10, 10))
#plot_distance_matrix(corr_matrix, languages, title='Correlation matrix', vmin=0.05, vmax=0.5)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_hell_permut=permut_mat(prox_d_hell)
plot_distance_matrix(prox_d_hell_permut, languages_permut, title='Dissimilarity matrix (Hellinger distance)', vmin=0.8, vmax=1.3)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_kl_permut=permut_mat(prox_d_kl)
plot_distance_matrix(prox_d_kl_permut, languages_permut, title='Dissimilarity matrix (KL div)', vmin=3, vmax=15)
pl.figure(figsize=(10, 10))
plot_distance_matrix( (prox_d_kl_permut-prox_d_kl_permut.T)/(prox_d_kl_permut+1e-3)*100, languages_permut, title='KL div, diff transpose', vmin=-20, vmax=20, invert_colors=True)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_bhat_permut=permut_mat(prox_d_bhat)
plot_distance_matrix(prox_d_bhat_permut, languages_permut, title='Dissimilarity matrix (Bhattacharyya distance)', vmin=0, vmax=8)
Multidimensional scaling (metric MDS here; a nonmetric variant is commented out below)
See http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
import itertools
from sklearn import cluster
from sklearn import manifold
from sklearn.metrics import pairwise_distances
mds = manifold.MDS(n_components=2, metric=True,
verbose=0, dissimilarity='precomputed', n_init=10)
#NON METRIC
#mds = manifold.MDS(n_components=2, metric=False,
# n_init=30, max_iter=300,
# verbose=0, eps=0.001, dissimilarity='precomputed')
dist_measure_str = "Hellinger distance"
dm=prox_d_hell
dist_measure_str = "(symm.) KL divergence"
dm=prox_d_kl
languages2=languages
'''
print('not shown: Hungarian, Finnish, (and Polish?)')
#HACK delete hungarian
dm=np.delete(dm, 6, axis=0)
dm=np.delete(dm, 6, axis=1)
languages2=languages[0:6]+languages[7::]
#HACK delete Finnish
dm=np.delete(dm, 3, axis=0)
dm=np.delete(dm, 3, axis=1)
languages2=languages2[0:3]+languages2[4::]
#HACK delete Polish
dm=np.delete(dm, 9, axis=0)
dm=np.delete(dm, 9, axis=1)
languages2=languages2[0:9]+languages2[10::]
'''
coord_pts = mds.fit_transform((dm.T+dm)/2) #symmetrize if necessary
delta = 0.01
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(languages2):
ax.annotate(txt, coord_pts[i]+(delta, delta))
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
stress=np.sqrt(mds.stress_/(np.sum(dm**2)/2))
dm2=pairwise_distances(coord_pts)
stress2=np.sqrt(np.sum((dm2-dm)**2)/np.sum(dm**2))
print(f'stress (from mds.stress_) : {stress}')
print(f'stress (recomputed from the embedded distances) : {stress2}')
for i in range(len(languages2)):
act_diss=dm[i]
diff=(dm2[i]-dm[i])
lang=languages2[i]
print(f'{lang.rjust(10)}\t actual dissimilarity: {np.sum(act_diss):.3f} \t difference: {np.sum(np.abs(diff)):.3f} \t percent diff.: {np.sum(np.abs(diff))/np.sum(act_diss)*100:.2f} %')
'''
coord_pts = mds.fit_transform((dm_selected_modified.T+dm_selected_modified)/2)
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(selected_languages):
ax.annotate(txt, coord_pts[i]+(delta, delta))
dist_measure_str = "KL divergence" if dist_measure == D_KL else "Hellinger distance"
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
'''
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-f653be17cd70> in <module>
     17
     18 dist_measure_str = "Hellinger distance"
---> 19 dm=prox_d_hell
     20
     21 dist_measure_str = "(symm.) KL divergence"

NameError: name 'prox_d_hell' is not defined
Note the 'hole' in the MDS map: Japanese, Mandarin, Korean, Polish
pl.imshow(-dm2)
pl.colorbar()
pl.figure()
pl.imshow(-dm)
pl.colorbar()
Some clustering
delta=10.
n_clusters = 6
spec_clustering = cluster.SpectralClustering(n_clusters=n_clusters,
affinity="precomputed")
if do_similarity_analyses:
dm=(prox_d_kl+prox_d_kl.T)/2
similarity_matrix=np.exp(- dm**2 / (2. * delta ** 2))
clusters = spec_clustering.fit_predict(similarity_matrix)
print("All languages : ")
for i in range(n_clusters):
cluster_labels = [languages[j] for j in np.where(clusters == i)[0]]
print("> cluster {} : {} \n".format(i, cluster_labels))
plt=pl #alias: the clustering cells below use the usual name plt
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_,
counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
if do_similarity_analyses:
# Ward based on hist_act
# setting distance_threshold=0 ensures we compute the full tree.
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None) #affinity='precomputed'
#XXX hist_act or sqrt hist_act ??
model_clustering = model_clustering.fit(np.sqrt(hist_act.T))
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Ward + Hellinger distance")
plt.show()
#DM #try different linkage methods
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=(prox_d_kl+prox_d_kl.T)/2
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on symmetrized D_KL ")
plt.show()
# Bhattacharyya distance
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_bhat
#dm+=16*prox_d_hell
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Bhattacharyya distance ")
plt.show()
max_batchs=np.inf #no limitation on the number of batches
if compute_confusion_matrix:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
#lang_true=[]
#lang_predicted=[]
conf_matrix=np.zeros((len(languages), len(languages)))
conf_matrix_filtered=np.zeros((len(languages_test), len(languages_test)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout_conf_matrix) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for j in range(batch_size):
ind0=np.argmax(y[j])
ind=np.argmax(y_[j])
lang0=languages[ind0]
lang=languages[ind]
#lang_true.append(lang0)
#lang_predicted.append(lang)
conf_matrix[ind0][ind]+=1
ind0bis=np.argmax(y[j][inds_lang_test])
indbis=np.argmax(y_[j][inds_lang_test])
conf_matrix_filtered[ind0bis][indbis]+=1
i_batch+=1
if compute_confusion_matrix:
#plot_distance_matrix(conf_matrix, languages, vmin=0, vmax=20, normalize=True, invert_colors=True)
permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
if useRamus:
plot_distance_matrix(conf_matrix_filtered, languages_test, vmin=0, vmax=5, invert_colors=True,title='Confusion matrix')
permut=[0,1,2,5,3,6,7, 4]
conf_matrix_filtered_permut=np.zeros_like(conf_matrix_filtered)
for i, ind in enumerate(permut):
conf_matrix_filtered_permut[i]=conf_matrix_filtered[ind][permut]
languages_test_permut=[languages_test[ind] for ind in permut]
pl.figure(figsize=(6,6))
pl.ylim([7.5, -0.5])
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , vmin=0, vmax=5, invert_colors=True, title='Confusion matrix')
pl.figure(figsize=(6,6))
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , normalize=True, vmin=0, vmax=50, invert_colors=True, title='Confusion matrix')
pl.ylim([7.5, -0.5])
else:
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut , normalize=True, vmin=0, vmax=10, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut ,normalize=True, vmin=0, vmax=30,
cmap=pl.cm.Purples, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
#pl.savefig('conf_matrix_train.svg')
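As a compact numerical complement to the plots, the sketch below (not in the original notebook) derives the overall accuracy and per-language recall from `conf_matrix`.
if compute_confusion_matrix:
    #Sketch: overall accuracy and per-language recall from the confusion matrix
    totals = conf_matrix.sum(axis=1)
    print(f'overall accuracy: {np.trace(conf_matrix)/conf_matrix.sum()*100:.2f} %')
    for i, lang in enumerate(languages):
        if totals[i] > 0:
            print(f'{lang.rjust(10)} : recall {conf_matrix[i, i]/totals[i]*100:5.1f} %  ({int(totals[i])} examples)')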