import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as pl
mpl.rcParams['axes.titlesize']=20
mpl.rcParams['axes.labelsize']= 20
mpl.rcParams['lines.linewidth']= 4
mpl.rcParams['font.size']= 15
mpl.rcParams['lines.markersize']= 11
mpl.rcParams['xtick.labelsize']= 16
mpl.rcParams['ytick.labelsize']= 16
import tensorflow as tf
import tensorflow.keras.optimizers as optimizers
from model import *
from input import NetworkInput
from data import createFeaturesDescription
from config import *
from tensorflow.keras.models import Model
from datetime import datetime
from IPython.display import Audio
import soundfile as sf
import os
import json
import csv
import itertools
from tensorboard.plugins import projector
from sklearn import cluster
from scipy.cluster.hierarchy import dendrogram
run_with_papermill=False
batch_size=16 #16 #512
stride=2
use_F0=False
F0_binary_values=False
use_deltas=False
hidden_size=128
num_layers=2
weights_filename='./logdir/2020-11-20_18-39-24-2_150_multiple_dropout/weights/weights_2020-11-20_18-39-24.h5'
weights_name='2_150_multiple_dropout'
useExScores=False
useRamus=False
use1dScores=False
useBalancedDataSet=False
useTest=False
balanced_dataset_folder='balanced_20_1'
evaluate_model=False
save_activations=False #save activations to json files #mode ALL_CELLS or SELECTED_CELLS defined later
save_embeddings=True #save embeddings as checkpoint file (for tensorboard)
do_similarity_analyses=True
compute_confusion_matrix=False
embeddings_Hellinger=True #embeddings based on sqrt of output
force_dropout=True #for embeddings, similarity analyses
force_dropout_conf_matrix=False #force dropout for confusion matrix
#For the examples of outputs, set manually
keep_prob=1
keep_prob_recurrent=1
keep_prob_dense_layer=1
#note: other useful parameters are set later in the code, e.g. max_batchs for the analyses
max_examples_language=120 #max examples per language for the tSNE embeddings
do_pairwise_analysis=False #pairwise analysis within every batch
# Parameters
run_with_papermill = True
hidden_size = 180
use_deltas = True
use_F0 = True
F0_binary_values = False
useBalancedDataSet = True
keep_prob = 0.8
keep_prob_recurrent = 1
keep_prob_dense_layer = 1
weights_filename = "./logdir/2020-12-02_12-27-00-2_180_w_deltas_dropout_20_w_F0/weights/weights_2020-12-02_12-27-00.h5"
weights_name = "weights_2_180_F0"
compute_confusion_matrix = False
max_examples_language = 150
save_embeddings=False
if not(run_with_papermill): #manual settings
force_dropout=True
keep_prob=0.7
keep_prob_dense_layer=0.9
keep_prob_recurrent=0.9
useBalancedDataSet=True
#useRamus=True
use_deltas=True
hidden_size=150
evaluate_model=False
save_embeddings=False
save_activations=True
do_similarity_analyses=False
compute_confusion_matrix=False
weights_config={
'num_steps':32,
'features_description':createFeaturesDescription(F0=use_F0),
'stride':stride
}
load_weights=True
#old models (old inputs)
#1D models
#weights_filename="./models/weights0420/weights20epochs.h5"
#weights_filename, weights_name="./models/weights0420b/weights_2020-04-07_17-55-05.h5", "weights_2020-04-07"
#weights_config['features_description']=createFeaturesDescription(HRmsValue=False, F0=False)
#2D models
#weights_filename, weights_name='./models/weights0715-2d-60Hz/weights_2020-07-15_12-15-21.h5', 'weights0715-2d-60Hz'
#weights_filename, weights_name='./models/weights0717-2d-60Hz-64/weights_2020-07-17_17-35-40.h5', 'weights0717-2d-60Hz-64'
#weights_config['features_description']=createFeaturesDescription(F0=False)
#weights_config['stride']=1
#weights_filename, weights_name="./models/weights0710-3d/weights_2020-07-03_17-40-31.h5", "weights0710-3d"
num_steps=weights_config["num_steps"]
features_description=weights_config['features_description']
stride=weights_config['stride']
if not(force_dropout or force_dropout_conf_matrix): #dropout not forced anywhere: reset keep probabilities to 1
keep_prob_dense_layer=1
keep_prob=1
config=Config(batch_size, num_steps, hidden_size=hidden_size,
num_layers=num_layers,
keep_prob=keep_prob, keep_prob_recurrent=keep_prob_recurrent,
keep_prob_dense_layer=keep_prob_dense_layer)
config=completedConfig(config) #take default params for unspecified params
WARNING:root:No cell type specified in config: using LSTM
languages = ["Danish", "Dutch", "English", "Finnish",
"French", "German", "Hungarian", "Italian",
"Japanese", "Korean", "Mandarin", "Polish",
"Portuguese", "Russian", "Spanish",
"Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
"Basque", "Catalan"] #NB: check that the order of elements is consistent with model
#Remove languages with not enough data
languages.remove("Czech")
languages.remove("Romanian")
#languages_dataset=languages+['Romanian'] #None -> autodetect, languages-> same as model (defined above)
languages_dataset=None
#scores Folder #default: "./Scores"
assert useExScores+useRamus+use1dScores+useBalancedDataSet+useTest == 1, "choose a unique dataset"
if useExScores:
scores_folder='./ex_Scores'
elif useRamus:
scores_folder='./Scores_Ramus'
elif use1dScores:
scores_folder='./Scores_1d'
elif useTest or useBalancedDataSet:
scores_folder='./Scores'
max_files_evaluation= 2024 #np.inf
# FIRST VERSION
#languages = ['Danish', 'Russian', 'Mandarin', 'Finnish', 'Dutch', 'English', 'Hungarian', 'Swedish',
# 'Italian', 'French', 'Japanese', 'German', 'Portuguese', 'Polish', 'Spanish', 'Korean']
sets ={}
set_folds=[0]
if useExScores or useRamus:
sets_folds={"test":[0]}
elif useBalancedDataSet:
sets_folds={"test":[0]} #subfolder defined later
else:
sets_folds = {"train" : [0, 1, 2],
"test":[3,4],
"test1" : [3],
"test2" : [4]}
initial_sample_length=3*2**14 if useRamus else 10*2**14
TFRecords_batch_size=1 if useRamus else 16
set_name='train'
if set_name in sets_folds:
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder,
subfolder=subfolders,
stride=stride, verbose=True, for_evaluation=True,
languages=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #TRAINING SET BUT FOR EVALUATION
set_name='test'
if useRamus:
subfolders=[""]
elif useBalancedDataSet:
subfolders=[balanced_dataset_folder]
else:
subfolders=["fold_{}/".format(k_fold) for k_fold in sets_folds[set_name]]
sets[set_name] = NetworkInput(config, folder=scores_folder, for_evaluation=True,
subfolder=subfolders,
stride=stride, verbose=True,
languages=languages, languages_model=languages, name=set_name, features_description=features_description,
initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
use_deltas=use_deltas,
F0_binary_values=F0_binary_values) #autodetect languages
'''
sets_folds = {"train" : [0, 1, 2],
"test1" : [3],
"test2" : [4]}
sets ={}
sets_folds={"train":sets_folds["train"]}
for set_name, set_folds in sets_folds.items():
print("{} : folds {}".format(set_name, set_folds))
sets[set_name] = NetworkInput(config, folder='./Scores',
subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],
stride=stride, verbose=True,
languages=languages, name=set_name)
'''
WARNING ; some languages in the dataset are not considered by the model
DATASET test
for evaluation only (test/validation set)
Data augmentation: off.
Input params/info:
  sampling frequency of inputs : 31.25 Hz
  sample length : 320 (initial sample length : 163840, step : 256, stride : 2)
  sample duration : 10.24 s
  batch size : 16
  num slices by example: 10 (num timesteps by slices: 32)
WARNING ; some languages in the dataset are not considered by the model
languages (total: 21): 0: Danish, 1: Dutch, 2: English, 3: Finnish, 4: French, 5: German, 6: Hungarian, 7: Italian, 8: Japanese, 9: Korean, 10: Mandarin, 11: Polish, 12: Portuguese, 13: Russian, 14: Spanish, 15: Swedish, 16: Turkish, 17: Estonian, 18: Arabic, 19: Basque, 20: Catalan
(Sub)folders: ['balanced_20_1']
Total number of examples - test - : 6720 (420 batchs)
Per language : 320 examples (4.76 %) for each of the 21 languages
input depth (nb features) : 3 x2 (using deltas) = 6
'\nsets_folds = {"train" : [0, 1, 2],\n "test1" : [3],\n "test2" : [4]}\n \n \nsets ={}\n\nsets_folds={"train":sets_folds["train"]}\n\nfor set_name, set_folds in sets_folds.items():\n print("{} : folds {}".format(set_name, set_folds))\n sets[set_name] = NetworkInput(config, folder=\'./Scores\', \n subfolder=["fold_{}/".format(k_fold) for k_fold in set_folds],\n stride=stride, verbose=True, \n languages=languages, name=set_name)\n'
inds_lang_test=list(np.flatnonzero(sets['test'].frequencies))
filter_lang_test=np.array(sets['test'].frequencies)>0
languages_test= [languages[i] for i in inds_lang_test]
networkInput=sets["test"]
model=build_model(config, networkInput, return_state=True) #return_state will be useful to retrieve cell states
model.summary()
Model: "model_3" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_4 (InputLayer) [(16, 32, 6)] 0 __________________________________________________________________________________________________ lstm_1 (LSTM) [(16, 32, 180), (16, 134640 input_4[0][0] __________________________________________________________________________________________________ lstm_2 (LSTM) [(16, 32, 180), (16, 259920 lstm_1[0][0] __________________________________________________________________________________________________ dropout_1 (Dropout) (16, 32, 180) 0 lstm_2[0][0] __________________________________________________________________________________________________ time_distributed_1 (TimeDistrib (16, 32, 21) 3801 dropout_1[0][0] __________________________________________________________________________________________________ input_5 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ input_6 (InputLayer) [(None, 1)] 0 __________________________________________________________________________________________________ softmax_1 (Softmax) (16, 32, 21) 0 time_distributed_1[0][0] ================================================================================================== Total params: 398,361 Trainable params: 398,361 Non-trainable params: 0 __________________________________________________________________________________________________
if load_weights:
model.load_weights(weights_filename)
#METRICS
#acc_end_seq=AccuracyStateless(networkInput, includeSampleWeights=False)
acc_slices=[AccuracyStateless(networkInput, ind_batch_compute=k) for k in range(networkInput.num_slices_by_example)]
top3_slices=[TopKAccuracyStateless(networkInput, k=3, ind_batch_compute=j) for j in range(networkInput.num_slices_by_example)]
metricsList=[#accuracy_on_last_step, top_k_accuracy_on_last_step_partial(k=3)
KL_div_on_last_step, cross_entropy_on_last_step]
metricsList+=acc_slices
metricsList+=top3_slices
KLLoss=tf.keras.losses.KLDivergence()
model.compile(loss=KLLoss, metrics=metricsList)
if evaluate_model:#EVALUATION
true_nb_batches=networkInput.nbr_batchs*networkInput.num_slices_by_example
max_nb_batches=max_files_evaluation/config.batch_size*networkInput.num_slices_by_example
nb_steps=int(np.minimum(max_nb_batches, true_nb_batches)) #model.evaluate expects an integer number of steps
forgetStates=Forget_states_callback(networkInput, model, verbose=False)
callbacksList=[forgetStates]
metrics_end=model.evaluate(networkInput.sliced_batch, verbose=1, steps=nb_steps,callbacks=callbacksList)
def print_top5(st, y, y_): #NB: despite the name, prints the top 7 candidates
ind0=np.argmax(y)
ind=np.argsort(-y_)
print(f"{st}\nlanguage: {languages[ind0]}")
st=" "
for k in range(7):
st+=f"{k+1}: {languages[ind[k]]}, "
st+='\n'
#for k, lang in enumerate(languages):
# print(lang)
# print(y_[k].numpy())
print(st)
def gen_yy_(return_all_outputs=False):
'''return_all_outputs: if True, returns all the outputs associated with each example (at every slice and every step)'''
model.reset_states()
batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example)
#option 1, use y_=model(x, training=False)
#option 2 (first axis has size batch_size x steps)
#predictions=model.predict(batch, steps = networkInput.num_slices_by_example)
outputs=[]
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) #, training=False
outputs.append(y_)
y=y.numpy()[:,-1]
y_=y_[:,-1]
res=[]
for k in range(networkInput.config.batch_size):
if return_all_outputs:
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], [outputs_2[k] for outputs_2 in outputs] ))
else:
res.append((filenames[k][0].numpy().decode('utf-8'),
y[k], y_[k]))
return res
for i in range(min(5, 10//batch_size*5+1)):
batch_yy_ = gen_yy_()
for k in range( min(5, batch_size)):
print_top5(*batch_yy_[k])
Creating 'look-up tables' for filenames
fold_0_CommonVoice_9acfd55862ec238_9acfd55862ec238_slice1
language: Mandarin
  1: Mandarin, 2: German, 3: Spanish, 4: Dutch, 5: Catalan, 6: Italian, 7: French,
fold_0_WLI_file_24_kor-5ebf26c7ff0be2dc73b71bb8586bf78d_65
language: Korean
  1: Korean, 2: Basque, 3: Swedish, 4: Mandarin, 5: Danish, 6: Spanish, 7: Catalan,
fold_0_librivox_reader2909_reader2909_6719_sec2_44
language: Swedish
  1: Swedish, 2: Dutch, 3: German, 4: Polish, 5: Spanish, 6: Arabic, 7: Danish,
fold_0_CommonVoice_ab69353623c69f0_ab69353623c69f0_slice2
language: Turkish
  1: Basque, 2: French, 3: Turkish, 4: Spanish, 5: Italian, 6: Catalan, 7: Portuguese,
fold_2_CommonVoice_fa7f67d93b2f3a6_fa7f67d93b2f3a6_slice25
language: Estonian
  1: Estonian, 2: Spanish, 3: Russian, 4: Italian, 5: Basque, 6: Turkish, 7: Catalan,
test audio
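A minimal sketch for listening to a test recording in the notebook, using the soundfile and IPython.display imports loaded at the top; the audio path below is a hypothetical placeholder and should be replaced by an actual file from the corpus.
from IPython.display import display

# Hypothetical placeholder path: point it to a real audio file from the dataset.
audio_path = './audio/example_test_clip.wav'
if os.path.exists(audio_path):
    waveform, sample_rate = sf.read(audio_path)  # soundfile returns (samples, sampling rate)
    display(Audio(waveform, rate=sample_rate))   # inline audio player in the notebook
else:
    print(f'no audio file at {audio_path}; adjust audio_path to listen to a test clip')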
def cell_st(cell_type):
if cell_type==LSTM_CELL:
return "lstm"
elif cell_type==GRU_CELL:
return "gru"
nb_batchs_iter=networkInput.nbr_batchs if save_activations else 1 #number of batches (before slicing) to iterate over
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
dic_list=[{} for i in range(nb_batchs_iter*config.batch_size)]
i_batch=0
#specific activations
selected_cells={'lstm_2': {
'cell_states': [3, 4, 92, 115, 116, 121],
'outputs': [3, 4, 92, 115, 116, 121]
},
'lstm_1':{
'outputs': [],
'cell_states': []
}
}
#mode, save all cells or selected cells
SELECTED_CELLS=1
ALL_CELLS=0
mode=ALL_CELLS
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
for l in range(config.num_layers): #NB: very inefficient because the network activations are computed several times
layerName=f'{cell_st(config.cell_type)}_{l+1}'
if l==0: #also add output scores
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
filename=filenames[i][0].numpy().decode('utf-8')
if useRamus: #HACK
filename="_".join(filename.split("_")[2:])
dic_list[ind_batch]['filename']=filename
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
dic_list[ind_batch]['label']=languages[ind0]
dic_list[ind_batch]['predicted']=languages[ind]
dic_list[ind_batch]['activations']={}
if mode==ALL_CELLS:
dic_list[ind_batch]['scores']={}
for j, lang in enumerate(languages):
dic_list[ind_batch]['scores'][lang]=str(y_[i][j])
modelBis = Model(inputs=model.input, outputs=model.get_layer(layerName).output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
dic_list[ind_batch]['activations'][layerName]={}
if mode==SELECTED_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']={}
for j in selected_cells[layerName]['outputs']:
dic_list[ind_batch]['activations'][layerName]['outputs'][str(j)]=str(act_h.numpy()[i][j])
dic_list[ind_batch]['activations'][layerName]['cell_states']={}
for j in selected_cells[layerName]['cell_states']:
dic_list[ind_batch]['activations'][layerName]['cell_states'][str(j)]=str(act_c.numpy()[i][j])
elif mode==ALL_CELLS:
dic_list[ind_batch]['activations'][layerName]['outputs']=[str(x) for x in act_h.numpy()[i]]
dic_list[ind_batch]['activations'][layerName]['cell_states']=[str(x) for x in act_c.numpy()[i]]
i_batch+=1
#save activations to files
if save_activations:
for i in range(nb_batchs_iter*batch_size):
dic=dic_list[i]
filename=dic['filename']
mode_text = 'ALL' if mode==ALL_CELLS else 'SELECTED'
jsonFolder=f'./activations/{scores_folder}/{weights_name}_{mode_text}/'
os.makedirs(jsonFolder, exist_ok=True)
jsonFilename=f'{jsonFolder}{filename}.json'
with open(jsonFilename, 'w') as f:
json.dump(dic, f, indent=4)
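For quick inspection, a saved activation file can be reloaded; a minimal sketch, assuming save_activations was enabled above (so that jsonFolder is defined), with a hypothetical filename and LSTM cells (layer name 'lstm_2').
if save_activations:
    # Hypothetical filename: replace with one of the JSON files written above.
    example_json = os.path.join(jsonFolder, 'some_example.json')
    if os.path.exists(example_json):
        with open(example_json) as f:
            saved = json.load(f)
        print(saved['filename'], ':', saved['label'], '->', saved['predicted'])
        # In ALL_CELLS mode, 'outputs' and 'cell_states' are lists of stringified floats.
        cell_states = [float(v) for v in saved['activations']['lstm_2']['cell_states']]
        print('number of cells in lstm_2:', len(cell_states))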
if force_dropout:
dropout_flag=f'dropout_{int(round(100*(1-keep_prob_dense_layer)))}'
else:
dropout_flag='no_dropout'
if embeddings_Hellinger:
Hell_flag='_Hellinger'
else:
Hell_flag=''
embFolder=f'./embeddings/{scores_folder}/{weights_name}/{dropout_flag}{Hell_flag}/'
TRUE_LANGUAGE=0
PREDICTED_LANGUAGE=1
mode_label=PREDICTED_LANGUAGE #both label files are written either way; the choice only affects the max-examples-per-language strategy
max_nb_batchs_iter=np.inf
if save_embeddings:
if not os.path.exists(embFolder):
os.makedirs(embFolder)
proj_config = projector.ProjectorConfig()
#proj_config.model_checkpoint_path = embeddings_ckpt_name
#TODO diff tensors with diff. datasets?
embeddings = proj_config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embeddings.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
if mode_label == TRUE_LANGUAGE:
embeddings.metadata_path = 'labels_true.tsv'
elif mode_label == PREDICTED_LANGUAGE:
embeddings.metadata_path = 'labels_predicted.tsv'
projector.visualize_embeddings(embFolder, proj_config)
# save checkpoint/metadata
if save_embeddings:
st_info=''
nb_batchs_iter=min(networkInput.nbr_batchs, max_nb_batchs_iter)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
languages_true_list=[]
languages_predicted_list=[]
count_lang=dict([(lang, 0) for lang in languages])
scores=[]
mean_act=np.zeros(len(languages)) #for normalization purposes if needed
i_batch=0
while(i_batch<nb_batchs_iter):
above_thr=[count_lang[lang]>=max_examples_language for lang in languages]
if all(above_thr):
break
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
#mean_act=np.sum(y_, axis=0)
for i in range(batch_size):
ind_batch=i_batch*batch_size+i
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
if mode_label == TRUE_LANGUAGE:
if count_lang[lang0]>=max_examples_language:
continue
count_lang[lang0] += 1
if count_lang[lang0] == max_examples_language:
st_info+=f'{lang0}: max examples reached at seq {ind_batch}\n'
print(f'{lang0}: max examples reached at seq {ind_batch}')
elif mode_label == PREDICTED_LANGUAGE:
if count_lang[lang]>=max_examples_language:
continue
count_lang[lang] += 1
if count_lang[lang] == max_examples_language:
st_info+=f'{lang}: max examples reached at seq {ind_batch}\n'
print(f'{lang}: max examples reached at seq {ind_batch}')
languages_true_list.append(lang0)
languages_predicted_list.append(lang)
if embeddings_Hellinger:
scores.append(np.sqrt(y_[i]))
else:
scores.append(y_[i])
i_batch+=1
#mean_act/=i_batch
#mean_act_copy=np.copy(mean_act)
scores_arr=np.stack(scores)
#checkpoint
checkpoint = tf.train.Checkpoint(embedding=tf.Variable(scores_arr))
checkpoint.save(os.path.join(embFolder, "embeddings.ckpt"))
#metadata
with open(os.path.join(embFolder, 'labels_true.tsv'), "w") as f:
for lang in languages_true_list:
f.write("{}\n".format(lang))
with open(os.path.join(embFolder, 'labels_predicted.tsv'), "w") as f:
for lang in languages_predicted_list:
f.write("{}\n".format(lang))
#save all data to csv
with open(os.path.join(embFolder, 'data.csv'), "w") as f:
csvWriter=csv.writer(f, delimiter='\t')
csvWriter.writerow(['label_true', 'label_predicted']+languages)
for i in range(len(languages_true_list)):
row=[languages_true_list[i], languages_predicted_list[i]]
row+=list(scores[i])
csvWriter.writerow(row)
with open(f'{embFolder}/info.txt', 'w') as f:
f.write(st_info)
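As a sanity check, the checkpoint written above can be restored to confirm the embedding tensor, and TensorBoard can then be pointed at embFolder to browse the projector. A minimal sketch, assuming save_embeddings was enabled:
if save_embeddings:
    # Restore the embedding variable from the checkpoint saved above and check its shape.
    restored_embedding = tf.Variable(np.zeros_like(scores_arr))
    tf.train.Checkpoint(embedding=restored_embedding).restore(tf.train.latest_checkpoint(embFolder))
    print('restored embedding shape:', restored_embedding.shape)  # (num examples, num languages)
    # To explore the projector, run e.g.:  tensorboard --logdir <embFolder>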
max_batchs=10000//batch_size #limit analysis to a certain number of batches #np.inf if no limitation
Correlation matrix / histogram of activations
if do_similarity_analyses:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
corr_matrix=np.zeros((len(languages), len(languages))) #r coefficient
hist_act=np.zeros((nb_batchs_iter*batch_size, len(languages))) #non normalized activation scores
hist_act_true_lang=[]
hist_act_predicted_lang=[]
if do_pairwise_analysis:
pairwise_corr=np.zeros((len(languages), len(languages)))
pairwise_count=np.zeros((len(languages), len(languages)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout) # training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for i in range(batch_size):
ind0=np.argmax(y[i])
ind=np.argmax(y_[i])
lang0=languages[ind0]
lang=languages[ind]
hist_act_true_lang.append(lang0)
hist_act_predicted_lang.append(lang)
corr_matrix+= np.sum(np.expand_dims(y_, 1)*np.expand_dims(y_, 2), axis=0)
#for i in range(batch_size):
# corr_matrix+=np.outer(y_[i], y_[i])
hist_act[i_batch*batch_size:(i_batch+1)*batch_size]=y_
if do_pairwise_analysis:
modelBis = Model(inputs=model.input, outputs=model.get_layer('lstm_2').output)
modelBis.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
act_seq, act_h, act_c=modelBis(trueBatch) #training=False
for i in range(batch_size):
ind0=np.argmax(y[i])
for j in range(i+1, batch_size):
ind0bis=np.argmax(y[j])
ii=ind0
jj=ind0bis
corr=np.sum(act_h[i]*act_h[j])
pairwise_corr[ii][jj]+=corr
pairwise_corr[jj][ii]+=corr
pairwise_count[ii][jj]+=1
pairwise_count[jj][ii]+=1
i_batch+=1
corr_matrix/=i_batch
dev=np.sqrt(corr_matrix.diagonal())
corr_matrix/=np.outer(dev, dev) #normalization by deviations
hist_act/=np.sum(hist_act, axis=0)
if do_pairwise_analysis:
pairwise_corr/=(pairwise_count+1e-4)
#proximity measures based on activation histograms
if do_similarity_analyses:
prox_d_kl=np.zeros((len(languages), len(languages)))
prox_d_jensen=np.zeros((len(languages), len(languages)))
prox_d_hell=np.zeros((len(languages), len(languages)))
prox_d_bhat=np.zeros((len(languages), len(languages)))
for i in range(len(languages)):
for j in range(len(languages)):
p_i=hist_act[:, i]+1e-8
p_j=hist_act[:, j]+1e-8
p_mean=(p_i+p_j)/2
prox_d_kl[i][j]=np.sum(p_i*np.log2(p_i/p_j))
prox_d_kl[j][i]=np.sum(p_j*np.log2(p_j/p_i))
prox_d_jensen[i][j]=0.5*(np.sum(p_i*np.log2(p_i/p_mean))+np.sum(p_j*np.log2(p_j/p_mean)))
prox_d_jensen[j][i]=prox_d_jensen[i][j]
prox_d_hell[i][j]=np.sqrt(np.sum((np.sqrt(p_i)-np.sqrt(p_j))**2))
prox_d_hell[j][i]=prox_d_hell[i][j]
prox_d_bhat[i][j]=-2*np.log2(np.sum(np.sqrt(p_i*p_j)))
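As a quick sanity check of the four dissimilarity measures used above (with the same conventions as the code: log base 2, no 1/sqrt(2) factor in the Hellinger-type distance), here is a toy example on two arbitrary hand-made distributions:
# Toy example: the four measures on two arbitrary 3-bin distributions.
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.3, 0.4, 0.3])
m = (p + q) / 2
d_kl = np.sum(p * np.log2(p / q))                                        # KL divergence (asymmetric)
d_js = 0.5 * (np.sum(p * np.log2(p / m)) + np.sum(q * np.log2(q / m)))   # Jensen-Shannon divergence
d_hell = np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))                 # Hellinger-type distance
d_bhat = -2 * np.log2(np.sum(np.sqrt(p * q)))                            # Bhattacharyya-type distance
print(f'KL: {d_kl:.3f}, JS: {d_js:.3f}, Hellinger: {d_hell:.3f}, Bhattacharyya: {d_bhat:.3f}')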
def plot_distance_matrix(dm, classes, normalize=False, title='Distance matrix', cmap=pl.cm.Blues, vmin=0, vmax=4, invert_colors=False):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
dm = dm.astype('float') / (0.00001+dm.sum(axis=1)[:, np.newaxis])
dm = np.round(dm*100, decimals=2)
#print(cm)
#pl.imshow(-np.log2(dm), interpolation='nearest', cmap=cmap, vmax=-3,vmin=-5)
#pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmax=-8,vmin=-17)
if not(invert_colors):
pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmin=-vmax, vmax=-vmin)
else:
pl.imshow(dm, interpolation='nearest', cmap=cmap, vmin=vmin, vmax=vmax)
pl.title(title)
#pl.colorbar()
tick_marks = np.arange(len(classes))
pl.xticks(tick_marks, classes, rotation=45)
pl.yticks(tick_marks, classes)
thresh = dm.max()*(1-2*invert_colors) / 2.
for i, j in itertools.product(range(dm.shape[0]), range(dm.shape[1])):
pl.text(j, i, int(dm[i,j]*100)*1./100,
horizontalalignment="center",
color="white" if dm[i, j]*(1-2*invert_colors) < thresh else "black")
#pl.tight_layout()
pl.ylabel('Label1 (reference)')
pl.xlabel('Label2')
def permut_mat(mat, permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]):
conf_matrix=mat
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
return languages_permut, conf_matrix_permut
if do_similarity_analyses:
#pl.figure(figsize=(10, 10))
#plot_distance_matrix(corr_matrix, languages, title='Correlation matrix', vmin=0.05, vmax=0.5)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_hell_permut=permut_mat(prox_d_hell)
plot_distance_matrix(prox_d_hell_permut, languages_permut, title='Dissimilarity matrix (Hellinger distance)', vmin=0.8, vmax=1.3)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_kl_permut=permut_mat(prox_d_kl)
plot_distance_matrix(prox_d_kl_permut, languages_permut, title='Dissimilarity matrix (KL div)', vmin=3, vmax=15)
pl.figure(figsize=(10, 10))
plot_distance_matrix( (prox_d_kl_permut-prox_d_kl_permut.T)/(prox_d_kl_permut+1e-3)*100, languages_permut, title='KL div, diff transpose', vmin=-20, vmax=20, invert_colors=True)
pl.figure(figsize=(10, 10))
languages_permut, prox_d_bhat_permut=permut_mat(prox_d_bhat)
plot_distance_matrix(prox_d_bhat_permut, languages_permut, title='Dissimilarity matrix (Bhattacharyya distance)', vmin=0, vmax=8)
Multidimensional scaling (metric MDS; a non-metric variant is left commented out below)
See http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling
import itertools
from sklearn import cluster
from sklearn import manifold
from sklearn.metrics import pairwise_distances
mds = manifold.MDS(n_components=2, metric=True,
verbose=0, dissimilarity='precomputed', n_init=10)
#NON METRIC
#mds = manifold.MDS(n_components=2, metric=False,
# n_init=30, max_iter=300,
# verbose=0, eps=0.001, dissimilarity='precomputed')
if do_similarity_analyses:
dist_measure_str = "Hellinger distance"
dm=prox_d_hell
dist_measure_str = "(symm.) KL divergence"
dm=prox_d_kl
languages2=languages
'''
print('not shown: Hungarian, Finnish, (and Polish?)')
#HACK delete hungarian
dm=np.delete(dm, 6, axis=0)
dm=np.delete(dm, 6, axis=1)
languages2=languages[0:6]+languages[7::]
#HACK delete Finnish
dm=np.delete(dm, 3, axis=0)
dm=np.delete(dm, 3, axis=1)
languages2=languages2[0:3]+languages2[4::]
#HACK delete Polish
dm=np.delete(dm, 9, axis=0)
dm=np.delete(dm, 9, axis=1)
languages2=languages2[0:9]+languages2[10::]
'''
coord_pts = mds.fit_transform((dm.T+dm)/2) #symmetrize if necessary
delta = 0.01
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(languages2):
ax.annotate(txt, coord_pts[i]+(delta, delta))
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
stress=np.sqrt(mds.stress_/(np.sum(dm**2)/2))
dm2=pairwise_distances(coord_pts)
stress2=np.sqrt(np.sum((dm2-dm)**2)/np.sum(dm**2))
print(f'stress : {stress}')
print(f'stress : {stress2}')
for i in range(len(languages2)):
act_diss=dm[i]
diff=(dm2[i]-dm[i])
lang=languages2[i]
print(f'{lang.rjust(10)}\t actual dissimilarity: {np.sum(act_diss):.3f} \t difference: {np.sum(np.abs(diff)):.3f} \t percent diff.: {np.sum(np.abs(diff))/np.sum(act_diss)*100:.2f} %')
'''
coord_pts = mds.fit_transform((dm_selected_modified.T+dm_selected_modified)/2)
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(selected_languages):
ax.annotate(txt, coord_pts[i]+(delta, delta))
dist_measure_str = "KL divergence" if dist_measure == D_KL else "Hellinger distance"
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
'''
stress : 0.2819610127080249
stress : 0.2852081492786389
    Danish   actual dissimilarity: 300.958   difference: 78.363   percent diff.: 26.04 %
     Dutch   actual dissimilarity: 257.103   difference: 57.909   percent diff.: 22.52 %
   English   actual dissimilarity: 265.671   difference: 56.265   percent diff.: 21.18 %
   Finnish   actual dissimilarity: 306.161   difference: 72.399   percent diff.: 23.65 %
    French   actual dissimilarity: 236.010   difference: 54.075   percent diff.: 22.91 %
    German   actual dissimilarity: 253.626   difference: 54.753   percent diff.: 21.59 %
 Hungarian   actual dissimilarity: 345.660   difference: 83.323   percent diff.: 24.11 %
   Italian   actual dissimilarity: 237.796   difference: 50.873   percent diff.: 21.39 %
  Japanese   actual dissimilarity: 323.254   difference: 81.212   percent diff.: 25.12 %
    Korean   actual dissimilarity: 327.122   difference: 82.303   percent diff.: 25.16 %
  Mandarin   actual dissimilarity: 315.665   difference: 88.184   percent diff.: 27.94 %
    Polish   actual dissimilarity: 288.762   difference: 75.825   percent diff.: 26.26 %
Portuguese   actual dissimilarity: 228.465   difference: 48.201   percent diff.: 21.10 %
   Russian   actual dissimilarity: 244.728   difference: 64.032   percent diff.: 26.16 %
   Spanish   actual dissimilarity: 222.186   difference: 43.816   percent diff.: 19.72 %
   Swedish   actual dissimilarity: 254.501   difference: 63.148   percent diff.: 24.81 %
   Turkish   actual dissimilarity: 266.057   difference: 64.477   percent diff.: 24.23 %
  Estonian   actual dissimilarity: 297.532   difference: 68.256   percent diff.: 22.94 %
    Arabic   actual dissimilarity: 250.125   difference: 59.805   percent diff.: 23.91 %
    Basque   actual dissimilarity: 278.248   difference: 61.101   percent diff.: 21.96 %
   Catalan   actual dissimilarity: 268.953   difference: 59.042   percent diff.: 21.95 %
import itertools
from sklearn import cluster
from sklearn import manifold
from sklearn.metrics import pairwise_distances
mds = manifold.MDS(n_components=2, metric=True,
verbose=0, dissimilarity='precomputed', n_init=10)
#NON METRIC
#mds = manifold.MDS(n_components=2, metric=False,
# n_init=30, max_iter=300,
# verbose=0, eps=0.001, dissimilarity='precomputed')
dist_measure_str = "Hellinger distance"
dm=prox_d_hell
dist_measure_str= "Bhattacharyya distance "
dm=prox_d_bhat
#dist_measure_str = "(symm.) KL divergence"
#dm=prox_d_kl
#dist_measure_str= "Jensen-Shannon divergence"
#dm=prox_d_jensen
languages2=languages
'''print('not shown: Hungarian, Finnish, (and Polish?)')
#HACK delete hungarian
dm=np.delete(dm, 6, axis=0)
dm=np.delete(dm, 6, axis=1)
languages2=languages[0:6]+languages[7::]
#HACK delete Finnish
dm=np.delete(dm, 3, axis=0)
dm=np.delete(dm, 3, axis=1)
languages2=languages2[0:3]+languages2[4::]
#HACK delete Polish
dm=np.delete(dm, 9, axis=0)
dm=np.delete(dm, 9, axis=1)
languages2=languages2[0:9]+languages2[10::]
'''
idx_lang=[1,2,4,5,7,12,13,14,15,16,17,18,19,20] #subset kept for this plot (excludes Danish, Finnish, Hungarian, Japanese, Korean, Mandarin, Polish)
languages2=[languages[ind] for ind in idx_lang]
dm=dm[np.ix_(idx_lang, idx_lang)]
coord_pts = mds.fit_transform((dm.T+dm)/2) #symmetrize if necessary
delta = 0.01
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1], marker='+', color='black')
for i, txt in enumerate(languages2):
ax.annotate(txt, coord_pts[i]+(delta, delta))
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
stress=np.sqrt(mds.stress_/(np.sum(dm**2)/2))
dm2=pairwise_distances(coord_pts)
stress2=np.sqrt(np.sum((dm2-dm)**2)/np.sum(dm**2))
print(f'stress : {stress}')
print(f'stress : {stress2}')
for i in range(len(languages2)):
act_diss=dm[i]
diff=(dm2[i]-dm[i])
lang=languages2[i]
print(f'{lang.rjust(10)}\t actual dissimilarity: {np.sum(act_diss):.3f} \t difference: {np.sum(np.abs(diff)):.3f} \t percent diff.: {np.sum(np.abs(diff))/np.sum(act_diss)*100:.2f} %')
#pl.savefig('mds_hellinger_selected.svg')
#pl.savefig('mds_bhat_selected_stress_019.svg')
#pl.savefig('mds_d_kl_selected.svg')
#pl.savefig('mds_jensen_shannon_selected_stress024.svg')
'''
coord_pts = mds.fit_transform((dm_selected_modified.T+dm_selected_modified)/2)
fig = pl.figure(figsize=(10,10))
ax = pl.gca()
ax.scatter(coord_pts[:,0], coord_pts[:,1])
for i, txt in enumerate(selected_languages):
ax.annotate(txt, coord_pts[i]+(delta, delta))
dist_measure_str = "KL divergence" if dist_measure == D_KL else "Hellinger distance"
pl.title("Metric multidimensional scaling (computed with {})".format(dist_measure_str))
pl.plot()
'''
stress : 0.19491792950428946
stress : 0.19487657358164456
     Dutch   actual dissimilarity: 74.417    difference: 12.743   percent diff.: 17.12 %
   English   actual dissimilarity: 82.117    difference: 11.032   percent diff.: 13.44 %
    French   actual dissimilarity: 63.747    difference: 12.125   percent diff.: 19.02 %
    German   actual dissimilarity: 72.810    difference: 10.842   percent diff.: 14.89 %
   Italian   actual dissimilarity: 64.540    difference: 10.735   percent diff.: 16.63 %
Portuguese   actual dissimilarity: 67.282    difference: 12.084   percent diff.: 17.96 %
   Russian   actual dissimilarity: 68.423    difference: 15.013   percent diff.: 21.94 %
   Spanish   actual dissimilarity: 58.163    difference: 8.784    percent diff.: 15.10 %
   Swedish   actual dissimilarity: 90.959    difference: 14.128   percent diff.: 15.53 %
   Turkish   actual dissimilarity: 80.375    difference: 17.051   percent diff.: 21.21 %
  Estonian   actual dissimilarity: 109.560   difference: 12.315   percent diff.: 11.24 %
    Arabic   actual dissimilarity: 80.535    difference: 15.955   percent diff.: 19.81 %
    Basque   actual dissimilarity: 84.462    difference: 9.280    percent diff.: 10.99 %
   Catalan   actual dissimilarity: 74.830    difference: 11.412   percent diff.: 15.25 %
Note the 'hole': Japanese, Mandarin, Korean, Polish
if do_similarity_analyses:
pl.imshow(-dm2)
pl.colorbar()
pl.figure()
pl.imshow(-dm)
pl.colorbar()
Some clustering
delta=10. #width of the Gaussian kernel used to convert distances into similarities
n_clusters = 6
spec_clustering = cluster.SpectralClustering(n_clusters=n_clusters,
affinity="precomputed")
if do_similarity_analyses:
dm=(prox_d_kl+prox_d_kl.T)/2
similarity_matrix=np.exp(- dm**2 / (2. * delta ** 2))
clusters = spec_clustering.fit_predict(similarity_matrix)
print("All languages : ")
for i in range(n_clusters):
cluster_labels = [languages[j] for j in np.where(clusters == i)[0]]
print("> cluster {} : {} \n".format(i, cluster_labels))
All languages :
> cluster 0 : ['Spanish', 'Estonian', 'Basque', 'Catalan']
> cluster 1 : ['Hungarian', 'Japanese', 'Korean', 'Mandarin']
> cluster 2 : ['Dutch', 'English', 'German']
> cluster 3 : ['French', 'Italian', 'Portuguese', 'Turkish', 'Arabic']
> cluster 4 : ['Danish', 'Finnish', 'Swedish']
> cluster 5 : ['Polish', 'Russian']
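The partition above depends on the kernel width delta and on n_clusters; a small exploratory sketch (reusing the symmetrized KL matrix and the clustering call from above) to see how sensitive the clusters are to delta:
if do_similarity_analyses:
    # Re-run the spectral clustering for a few kernel widths and compare the partitions.
    dm_sym = (prox_d_kl + prox_d_kl.T) / 2
    for delta_try in [5., 10., 20.]:
        affinity = np.exp(-dm_sym**2 / (2. * delta_try**2))
        labels = cluster.SpectralClustering(n_clusters=n_clusters,
                                            affinity="precomputed").fit_predict(affinity)
        print(f'delta = {delta_try}')
        for i in range(n_clusters):
            print('  cluster {} : {}'.format(i, sorted(languages[j] for j in np.where(labels == i)[0])))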
plt=pl
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_,
counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
if do_similarity_analyses:
# Ward based on hist_act
# setting distance_threshold=0 ensures we compute the full tree.
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None) #affinity='precomputed'
#XXX hist_act or sqrt hist_act ??
model_clustering = model_clustering.fit(np.sqrt(hist_act.T))
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Ward + Hellinger distance")
#plt.savefig('cluster-ward.svg')
plt.show()
#DM #try different linkage methods
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=(prox_d_kl+prox_d_kl.T)/2
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
def labels_dendogram(k):
return languages[k]
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on symmetrized D_KL ")
#plt.savefig('cluster-dkl.svg')
plt.show()
#Jensen-Shannon + complete linkage
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_jensen
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Jensen-Shannon divergence ")
#plt.savefig('cluster-js.svg')
plt.show()
# Bhattacharyya distance
model_clustering = cluster.AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity='precomputed', linkage='complete' )
dm=prox_d_bhat
#dm+=16*prox_d_hell
model_clustering = model_clustering.fit(dm) #if dm, needs to add affinity='precomputed' and change linkage method
plt.title('Hierarchical Clustering Dendrogram')
# plot the dendrogram
plot_dendrogram(model_clustering, leaf_label_func=labels_dendogram, leaf_rotation=70) #, truncate_mode='level', p=3
plt.xlabel("Complete linkage based on Bhattacharyya distance ")
#plt.savefig('cluster-bhat.svg')
plt.show()
max_batchs=np.inf #no limit on the number of batches
if compute_confusion_matrix:
nb_batchs_iter=min(networkInput.nbr_batchs, max_batchs)
gen_batchs=iter(networkInput.sliced_batch.take(networkInput.num_slices_by_example*nb_batchs_iter))
#lang_true=[]
#lang_predicted=[]
conf_matrix=np.zeros((len(languages), len(languages)))
conf_matrix_filtered=np.zeros((len(languages_test), len(languages_test)))
i_batch=0
while(i_batch<nb_batchs_iter):
#batch=networkInput.sliced_batch.take(networkInput.num_slices_by_example) #defined with generator instead
batch=[next(gen_batchs) for slice_ in range(networkInput.num_slices_by_example)]
model.reset_states()
for trueBatch in batch:
x, y, w= trueBatch
filenames=x[2]
y_=model(trueBatch, training=force_dropout_conf_matrix) #training=False
y=y.numpy()[:,-1]
y_=y_.numpy()[:,-1]
for j in range(batch_size):
ind0=np.argmax(y[j])
ind=np.argmax(y_[j])
lang0=languages[ind0]
lang=languages[ind]
#lang_true.append(lang0)
#lang_predicted.append(lang)
conf_matrix[ind0][ind]+=1
ind0bis=np.argmax(y[j][inds_lang_test])
indbis=np.argmax(y_[j][inds_lang_test])
conf_matrix_filtered[ind0bis][indbis]+=1
i_batch+=1
if compute_confusion_matrix:
#plot_distance_matrix(conf_matrix, languages, vmin=0, vmax=20, normalize=True, invert_colors=True)
permut=[1,2,5, 13,12,4,16,18, 15,11, 14,19,20,7, 17, 0, 3, 6, 8, 9, 10]
conf_matrix_permut=np.zeros_like(conf_matrix)
for i, ind in enumerate(permut):
conf_matrix_permut[i]=conf_matrix[ind][permut]
languages_permut=[languages[ind] for ind in permut]
if useRamus:
plot_distance_matrix(conf_matrix_filtered, languages_test, vmin=0, vmax=5, invert_colors=True,title='Confusion matrix')
permut=[0,1,2,5,3,6,7, 4]
conf_matrix_filtered_permut=np.zeros_like(conf_matrix_filtered)
for i, ind in enumerate(permut):
conf_matrix_filtered_permut[i]=conf_matrix_filtered[ind][permut]
languages_test_permut=[languages_test[ind] for ind in permut]
pl.figure(figsize=(6,6))
pl.ylim([7.5, -0.5])
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , vmin=0, vmax=5, invert_colors=True, title='Confusion matrix')
pl.figure(figsize=(6,6))
plot_distance_matrix(conf_matrix_filtered_permut, languages_test_permut , normalize=True, vmin=0, vmax=50, invert_colors=True, title='Confusion matrix')
pl.ylim([7.5, -0.5])
else:
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut , normalize=True, vmin=0, vmax=10, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
pl.figure(figsize=(15,15))
plot_distance_matrix(conf_matrix_permut, languages_permut ,normalize=True, vmin=0, vmax=30,
cmap=pl.cm.Purples, invert_colors=True, title='Confusion matrix')
pl.ylim([20.5, -0.5])
#pl.savefig('conf_matrix.svg')