import numpy as np
import matplotlib as mpl
#mpl.rcParams['text.usetex'] = True
'''mpl.rcParams['axes.titlesize']=24
mpl.rcParams['axes.labelsize']= 20
mpl.rcParams['lines.linewidth']= 3
mpl.rcParams['font.size']= 14
mpl.rcParams['lines.markersize']= 10
mpl.rcParams['xtick.labelsize']= 16
mpl.rcParams['ytick.labelsize']= 16
'''
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['lines.linewidth'] = 2.5
mpl.rcParams['font.size'] = 12
mpl.rcParams['lines.markersize'] = 8
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14
import matplotlib.pyplot as pl
pl.style.use('seaborn-deep')
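# NB: matplotlib >= 3.6 renamed the bundled seaborn styles; there this style
# is available as 'seaborn-v0_8-deep'.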
from scipy.stats import pearsonr
from create_dictionaries import create_dictionaries
import copy
import csv
import json
import itertools
ALL_CELLS=0
SELECTED_CELLS=1
# Parameters
# Balanced set (from the training dataset) -- alternative, superseded below
#activations_name = "weights0904-2d-30Hz-newinputs-dataaugmentation-30epochs_ALL" #"2_150_multiple_dropout_ALL"
saveFigs=False
# Further parameters: conservative, which units/metrics
activations_name = "weights_2_150_voiced_unvoiced_bis_TEST_ALL" # only recordings from the test set
mode_act=ALL_CELLS
# Ramus corpus
activations_folder_Ramus=f'../activations/Scores_Ramus/{activations_name}'
#activations_folder_Ramus="../activations/Scores_Ramus/weights_2020-04-07"
metrics_folder='./corpus_ramus/Files/'
activations_folder=f'../activations/Scores/{activations_name}'
# In the original code, d_act holds activations on the training set and datay those on the Ramus corpus
_, _, _, d_act, _, _, _ = create_dictionaries(activations_folder=activations_folder,
                                              metrics_folder=metrics_folder)
#_, d_match, datay, _, _, D, _ = create_dictionaries(activations_folder=activations_folder_Ramus,
#                                                    metrics_folder=metrics_folder)
# the Ramus corpus is not considered here
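# Shape of d_act as inferred from its uses below (keys taken from this file;
# the concrete values are only illustrative):
#   d_act[filename] = {
#       'label': 'French',                # language of the recording
#       'activations': {                  # per-layer unit activations
#           'lstm_1': {'cell_states': [...], 'outputs': [...]},
#           'lstm_2': {'cell_states': [...], 'outputs': [...]},
#       },
#   }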
# cell on the x-axis
xlstm='lstm_2'
xcelltype='outputs'
xcellnumber='35'
xlabel=xlstm+" "+xcelltype+" "+xcellnumber
# cell on the y-axis
ylstm='lstm_1'
ycelltype='cell_states'
ycellnumber='3'
ylabel=ylstm+" "+ycelltype+" "+ycellnumber
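# The scatter below shows one point per language: the mean activation of the
# x-axis unit against the mean activation of the y-axis unit, with
# standard-error bars (std / sqrt(n)).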
plt=pl
with pl.style.context('seaborn-deep'):
    plt.figure(figsize=(8,6))
'''
# cell on the x-axis (Ramus files)
Daudiox={}
for langue in D:
    Daudiox[langue]={}
    for filename in D[langue]:
        if filename in d_match:
            fileaudio=d_match[filename]
            activ=datay[fileaudio.split('.')[0]][xlstm][xcelltype][int(xcellnumber)]
            Daudiox[langue][fileaudio]=float(activ)
# cell on the y-axis (Ramus files)
Daudioy={}
for langue in D:
    Daudioy[langue]={}
    for filename in D[langue]:
        if filename in d_match:
            fileaudio=d_match[filename]
            activ=datay[fileaudio.split('.')[0]][ylstm][ycelltype][int(ycellnumber)]
            Daudioy[langue][fileaudio]=float(activ)
# Figure
for langue in D: # red points correspond to files from F. Ramus's database
    std=np.std(list(Daudiox[langue].values()))
    xsterr=std/np.sqrt(len(list(Daudiox[langue].values())))
    x=np.mean(list(Daudiox[langue].values()))
    std=np.std(list(Daudioy[langue].values()))
    ysterr=std/np.sqrt(len(list(Daudioy[langue].values())))
    y=np.mean(list(Daudioy[langue].values()))
    plt.errorbar(x,y,label=langue,fmt=".r",xerr=xsterr,yerr=ysterr,capsize=2)
    plt.text(x+0.0005,y+0.0005,langue)
'''
langues={'English':'en','French':'fr','Polish':'pol','Japanese':'ja',
'Catalan':'cat','Spanish':'esp','Dutch':'du','Italian':'it'}
# cell on the x-axis
dlanguesx={}
for file in list(d_act.keys()):
    language=d_act[file]['label']
    if language not in dlanguesx:
        dlanguesx[language]=[]
    if mode_act == ALL_CELLS:
        xcellnumber = int(xcellnumber)
        activ=float(d_act[file]['activations'][xlstm][xcelltype][xcellnumber])
        dlanguesx[language].append(activ)
# cell on the y-axis
dlanguesy={}
for file in list(d_act.keys()):
    language=d_act[file]['label']
    if language not in dlanguesy:
        dlanguesy[language]=[]
    if mode_act == ALL_CELLS:
        ycellnumber = int(ycellnumber)
        activ=float(d_act[file]['activations'][ylstm][ycelltype][ycellnumber])
        dlanguesy[language].append(activ)
# Figure (files outside F. Ramus's database)
for language in dlanguesy:
    std=np.std(dlanguesx[language])
    xsterr=std/(np.sqrt(len(dlanguesx[language])))
    std=np.std(dlanguesy[language])
    ysterr=std/(np.sqrt(len(dlanguesy[language])))
    x=np.mean(dlanguesx[language])
    y=np.mean(dlanguesy[language])
    if language in langues: # blue: languages present in F. Ramus's database
        plt.errorbar(x,y,label=language,fmt=".b",xerr=xsterr,yerr=ysterr,capsize=2)
        plt.text(x+0.0001,y+0.0002,language)
    else: # black: the other languages
        plt.errorbar(x,y,label=language,fmt=".k",xerr=xsterr,yerr=ysterr,capsize=2)
        plt.text(x+0.0001,y+0.0002,language)
title='Language distribution according to '+xlabel+', '+ylabel
plt.title(title)
plt.ylabel(ylabel)
plt.xlabel(xlabel)
name=activations_name+'_distrib_'+xlabel.split(' ')[2]+'_'+ylabel.split(' ')[2]+'_ramus.svg'
if saveFigs:
    plt.savefig(f'./corpus_ramus/Figures/{name}')
#plt.close()
#plt.show()
langues={'English':'en','French':'fr','Polish':'pol','Japanese':'ja',
'Catalan':'cat','Spanish':'esp','Dutch':'du','Italian':'it'}
langues_reversed = {v: k for k, v in langues.items()}
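# `langues` maps full language names to their codes in the Ramus corpus;
# `langues_reversed` inverts it (code -> full name).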
# Import coeffs
conservative=True
#activations_name=activations_folder_Ramus.split('/')[-1]
json_filename=activations_name
if conservative:
    json_filename+='_conservative'
with open(f'./corpus_ramus/coeffs/{json_filename}.json', 'r') as f:
    coeffs_dic=json.load(f)
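# Expected layout of the coefficients file, inferred from the lookups below
# (metric and model names as used elsewhere in this file):
#   coeffs_dic = {
#       'nPVI_V': {'enet':  {'coeffs': [...], 'intercept': 0.0},
#                  'lasso': {'coeffs': [...], 'intercept': 0.0}},
#       ...
#   }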
metrique_x = 'nPVI_V'
metrique_y = 'rPVI_C'
layer='lstm_2'
celltype='outputs'
model = 'enet' #lasso #enet
plt=pl
with pl.style.context('seaborn-deep'):
    plt.figure(figsize=(10,8))
coeffs_x=np.array(coeffs_dic[metrique_x][model]['coeffs'])
intercept_x=coeffs_dic[metrique_x][model]['intercept']
coeffs_y=np.array(coeffs_dic[metrique_y][model]['coeffs'])
intercept_y=coeffs_dic[metrique_y][model]['intercept']
# text offsets for the language labels, scaled per metric
if metrique_x in ['deltC', 'deltV']:
    shift_x=0.00002
elif metrique_x in ['propV', 'nPVI_V']:
    shift_x=0.02
elif metrique_x in ['rPVI_C']:
    shift_x=0.002
if metrique_y in ['deltC', 'deltV']:
    shift_y=0.00005
elif metrique_y in ['propV', 'nPVI_V']:
    shift_y=0.05
elif metrique_y in ['rPVI_C']:
    shift_y=0.005
xlabel=metrique_x
ylabel=metrique_y
'''
# Ramus files
Daudiox={}
Daudioy={}
for langue in D:
    Daudiox[langue]={}
    Daudioy[langue]={}
    for filename in D[langue]:
        if filename in d_match:
            fileaudio=d_match[filename]
            activ=np.array([float(st) for st in datay[fileaudio.split('.')[0]][layer][celltype]])
            Daudiox[langue][fileaudio]=intercept_x + np.dot(coeffs_x, activ)
            Daudioy[langue][fileaudio]=intercept_y + np.dot(coeffs_y, activ)
# Figure
for langue in D: # red points correspond to files from F. Ramus's database
    std=np.std(list(Daudiox[langue].values()))
    xsterr=std/np.sqrt(len(list(Daudiox[langue].values())))
    x=np.mean(list(Daudiox[langue].values()))
    std=np.std(list(Daudioy[langue].values()))
    ysterr=std/np.sqrt(len(list(Daudioy[langue].values())))
    y=np.mean(list(Daudioy[langue].values()))
    plt.errorbar(x,y,label=langue,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='orangered')
    plt.text(x+shift_x,y+shift_y,langue)
    dic_lang_ramus[langues_reversed[langue]][f'ramus_{metrique_x}']=x
    dic_lang_ramus[langues_reversed[langue]][f'ramus_{metrique_y}']=y
'''
# predicted metrics on the x- and y-axes
dlanguesx={}
dlanguesy={}
for file in list(d_act.keys()):
    language=d_act[file]['label']
    if language not in dlanguesx:
        dlanguesx[language]=[]
        dlanguesy[language]=[]
    activ=np.array([float(st) for st in d_act[file]['activations'][layer][celltype]])
    dlanguesx[language].append(intercept_x + np.dot(coeffs_x, activ))
    dlanguesy[language].append(intercept_y + np.dot(coeffs_y, activ))
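# Each point is a linear readout of the layer's activation vector:
# predicted metric = intercept + coeffs . activations, i.e. the regression
# (lasso / elastic net) from unit activations to the rhythm metric.
# A minimal sketch of that decoding step (hypothetical helper, not part of
# the original code):
def predict_metric(activations, coeffs, intercept):
    """Linear decoding of a rhythm metric from an activation vector."""
    return intercept + np.dot(np.asarray(coeffs), np.asarray(activations))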
# Figure (files outside F. Ramus's database)
for language in dlanguesy:
    std=np.std(dlanguesx[language])
    xsterr=std/(np.sqrt(len(dlanguesx[language])))
    std=np.std(dlanguesy[language])
    ysterr=std/(np.sqrt(len(dlanguesy[language])))
    x=np.mean(dlanguesx[language])
    y=np.mean(dlanguesy[language])
    if language in langues: # blue: languages present in F. Ramus's database
        plt.errorbar(x,y,label=language,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='blue')
        plt.text(x+shift_x,y+shift_y,language)
    else: # black: the other languages
        plt.errorbar(x,y,label=language,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='black')
        plt.text(x+shift_x,y+shift_y,language)
title='Language distribution according to the activations correlated with '+xlabel+', '+ylabel + f' ({model})'
#plt.title(title)
if ylabel=='deltC':
    pl.ylabel(r'Correlated with $\Delta C$')
else:
    plt.ylabel(ylabel)
if xlabel=='propV' or xlabel=='deltV':
    pl.xlabel('Correlated with %V' if xlabel=='propV' else r'Correlated with $\Delta V$')
else:
    plt.xlabel(xlabel)
name=activations_name+'_distrib_'+xlabel+'_'+ylabel+'_'+model
if conservative:
    name=name+"_conservative"
name=name+'.svg'
if saveFigs:
    plt.savefig(f'./corpus_ramus/Figures/{name}')
#plt.close()
#plt.show()
activations_name
# Out: 'weights_2_150_voiced_unvoiced_bis_TEST_ALL'
# Features array
use_selected_languages=True
selected_languages=['French', 'English', 'Mandarin', 'Russian', 'Spanish'] #Loukina2011 (except Greek)
#selected_languages=['Dutch','English','French','Polish','Spanish','Italian', 'Catalan', 'Japanese'] #RAMUS
list_labels=[] #languages
metric_names=['propV', 'deltC', 'nPVI_V', 'rPVI_C']
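# These are the standard speech-rhythm metrics (Ramus et al.; Grabe & Low):
# propV = %V, the proportion of vocalic intervals; deltC = std of consonantal
# interval durations; nPVI_V / rPVI_C = normalized / raw pairwise variability
# indices over vocalic / consonantal intervals.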
array_metrics=[] #as list of list for construction
list_scores_nn=[] #List scores (neural network): scores as dict lang -> score
for file in list(d_act.keys()):
    language=d_act[file]['label']
    if language not in ['Czech', 'Romanian'] and (not use_selected_languages or language in selected_languages): # not in the list of model languages
        features=[]
        list_labels.append(language)
        activ=np.array([float(st) for st in d_act[file]['activations'][layer][celltype]])
        for metric in metric_names:
            intercept_x=coeffs_dic[metric][model]['intercept']
            coeffs_x=np.array(coeffs_dic[metric][model]['coeffs'])
            feature=intercept_x + np.dot(coeffs_x, activ)
            features.append(feature)
        array_metrics.append(features)
        with open(f'{activations_folder}/{file}', 'r') as json_file:
            data_json=json.load(json_file)
        list_scores_nn.append(data_json["scores"])
array_metrics=np.array(array_metrics)
if use_selected_languages:
    for l in selected_languages:
        print(f' nb occurrences {l}: {list_labels.count(l)}')
# nb occurrences French: 51
# nb occurrences English: 90
# nb occurrences Mandarin: 74
# nb occurrences Russian: 91
# nb occurrences Spanish: 56
# Target vector
#languages=set(list_labels)
languages=list(list_scores_nn[0].keys())
target=[]
scores_nn_arr=[]
selected_languages = selected_languages if use_selected_languages else languages
target_multarr=np.zeros((len(list_labels), len(selected_languages)))
for i, (label, scores) in enumerate(zip(list_labels, list_scores_nn)):
    ind=selected_languages.index(label)
    target.append(ind)
    target_multarr[i, ind]=1
    scores_nn_arr.append([float(scores[lang]) for lang in languages])
target=np.array(target)
scores_nn_arr=np.array(scores_nn_arr)
# column indices of the selected languages within the NN score ordering
selected_languages_ind=[languages.index(l) for l in selected_languages]
selected_languages_ind=np.array(selected_languages_ind)
# QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy import linalg
#QDA=QuadraticDiscriminantAnalysis
#model_qda=QDA(store_covariance=True)
#model_qda.fit(array_metrics, target)
from joblib import dump, load
model_qda=load('qda.joblib')
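# The commented lines above show how 'qda.joblib' is assumed to have been
# produced in an earlier run, e.g.:
#   model_qda = QuadraticDiscriminantAnalysis(store_covariance=True)
#   model_qda.fit(array_metrics, target)
#   dump(model_qda, 'qda.joblib')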
#%matplotlib inline  # IPython magic (notebook only)
# plot ellipses (2-D cross-sections of the fitted QDA Gaussians)
def plot_ellipse(splot, mean, cov, lang, axes=(0,1)):
    # restrict the full covariance to the two plotted feature axes
    cov=np.array([[cov[axes[0], axes[0]], cov[axes[0], axes[1]]],
                  [cov[axes[1], axes[0]], cov[axes[1], axes[1]]]])
    mean=np.array([mean[axes[0]], mean[axes[1]]])
    v, w = linalg.eigh(cov)  # eigen-decomposition gives the ellipse axes
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled Gaussian at X standard deviation
    ell = mpl.patches.Ellipse(mean, 0.2 * v[0] ** 0.5, 0.2 * v[1] ** 0.5,
                              angle=180 + angle,  # facecolor=color,
                              edgecolor='black', linewidth=2)
    ell.set_clip_box(splot.bbox)
    #ell.set_alpha(0.2)
    #splot.scatter(mean[0], mean[1])
    splot.add_artist(ell)
    #splot.set_xticks(())
    #splot.set_yticks(())
pl.figure(figsize=(10,10))
splot=pl.gca()
for lang, mean, cov in zip(languages, model_qda.means_, model_qda.covariance_):
    axes=(2,3)
    mean2=np.array([mean[axes[0]], mean[axes[1]]])
    pl.scatter(mean2[0], mean2[1])
    pl.text(mean2[0], mean2[1], lang)
    plot_ellipse(splot, mean, cov, lang, axes=axes)
pl.show()
if not use_selected_languages:
    print(model_qda.score(array_metrics, target)) # accuracy
from sklearn.metrics import top_k_accuracy_score
proba_=model_qda.predict_proba(array_metrics)
if use_selected_languages:
    proba_=proba_[:, selected_languages_ind]
print( top_k_accuracy_score(target, proba_, k=1)) #Accuracy (QDA)
print( top_k_accuracy_score(target, proba_, k=3)) #top-3 accuracy (QDA)
# 0.5027624309392266
# 0.8839779005524862
scores_nn_arr_selected=scores_nn_arr[:, selected_languages_ind]
print( top_k_accuracy_score(target, scores_nn_arr_selected, k=1)) #Accuracy (NN)
print( top_k_accuracy_score(target, scores_nn_arr_selected, k=3)) #top-3 accuracy (NN)
# 0.6215469613259669
# 0.9226519337016574
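# top_k_accuracy_score counts a sample as correct when the true class is among
# the k highest-scoring classes; k=1 reduces to plain accuracy.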
from sklearn.metrics import confusion_matrix
if use_selected_languages:
    est_=np.argmax(proba_, axis=1)
    M=confusion_matrix(target, est_, normalize='true')
def plot_distance_matrix(dm, classes, title='Confusion matrix', cmap=pl.cm.Blues, vmin=0, vmax=1, invert_colors=False):
    if not(invert_colors):
        pl.imshow(-dm, interpolation='nearest', cmap=cmap, vmin=-vmax, vmax=-vmin)
    else:
        pl.imshow(dm, interpolation='nearest', cmap=cmap, vmin=vmin, vmax=vmax)
    pl.title(title)
    #pl.colorbar()
    tick_marks = np.arange(len(classes))
    pl.xticks(tick_marks, classes, rotation=45)
    pl.yticks(tick_marks, classes)
    thresh = dm.max()*(1-2*invert_colors) / 2.
    for i, j in itertools.product(range(dm.shape[0]), range(dm.shape[1])):
        pl.text(j, i, int(dm[i,j]*100)*1./100,
                horizontalalignment="center",
                color="white" if dm[i, j]*(1-2*invert_colors) < thresh else "black")
    #pl.tight_layout()
    pl.ylabel('Label1 (reference)')
    pl.xlabel('Label2')
if use_selected_languages:
    plot_distance_matrix(M, selected_languages, invert_colors=True, vmax=0.5)
# Measures of information
cross_ent_nn=0
cross_ent_qda=0
acc_nn=0
jsd=0
tv=0
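# The loop below compares the NN scores (P) with the QDA posterior (Q):
# - cross entropy: mean of -log2 p(true language);
# - Jensen-Shannon divergence: 0.5*KL(P||M) + 0.5*KL(Q||M) with M = (P+Q)/2;
#   the code uses the identity -log2((1+Q/P)/2) = log2(P/M), equivalent to
#   jsd = 0.5*np.sum(P*np.log2(P/M)) + 0.5*np.sum(Q*np.log2(Q/M));
# - total variation: here the full L1 distance, sum_i |P_i - Q_i| (max 2).
# Caution: scores_nn is ordered by `languages` while `target` indexes
# `selected_languages`; the two orderings coincide only when
# use_selected_languages is False, which the low NN figures below reflect.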
for i in range(len(list_labels)):
    scores_nn=scores_nn_arr[i]
    features=array_metrics[i]
    scores_qda=model_qda.predict_proba(features[np.newaxis, :])[0]
    ind=target[i]
    ind_nn=np.argmax(scores_nn)
    acc_nn+= (ind_nn)==ind
    cross_ent_nn+=-np.log2(scores_nn[ind])
    cross_ent_qda+=-np.log2(scores_qda[ind])
    jsd+=-0.5*np.sum(scores_nn*np.log2( (1+scores_qda/scores_nn)/2))
    jsd+=-0.5*np.sum(scores_qda*np.log2( (1+scores_nn/scores_qda)/2))
    tv+=np.sum(np.abs(scores_nn-scores_qda))
    if i>10 and i<20:
        pl.subplot(10, 1, i-10)
        pl.bar(range(len(languages)), scores_nn, alpha=0.5)
        pl.bar(np.arange(len(languages))+0.25, scores_qda, alpha=0.5)
cross_ent_nn/=len(list_labels)
cross_ent_qda/=len(list_labels)
acc_nn/=len(list_labels)
jsd/=len(list_labels)
tv/=len(list_labels)
print(f'cross entropy NN :{cross_ent_nn:.4f} (perplexity {2**cross_ent_nn:.4f})')
print(f'accuracy NN :{acc_nn:.2f}')
print(f'cross entropy QDA :{cross_ent_qda:.4f} (perplexity {2**cross_ent_qda:.4f})')
print(f'mean jensen-shannon divergence :{jsd:.4f} bit')
print(f'mean total variation :{tv:.3f} ')
# cross entropy NN :8.8183 (perplexity 451.4198)
# accuracy NN :0.02
# cross entropy QDA :6.3163 (perplexity 79.6886)
# mean jensen-shannon divergence :0.3647 bit
# mean total variation :1.132