import numpy as np
import matplotlib as mpl
#mpl.rcParams['text.usetex'] = True
'''mpl.rcParams['axes.titlesize']=24
mpl.rcParams['axes.labelsize']= 20
mpl.rcParams['lines.linewidth']= 3
mpl.rcParams['font.size']= 14
mpl.rcParams['lines.markersize']= 10
mpl.rcParams['xtick.labelsize']= 16
mpl.rcParams['ytick.labelsize']= 16
'''
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['lines.linewidth'] = 2.5
mpl.rcParams['font.size'] = 12
mpl.rcParams['lines.markersize'] = 8
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14
import matplotlib.pyplot as pl
pl.style.use('seaborn-deep') #renamed 'seaborn-v0_8-deep' in matplotlib >= 3.6
from scipy.stats import pearsonr #scipy.stats.stats is a deprecated alias
from create_dictionaries import create_dictionaries
import copy
import csv
import json
ALL_CELLS=0
SELECTED_CELLS=1
# Parameters
#balanced set (from training dataset), kept as an alternative:
#activations_name = "weights0904-2d-30Hz-newinputs-dataaugmentation-30epochs_ALL" #"2_150_multiple_dropout_ALL"
activations_name = "weights_2_150_voiced_unvoiced_bis_ALL"
saveFigs=False
mode_act=ALL_CELLS
#further parameters: conservative, which units/metrics
#Ramus corpus
activations_folder_Ramus=f'../activations/Scores_Ramus/{activations_name}'
#activations_folder_Ramus="../activations/Scores_Ramus/weights_2020-04-07"
metrics_folder='./corpus_ramus/Files/'
activations_folder=f'../activations/Scores/{activations_name}'
#In original code, d_act is used for activations on training set and datay used for Ramus corpus
_, _, _, d_act, _, _, _ = create_dictionaries(activations_folder=activations_folder,
                                              metrics_folder=metrics_folder)
_, d_match, datay, _, _, D, _ = create_dictionaries(activations_folder=activations_folder_Ramus,
                                                    metrics_folder=metrics_folder)
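# Structure of the returned dictionaries, as inferred from their usage below
# (an assumption; not checked against create_dictionaries itself):
#   d_act[file]  = {'label': language, 'activations': {layer: {celltype: [unit values]}}}
#   datay[basename][layer][celltype][cell] -> activation for a Ramus-corpus file
#   d_match      : metrics filename -> audio filename (Ramus corpus)
#   D[langue]    : list of Ramus-corpus filenames for that language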
#cell on the x-axis
xlstm='lstm_2'
xcelltype='outputs'
xcellnumber='35'
xlabel=xlstm+" "+xcelltype+" "+xcellnumber
#cell on the y-axis
ylstm='lstm_1'
ycelltype='cell_states'
ycellnumber='3'
ylabel=ylstm+" "+ycelltype+" "+ycellnumber
plt=pl
with pl.style.context('seaborn-deep'):
    plt.figure(figsize=(10,8))
    #cell on the x-axis (Ramus files)
    Daudiox={}
    for langue in D:
        Daudiox[langue]={}
        for filename in D[langue]:
            if filename in d_match:
                fileaudio=d_match[filename]
                activ=datay[fileaudio.split('.')[0]][xlstm][xcelltype][int(xcellnumber)]
                Daudiox[langue][fileaudio]=float(activ)
    #cell on the y-axis (Ramus files)
    Daudioy={}
    for langue in D:
        Daudioy[langue]={}
        for filename in D[langue]:
            if filename in d_match:
                fileaudio=d_match[filename]
                activ=datay[fileaudio.split('.')[0]][ylstm][ycelltype][int(ycellnumber)]
                Daudioy[langue][fileaudio]=float(activ)
    #Figure
    for langue in D: #in red: points for the files from F. Ramus's database
        std=np.std(list(Daudiox[langue].values()))
        xsterr=std/np.sqrt(len(list(Daudiox[langue].values())))
        x=np.mean(list(Daudiox[langue].values()))
        std=np.std(list(Daudioy[langue].values()))
        ysterr=std/np.sqrt(len(list(Daudioy[langue].values())))
        y=np.mean(list(Daudioy[langue].values()))
        plt.errorbar(x,y,label=langue,fmt=".r",xerr=xsterr,yerr=ysterr,capsize=2)
        plt.text(x+0.0005,y+0.0005,langue)
    langues={'English':'en','French':'fr','Polish':'pol','Japanese':'ja',
             'Catalan':'cat','Spanish':'esp','Dutch':'du','Italian':'it'}
    #cell on the x-axis (DNN dataset)
    dlanguesx={}
    for file in list(d_act.keys()):
        language=d_act[file]['label']
        if language not in dlanguesx:
            dlanguesx[language]=[]
        if mode_act == ALL_CELLS:
            xcellnumber = int(xcellnumber)
            activ=float(d_act[file]['activations'][xlstm][xcelltype][xcellnumber])
            dlanguesx[language].append(activ)
    #cell on the y-axis (DNN dataset)
    dlanguesy={}
    for file in list(d_act.keys()):
        language=d_act[file]['label']
        if language not in dlanguesy:
            dlanguesy[language]=[]
        if mode_act == ALL_CELLS:
            ycellnumber = int(ycellnumber)
            activ=float(d_act[file]['activations'][ylstm][ycelltype][ycellnumber])
            dlanguesy[language].append(activ)
    #Figure (files outside F. Ramus's database)
    for language in dlanguesy:
        std=np.std(dlanguesx[language])
        xsterr=std/(np.sqrt(len(dlanguesx[language])))
        std=np.std(dlanguesy[language])
        ysterr=std/(np.sqrt(len(dlanguesy[language])))
        x=np.mean(dlanguesx[language])
        y=np.mean(dlanguesy[language])
        if language in langues: #in blue: languages also present in F. Ramus's database
            plt.errorbar(x,y,label=language,fmt=".b",xerr=xsterr,yerr=ysterr,capsize=2)
            plt.text(x+0.0001,y+0.0002,language)
        else: #in black: the other languages
            plt.errorbar(x,y,label=language,fmt=".k",xerr=xsterr,yerr=ysterr,capsize=2)
            plt.text(x+0.0001,y+0.0002,language)
    title='Distribution of languages according to '+xlabel+', '+ylabel
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    name=activations_name+'_distrib_'+xlabel.split(' ')[2]+'_'+ylabel.split(' ')[2]+'_ramus.svg'
    if saveFigs:
        plt.savefig(f'./corpus_ramus/Figures/{name}')
    #plt.close()
    #plt.show()
langues={'English':'en','French':'fr','Polish':'pol','Japanese':'ja',
'Catalan':'cat','Spanish':'esp','Dutch':'du','Italian':'it'}
langues_reversed = {v: k for k, v in langues.items()}
dic_lang_ramus={k: {} for k, v in langues.items()} #to write means of correlates (Ramus corpus and DNN dataset)
Import coeffs
conservative=True
#activations_name=activations_folder_Ramus.split('/')[-1]
json_filename=activations_name
if conservative:
    json_filename+='_conservative'
with open(f'./corpus_ramus/coeffs/{json_filename}.json', 'r') as f:
    coeffs_dic=json.load(f)
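# Expected JSON layout, inferred from the lookups below:
#   coeffs_dic[metric][model_name] = {'coeffs': [one weight per unit], 'intercept': float}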
metrique_x = 'nPVI_V'
metrique_y = 'rPVI_C'
layer='lstm_2'
celltype='outputs'
model = 'enet' #one of: 'lasso', 'enet'
plt=pl
with pl.style.context('seaborn-deep'):
    plt.figure(figsize=(10,8))
    coeffs_x=np.array(coeffs_dic[metrique_x][model]['coeffs'])
    intercept_x=coeffs_dic[metrique_x][model]['intercept']
    coeffs_y=np.array(coeffs_dic[metrique_y][model]['coeffs'])
    intercept_y=coeffs_dic[metrique_y][model]['intercept']
    #label offsets scaled to each metric's range
    if metrique_x in ['deltC', 'deltV']:
        shift_x=0.00002
    elif metrique_x in ['propV', 'nPVI_V']:
        shift_x=0.02
    elif metrique_x in ['rPVI_C']:
        shift_x=0.002
    if metrique_y in ['deltC', 'deltV']:
        shift_y=0.00005
    elif metrique_y in ['propV', 'nPVI_V']:
        shift_y=0.05
    elif metrique_y in ['rPVI_C']:
        shift_y=0.005
    xlabel=metrique_x
    ylabel=metrique_y
    #Ramus files
    Daudiox={}
    Daudioy={}
    for langue in D:
        Daudiox[langue]={}
        Daudioy[langue]={}
        for filename in D[langue]:
            if filename in d_match:
                fileaudio=d_match[filename]
                activ=np.array([float(st) for st in datay[fileaudio.split('.')[0]][layer][celltype]])
                Daudiox[langue][fileaudio]=intercept_x + np.dot(coeffs_x, activ)
                Daudioy[langue][fileaudio]=intercept_y + np.dot(coeffs_y, activ)
    #Figure
    for langue in D: #in red: points for the files from F. Ramus's database
        std=np.std(list(Daudiox[langue].values()))
        xsterr=std/np.sqrt(len(list(Daudiox[langue].values())))
        x=np.mean(list(Daudiox[langue].values()))
        std=np.std(list(Daudioy[langue].values()))
        ysterr=std/np.sqrt(len(list(Daudioy[langue].values())))
        y=np.mean(list(Daudioy[langue].values()))
        plt.errorbar(x,y,label=langue,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='orangered')
        plt.text(x+shift_x,y+shift_y,langue)
        dic_lang_ramus[langues_reversed[langue]][f'ramus_{metrique_x}']=x
        dic_lang_ramus[langues_reversed[langue]][f'ramus_{metrique_y}']=y
    #projected metrics on both axes (DNN dataset files)
    dlanguesx={}
    dlanguesy={}
    for file in list(d_act.keys()):
        language=d_act[file]['label']
        if language not in dlanguesx:
            dlanguesx[language]=[]
            dlanguesy[language]=[]
        activ=np.array([float(st) for st in d_act[file]['activations'][layer][celltype]])
        dlanguesx[language].append(intercept_x + np.dot(coeffs_x, activ))
        dlanguesy[language].append(intercept_y + np.dot(coeffs_y, activ))
    #Figure (files outside F. Ramus's database)
    for language in dlanguesy:
        std=np.std(dlanguesx[language])
        xsterr=std/(np.sqrt(len(dlanguesx[language])))
        std=np.std(dlanguesy[language])
        ysterr=std/(np.sqrt(len(dlanguesy[language])))
        x=np.mean(dlanguesx[language])
        y=np.mean(dlanguesy[language])
        if language in langues: #in blue: languages also present in F. Ramus's database
            plt.errorbar(x,y,label=language,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='blue')
            plt.text(x+shift_x,y+shift_y,language)
            dic_lang_ramus[language][f'DNN_{metrique_x}']=x
            dic_lang_ramus[language][f'DNN_{metrique_y}']=y
        else: #in black: the other languages
            plt.errorbar(x,y,label=language,xerr=xsterr,yerr=ysterr,capsize=2, ecolor='black')
            plt.text(x+shift_x,y+shift_y,language)
    title='Distribution of languages according to the activations correlated with '+xlabel+', '+ylabel + f' ({model})'
    #plt.title(title)
    if ylabel=='deltC':
        #pl.ylabel('Correlated with $\Delta C$')
        pl.ylabel('Correlated with ΔC')
    else:
        plt.ylabel(ylabel)
    if xlabel=='propV' or xlabel=='deltV':
        pl.xlabel('Correlated with %V' if xlabel=='propV' else 'Correlated with ΔV')
    else:
        plt.xlabel(xlabel)
    name=activations_name+'_distrib_'+xlabel+'_'+ylabel+'_'+model
    if conservative:
        name=name+"_conservative"
    name=name+'.svg'
    if saveFigs:
        plt.savefig(f'./corpus_ramus/Figures/{name}')
    #plt.close()
    #plt.show()
dic_lang_ramus
{'English':  {'ramus_nPVI_V': 52.62486983978617,  'ramus_rPVI_C': 5.373503655737228,  'DNN_nPVI_V': 54.15277643457332,  'DNN_rPVI_C': 5.39930126068207},
 'French':   {'ramus_nPVI_V': 49.18460943400069,  'ramus_rPVI_C': 5.044298368334439,  'DNN_nPVI_V': 48.4875890633136,   'DNN_rPVI_C': 5.043264342708177},
 'Polish':   {'ramus_nPVI_V': 46.45563184099493,  'ramus_rPVI_C': 5.3137291748721385, 'DNN_nPVI_V': 48.52225906467783,  'DNN_rPVI_C': 5.532296059740451},
 'Japanese': {'ramus_nPVI_V': 49.03521009502212,  'ramus_rPVI_C': 4.7341315782314615, 'DNN_nPVI_V': 48.79310629238002,  'DNN_rPVI_C': 4.638784475728118},
 'Catalan':  {'ramus_nPVI_V': 45.75442988610156,  'ramus_rPVI_C': 4.950074265987346,  'DNN_nPVI_V': 45.46720603161214,  'DNN_rPVI_C': 5.00143224506126},
 'Spanish':  {'ramus_nPVI_V': 47.62568352647738,  'ramus_rPVI_C': 5.106079857311986,  'DNN_nPVI_V': 44.889945552730524, 'DNN_rPVI_C': 4.952883053427439},
 'Dutch':    {'ramus_nPVI_V': 55.391632921727265, 'ramus_rPVI_C': 5.727717800748612,  'DNN_nPVI_V': 54.928240669084495, 'DNN_rPVI_C': 5.544485813707008},
 'Italian':  {'ramus_nPVI_V': 47.0195665932991,   'ramus_rPVI_C': 4.931826131868736,  'DNN_nPVI_V': 46.910062335102715, 'DNN_rPVI_C': 4.873653348184477}}
activations_name
'weights_2_150_voiced_unvoiced_bis_ALL'
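A minimal sketch for writing dic_lang_ramus to disk with the csv module imported above; the output path is hypothetical:
fieldnames = ['language'] + sorted(next(iter(dic_lang_ramus.values())).keys())
with open('./corpus_ramus/dic_lang_ramus.csv', 'w', newline='') as f: #hypothetical path
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for lang, vals in dic_lang_ramus.items():
        writer.writerow({'language': lang, **vals})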
Features array
list_labels=[] #languages
metric_names=['propV', 'deltC', 'nPVI_V', 'rPVI_C']
array_metrics=[] #built as a list of lists, converted to an array below
list_scores_nn=[] #scores (neural network) as dicts lang -> score
for file in list(d_act.keys()):
    language=d_act[file]['label']
    if language not in ['Czech', 'Romanian']: #not in the list of model languages
        features=[]
        list_labels.append(language)
        activ=np.array([float(st) for st in d_act[file]['activations'][layer][celltype]])
        for metric in metric_names:
            intercept_x=coeffs_dic[metric][model]['intercept']
            coeffs_x=np.array(coeffs_dic[metric][model]['coeffs'])
            feature=intercept_x + np.dot(coeffs_x, activ)
            features.append(feature)
        array_metrics.append(features)
        with open(f'{activations_folder}/{file}', 'r') as json_file:
            data_json=json.load(json_file)
        list_scores_nn.append(data_json["scores"])
array_metrics=np.array(array_metrics)
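# array_metrics has shape (n_files, len(metric_names)): one row of predicted
# rhythm metrics (propV, deltC, nPVI_V, rPVI_C) per non-excluded file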
Target vector
#languages=set(list_labels)
languages=list(list_scores_nn[0].keys())
target=[]
scores_nn_arr=[]
target_multarr=np.zeros((len(list_labels), len(languages)))
for i, (label, scores) in enumerate(zip(list_labels, list_scores_nn)):
    ind=languages.index(label)
    target.append(ind)
    target_multarr[i, ind]=1 #one-hot targets
    scores_nn_arr.append([float(scores[lang]) for lang in languages])
target=np.array(target)
scores_nn_arr=np.array(scores_nn_arr)
QDA with all data
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy import linalg
QDA=QuadraticDiscriminantAnalysis
model_qda=QDA(store_covariance=True)
model_qda.fit(array_metrics, target)
QuadraticDiscriminantAnalysis(store_covariance=True)
model_qda.score(array_metrics, target)
0.35773809523809524
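For reference, a quick chance-level baseline assuming a uniform guess over the model languages:
print(f'chance level: {1/len(languages):.4f} ({len(languages)} languages)')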
%matplotlib inline
#plot ellipses
def plot_ellipse(splot, mean, cov, lang, axes=(0,1)):
    #restrict the full mean/covariance to the two plotted dimensions
    cov=np.array([[cov[axes[0], axes[0]], cov[axes[0], axes[1]]],
                  [cov[axes[1], axes[0]], cov[axes[1], axes[1]]]])
    mean=np.array([mean[axes[0]], mean[axes[1]]])
    v, w = linalg.eigh(cov)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi # convert to degrees
    # ellipse axes proportional to the covariance eigenvalues
    ell = mpl.patches.Ellipse(mean, 0.2 * v[0] ** 0.5, 0.2 * v[1] ** 0.5,
                              angle=180 + angle, # facecolor=color,
                              edgecolor='black', linewidth=2)
    ell.set_clip_box(splot.bbox)
    #ell.set_alpha(0.2)
    #splot.scatter(mean[0], mean[1])
    splot.add_artist(ell)
    #splot.set_xticks(())
    #splot.set_yticks(())
pl.figure(figsize=(10,10))
splot=pl.gca()
for lang, mean, cov in zip(languages, model_qda.means_, model_qda.covariance_):
    axes=(2,3) #nPVI_V vs rPVI_C, given the order of metric_names
    mean2=np.array([mean[axes[0]], mean[axes[1]]])
    pl.scatter(mean2[0], mean2[1])
    pl.text(mean2[0], mean2[1], lang)
    plot_ellipse(splot, mean, cov, lang, axes=axes)
pl.show()
Cross validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, top_k_accuracy_score #requires scikit-learn >= 0.24
scores_cv_acc = cross_val_score(model_qda, array_metrics, target, cv=5)
scores_cv_top_3 = cross_val_score(model_qda, array_metrics, target, cv=5,
                                  scoring=make_scorer(top_k_accuracy_score, k=3, needs_proba=True, labels=np.arange(len(languages))))
print(f'cross-validation accuracy: {np.mean(scores_cv_acc):.4f} ({scores_cv_acc})')
print(f'cross-validation top3 accuracy: {np.mean(scores_cv_top_3):.4f} ({scores_cv_top_3})')
cross-validation accuracy: 0.3504 ([0.35639881 0.3608631 0.34970238 0.33854167 0.34672619])
cross-validation top3 accuracy: 0.6079 ([0.60342262 0.59970238 0.59895833 0.61830357 0.61904762])
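A minimal sanity-check baseline, assuming the same 5-fold split; DummyClassifier ignores the features, so any margin above it comes from the projected metrics:
from sklearn.dummy import DummyClassifier
scores_cv_dummy = cross_val_score(DummyClassifier(strategy='most_frequent'), array_metrics, target, cv=5)
print(f'dummy (most_frequent) cv accuracy: {np.mean(scores_cv_dummy):.4f}')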
Measures of information (w/ fit on all data)
cross_ent_nn=0
cross_ent_qda=0
acc_nn=0
jsd=0
tv=0
for i in range(len(list_labels)):
    scores_nn=scores_nn_arr[i]
    features=array_metrics[i]
    scores_qda=model_qda.predict_proba(features[np.newaxis, :])[0]
    ind=target[i]
    ind_nn=np.argmax(scores_nn)
    acc_nn+= ind_nn==ind
    cross_ent_nn+=-np.log2(scores_nn[ind])
    cross_ent_qda+=-np.log2(scores_qda[ind])
    #Jensen-Shannon divergence in bits: 0.5*KL(P||M)+0.5*KL(Q||M) with M=(P+Q)/2
    jsd+=-0.5*np.sum(scores_nn*np.log2( (1+scores_qda/scores_nn)/2))
    jsd+=-0.5*np.sum(scores_qda*np.log2( (1+scores_nn/scores_qda)/2))
    tv+=np.sum(np.abs(scores_nn-scores_qda))
    if i>10 and i<20: #show a few NN vs QDA score profiles
        pl.subplot(10, 1, i-10)
        pl.bar(range(len(languages)), scores_nn)
        pl.bar(np.arange(len(languages))+0.5, scores_qda)
cross_ent_nn/=len(list_labels)
cross_ent_qda/=len(list_labels)
acc_nn/=len(list_labels)
jsd/=len(list_labels)
tv/=len(list_labels)
print(f'cross entropy NN :{cross_ent_nn:.4f} (perplexity {2**cross_ent_nn:.4f})')
print(f'accuracy NN :{acc_nn:.2f}')
print(f'cross entropy QDA :{cross_ent_qda:.4f} (perplexity {2**cross_ent_qda:.4f})')
print(f'mean jensen-shannon divergence :{jsd:.4f} bit')
print(f'mean total variation :{tv:.3f} ')
cross entropy NN :1.6619 (perplexity 3.1643)
accuracy NN :0.64
cross entropy QDA :2.9744 (perplexity 7.8594)
mean jensen-shannon divergence :0.3375 bit
mean total variation :1.052
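A minimal cross-check of the hand-rolled Jensen-Shannon formula against scipy (assumption: scipy >= 1.2, whose jensenshannon returns the square root of the divergence):
from scipy.spatial.distance import jensenshannon
p = scores_nn_arr[0]
q = model_qda.predict_proba(array_metrics[0][np.newaxis, :])[0]
jsd_manual = -0.5*np.sum(p*np.log2((1+q/p)/2)) - 0.5*np.sum(q*np.log2((1+p/q)/2))
print(jsd_manual, jensenshannon(p, q, base=2)**2) #the two values should agree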