import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from create_dictionaries import create_dictionaries
import copy
from itertools import cycle
from sklearn.linear_model import Lasso, ElasticNet, lasso_path, enet_path
from sklearn.linear_model import LassoCV, ElasticNetCV, LassoLarsCV, LassoLarsIC
from sklearn.model_selection import KFold
import json
import sys
from IPython.utils.io import Tee
# Parameters
#activations_name='weights0904-2d-30Hz-newinputs-dataaugmentation-30epochs'  #older run, superseded by the values below
#n_cells=128
#activations_folder="../activations/Scores_Ramus/weights_2020-04-07"
activations_name = "weights_2_150_voiced_unvoiced_bis_ALL"
n_cells = 150
activations_folder=f'../activations/Scores_Ramus/{activations_name}'
metrics_folder='./corpus_ramus/Files/'
l1_ratio=0.2 #for ElasticNet
saveTxtFile=False #save best cells in a text file
saveCoeffs=False #save regression coeffs to a JSON file
normalizeFeatures=False
#NB: further parameters appear below: conservative, type of cells
dfiles, d_match, datay, d_act, d_lang, D, dmetrics = create_dictionaries(activations_folder=activations_folder,
                                                                         metrics_folder=metrics_folder)
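#From their usage below: d_match maps each metrics file to its audio file, d_lang gives each
#file's language, dmetrics stores Ramus' rhythm metrics per file, and datay stores the network
#activations per audio file, LSTM layer ('lstm_1'/'lstm_2') and cell type (outputs/cell states).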
metrique_keys=['deltV', 'deltC', 'propV', 'varco_V', 'varco_C', 'nPVI_V', 'rPVI_C']
list_files=list(d_match.keys())
metric_arrays={}
for metrique in metrique_keys:
    l=[]
    for file in list_files:
        l.append(dmetrics[file][metrique])
    metric_arrays[metrique]=np.array(l)
Metric figures (NB: these differ slightly from Ramus's figures, as not every audio file was available, especially for English)
metrique_tuples=[('propV', 'deltC'), ('propV', 'deltV'), ('deltV', 'deltC'), ('nPVI_V', 'rPVI_C')]
for tup in metrique_tuples:
    plt.figure(figsize=(7,5))
    metrique_x, metrique_y = tup
    xarr=metric_arrays[metrique_x]
    yarr=metric_arrays[metrique_y]
    if metrique_x in ['deltC', 'deltV']:
        shift_x=0.001
    elif metrique_x in ['propV', 'nPVI_V', 'rPVI_C']:
        shift_x=0.1
    if metrique_y in ['deltC', 'deltV']:
        shift_y=0.001
    elif metrique_y in ['propV', 'nPVI_V', 'rPVI_C']:
        shift_y=0.1
    for lang in D:
        xarr_lang=[]
        yarr_lang=[]
        for j, file in enumerate(list_files):
            if d_lang[file] == lang:
                xarr_lang.append(xarr[j])
                yarr_lang.append(yarr[j])
        xarr_lang=np.array(xarr_lang)
        yarr_lang=np.array(yarr_lang)
        xsterr=np.std(xarr_lang)/np.sqrt(len(xarr_lang))
        ysterr=np.std(yarr_lang)/np.sqrt(len(yarr_lang))
        x,y=np.mean(xarr_lang),np.mean(yarr_lang)
        plt.errorbar(x,y,label=lang,fmt=".k",xerr=xsterr,yerr=ysterr,capsize=2)
        plt.text(x+shift_x,y+shift_y,lang)
    plt.title(f"Distribution of languages by {metrique_x}, {metrique_y}")
    plt.xlabel(metrique_x)
    plt.ylabel(metrique_y)
    plt.show()
NB: Pearson coefficients are signed (unlike the original version, which used absolute values)
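As a quick toy check (not corpus data): pearsonr preserves the sign of the relationship.
x_toy = np.arange(1., 5.)
print(pearsonr(x_toy, 2*x_toy + 1)[0])   # +1.0, perfectly correlated
print(pearsonr(x_toy, -2*x_toy + 1)[0])  # -1.0, perfectly anti-correlated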
pearson={}
for metrique in metrique_keys:
    pearson[metrique]={'lstm_1':{'cell_states':[],'outputs':[]},'lstm_2':{'cell_states':[],'outputs':[]}}
for lstm in ['lstm_1','lstm_2']:
    for celltype in ['outputs','cell_states']:
        for i in range(n_cells):  # i runs over each cell
            y=[]
            for file in list_files:
                audio=d_match[file].split('.')[0]
                y.append(datay[audio][lstm][celltype][i])
            y=np.array(y).astype(float)  #np.float is deprecated; use the builtin float
            for metrique in metrique_keys:
                x=metric_arrays[metrique]
                pearson[metrique][lstm][celltype].append(pearsonr(y,x)[0])
Best cells
'''
d1={}
for metrique in list(pearson.keys()):
    d1[metrique]=[]
    for lstm in list(pearson[metrique].keys()):
        for celltype in list(pearson[metrique][lstm].keys()):
            d1[metrique]=d1[metrique]+pearson[metrique][lstm][celltype]
'''
if saveTxtFile:
    #f=open(f'./corpus_ramus/best_cells_{activations_name}.txt', 'w')
    #orig_stdout = sys.stdout
    #sys.stdout=f
    tee=Tee(f'./corpus_ramus/best_cells_{activations_name}.txt')
for metrique in list(pearson.keys()):
    pearson2 = copy.deepcopy(pearson)
    print(metrique)
    max_ref_previous=np.inf
    for i in range(5):
        max_ref=-np.inf
        for lstm in list(pearson2[metrique].keys()):
            for celltype in list(pearson2[metrique][lstm].keys()):
                for ind, act in enumerate(pearson2[metrique][lstm][celltype]):
                    if max_ref<np.abs(act) and np.abs(act)<max_ref_previous:
                        ind_max=lstm, celltype, ind
                        max_ref=np.abs(act)
        lstm, celltype, ind=ind_max
        text=lstm+", "+celltype+", "+str(ind)+'\t\t'
        print(text,max_ref)
        max_ref_previous=max_ref
    print("\n")
if saveTxtFile:
    tee.close()
    #f.close()
    #sys.stdout=orig_stdout
deltV
lstm_2, cell_states, 49		 0.3689776597306671
lstm_2, cell_states, 36		 0.3680320013286456
lstm_2, outputs, 88		 0.34628539126637936
lstm_2, outputs, 51		 0.34383809066805654
lstm_2, cell_states, 51		 0.34095421487203703

deltC
lstm_1, outputs, 3		 0.4534718509891723
lstm_1, cell_states, 3		 0.4506392162252305
lstm_1, cell_states, 51		 0.36374490379362534
lstm_1, outputs, 51		 0.3333297666505066
lstm_2, cell_states, 126		 0.32234809249489815

propV
lstm_2, cell_states, 35		 0.5159651733009365
lstm_2, outputs, 35		 0.4985352339979508
lstm_2, cell_states, 82		 0.48276053426729537
lstm_2, cell_states, 54		 0.47596042491406887
lstm_2, cell_states, 144		 0.4683552173518527

varco_V
lstm_2, cell_states, 64		 0.31585891815269485
lstm_2, outputs, 64		 0.3152343254749988
lstm_2, cell_states, 49		 0.306683036442078
lstm_2, cell_states, 36		 0.30545091782915135
lstm_2, outputs, 51		 0.30107630490875653

varco_C
lstm_1, outputs, 3		 0.2932430510879917
lstm_1, cell_states, 3		 0.29031834826730984
lstm_1, outputs, 131		 0.25080252112755025
lstm_1, cell_states, 131		 0.21140309775863725
lstm_1, cell_states, 28		 0.19810116031030836

nPVI_V
lstm_2, outputs, 64		 0.46744099891837587
lstm_2, cell_states, 64		 0.46557907564320145
lstm_2, cell_states, 36		 0.44177625506277024
lstm_2, cell_states, 89		 0.4181265438714806
lstm_2, outputs, 89		 0.4161945234667573

rPVI_C
lstm_2, outputs, 112		 0.3986798974818543
lstm_2, cell_states, 112		 0.36969689499742914
lstm_2, cell_states, 3		 0.36293515195944276
lstm_2, outputs, 3		 0.35342045425501745
lstm_1, outputs, 140		 0.34561645628994847
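An equivalent, more compact way to select the top five cells would be a flat sort instead of the repeated scans above; a sketch, assuming the same pearson structure:

for metrique in pearson:
    entries=[]  #(lstm, celltype, index, r)
    for lstm in pearson[metrique]:
        for celltype in pearson[metrique][lstm]:
            for ind, r in enumerate(pearson[metrique][lstm][celltype]):
                entries.append((lstm, celltype, ind, r))
    entries.sort(key=lambda e: abs(e[3]), reverse=True)  #rank by |Pearson r|
    print(metrique)
    for lstm, celltype, ind, r in entries[:5]:
        print(f'{lstm}, {celltype}, {ind}\t\t{abs(r)}')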
Figures
metrique_inds={'deltV':0, 'deltC':1, 'propV':2}
c_lang={"en":'C0',"pol":'C1',"du":'C2',"fr":'C3',"esp":'C4',"it":'C5',"cat":'C6',"ja":'C7'}
metrique_inds.keys()
dict_keys(['deltV', 'deltC', 'propV'])
#reference layer
layer='lstm_2'
celltype='outputs'
#metric_coeffs={'propV':0.05, 'deltC':20, 'deltV':20, 'nPVI_V':0.02, 'rPVI_C':0.15} #the metrics are first multiplied by these coeffs (y in [0, 1] range)
metric_coeffs=dict([(metrique, 1./metric_arrays[metrique].std()) for metrique in metrique_keys]) #scale each metric to unit standard deviation
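A quick sanity check (sketch): with these coefficients, every scaled regression target should have unit standard deviation.

for metrique in metrique_keys:
    assert np.isclose((metric_coeffs[metrique]*metric_arrays[metrique]).std(), 1.0)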
Matrix of activations / metric vectors
#build the activation matrix once (one column per file), plus the metric vectors
X=np.zeros((n_cells, len(d_match)))
for j, file in enumerate(list_files):
    audio=d_match[file].split('.')[0]
    X[:, j] = np.array(datay[audio][layer][celltype]).astype(float)
metric_arrays={}
for metrique in metrique_keys:
    metric_arrays[metrique]=np.array([dmetrics[file][metrique] for file in list_files])
AIC/BIC criterion / CV
#normalize data?
X2=np.copy(X)
XT=X2.T
if normalizeFeatures:
    XT -= np.mean(XT, axis=0)
    XT /= XT.std(axis=0)
k_fold=7
shuffling=False
EPSILON= 1e-4
random_state = 1 if shuffling else None
CVsplitter=KFold(n_splits=k_fold, shuffle=shuffling, random_state=random_state)
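For illustration (toy indices, not corpus files): with shuffling disabled, KFold produces deterministic, contiguous folds, so results are reproducible across runs.

for train_idx, test_idx in KFold(n_splits=3).split(np.arange(6)):
    print(train_idx, test_idx)  # [2 3 4 5] [0 1], then [0 1 4 5] [2 3], then [0 1 2 3] [4 5]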
metric_alphaCV_lasso = {}
metric_alphaCV_enet = {}
for metrique in metric_arrays:
    try:
        y=metric_coeffs[metrique]*metric_arrays[metrique]
    except KeyError:
        print(f'{metrique} not in metric_coeffs, lasso not performed')
        continue
    '''
    model_bic = LassoLarsIC(criterion='bic')
    model_bic.fit(XT, y)
    alpha_bic_ = model_bic.alpha_
    model_aic = LassoLarsIC(criterion='aic')
    model_aic.fit(XT, y)
    alpha_aic_ = model_aic.alpha_
    def plot_ic_criterion(model, name, color):
        criterion_ = model.criterion_
        plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color,
                     linewidth=3, label='%s criterion' % name)
        plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3,
                    label='alpha: %s estimate' % name)
        plt.xlabel(r'$\alpha$')
        plt.ylabel('criterion')
    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title(f'Information-criterion for model selection {metrique}')
    '''
    # #########################################################################
    # LassoCV: coordinate descent
    # Compute paths
    eps = 1e-2  #the smaller eps is, the longer the path
    print("Computing regularization path using the coordinate descent lasso...")
    model_lasso = LassoCV(cv=CVsplitter, eps=eps, fit_intercept=True).fit(XT, y)
    print("Computing regularization path using the elastic net...")
    model_enet = ElasticNetCV(cv=CVsplitter, l1_ratio=l1_ratio, eps=eps, fit_intercept=True).fit(XT, y)
    metric_alphaCV_lasso[metrique] = model_lasso.alpha_
    metric_alphaCV_enet[metrique] = model_enet.alpha_
    # Display results
    plt.figure(figsize=(10,6))
    plt.subplot(1, 2, 1)
    ymin, ymax = 0.02, 0.1  #only used if the ylim lines below are re-enabled
    plt.semilogx(model_lasso.alphas_ + EPSILON, model_lasso.mse_path_, ':')
    plt.plot(model_lasso.alphas_ + EPSILON, model_lasso.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(model_lasso.alpha_ + EPSILON, linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.legend()
    plt.xlabel(r'$\alpha$')
    plt.ylabel('Mean square error')
    plt.title(f'Lasso {metrique}')
    plt.axis('tight')
    #plt.ylim(ymin, ymax)
    plt.subplot(1, 2, 2)
    plt.semilogx(model_enet.alphas_ + EPSILON, model_enet.mse_path_, ':')
    plt.plot(model_enet.alphas_ + EPSILON, model_enet.mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(model_enet.alpha_ + EPSILON, linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.legend()
    plt.xlabel(r'$\alpha$')
    plt.ylabel('Mean square error')
    plt.title(f'ElasticNet {metrique}')
    plt.axis('tight')
    #plt.ylim(ymin, ymax)
    plt.suptitle('Mean square error on each fold: coordinate descent')
Computing regularization path using the coordinate descent lasso...
Computing regularization path using the elastic net...
(the two lines above repeat for each metric; one warning was raised along the way:)
/home/fdeloche/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:647: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.525e-02, tolerance: 1.396e-02
  model = cd_fast.enet_coordinate_descent(
Lasso/ENet path
#normalize data?
X2=np.copy(X)
XT=X2.T
if normalizeFeatures:
    XT -= np.mean(XT, axis=0)
    XT /= XT.std(axis=0)
metric_alphas_lasso = {}
metric_coefs_lasso = {}
metric_alphas_enet = {}
metric_coefs_enet = {}
for metrique in metric_arrays:
    try:
        y=metric_coeffs[metrique]*metric_arrays[metrique]
    except KeyError:
        print(f'{metrique} not in metric_coeffs, lasso not performed')
        continue
    # Compute paths
    eps = 1e-3  #the smaller eps is, the longer the path
    print(metrique)
    print("Computing regularization path using the lasso...")
    alphas_lasso, coefs_lasso, _ = lasso_path(XT, y, eps=eps)  #fit_intercept=True (deprecated)
    metric_alphas_lasso[metrique]=alphas_lasso
    metric_coefs_lasso[metrique]=coefs_lasso
    print("Computing regularization path using the elastic net...")
    alphas_enet, coefs_enet, _ = enet_path(
        XT, y, eps=eps, l1_ratio=l1_ratio)  #fit_intercept=True (deprecated)
    metric_alphas_enet[metrique]=alphas_enet
    metric_coefs_enet[metrique]=coefs_enet
    # Display results
    plt.figure(figsize=(8,6))
    colors = cycle(['b', 'r', 'g', 'c', 'k'])
    neg_log_alphas_lasso = -np.log10(alphas_lasso)
    neg_log_alphas_enet = -np.log10(alphas_enet)
    for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
        l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
        l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle=':', c=c, alpha=0.30)
    plt.axvline(-np.log10(metric_alphaCV_lasso[metrique]), linestyle='-', color='k')
    plt.axvline(-np.log10(metric_alphaCV_enet[metrique]), linestyle=':', color='k')
    plt.xlabel('-Log(alpha)')
    plt.ylabel('coefficients')
    plt.title(f'Lasso and Elastic-Net Paths - {metrique}')
    plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='lower left')
    plt.axis('tight')
    plt.show()
deltV
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
deltC
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
propV
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
varco_V
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
varco_C
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
/tmp/ipykernel_35138/2385859735.py:51: RuntimeWarning: invalid value encountered in log10
  plt.axvline(-np.log10(metric_alphaCV_lasso[metrique]), linestyle='-', color='k')
/tmp/ipykernel_35138/2385859735.py:52: RuntimeWarning: invalid value encountered in log10
  plt.axvline(-np.log10(metric_alphaCV_enet[metrique]), linestyle=':', color='k')
nPVI_V
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
rPVI_C
Computing regularization path using the lasso...
Computing regularization path using the elastic net...
Lasso with custom alpha
metric_alphaCV_lasso, metric_alphaCV_enet
({'deltV': 0.03, 'deltC': 0.01, 'propV': 0.004, 'varco_V': 0.01, 'varco_C': -0.002944350495534282, 'nPVI_V': 0.015, 'rPVI_C': 0.02},
 {'deltV': 0.01, 'deltC': 0.03, 'propV': 0.01, 'varco_V': 0.02, 'varco_C': -0.012944350495534282, 'nPVI_V': 0.02, 'rPVI_C': 0.03})
metric_alpha_lasso_custom = copy.deepcopy(metric_alphaCV_lasso)  #copy, so the CV estimates are not overwritten in place (NB: the values displayed above already reflect the adjustments below, from a previous run)
metric_alpha_enet_custom = copy.deepcopy(metric_alphaCV_enet)
conservative=True
#more conservative (stronger regularization)
if conservative:
    metric_alpha_lasso_custom['deltV']=0.03
    metric_alpha_lasso_custom['deltC']=0.01
    metric_alpha_lasso_custom['propV']=0.004
    metric_alpha_lasso_custom['varco_V']=0.01
    metric_alpha_lasso_custom['varco_C']=metric_alpha_lasso_custom['varco_C']-0.01  #caution: can make alpha negative (hence the log10 warnings above and the degenerate varco_C fits below)
    metric_alpha_lasso_custom['nPVI_V']=0.015
    metric_alpha_lasso_custom['rPVI_C']=0.02
    metric_alpha_enet_custom['deltV']=0.01
    metric_alpha_enet_custom['deltC']=0.03
    metric_alpha_enet_custom['propV']=0.01
    metric_alpha_enet_custom['varco_V']=0.02
    metric_alpha_enet_custom['varco_C']=metric_alpha_lasso_custom['varco_C']-0.01  #NB: derived from the lasso value
    metric_alpha_enet_custom['nPVI_V']=0.02
    metric_alpha_enet_custom['rPVI_C']=0.03
metric_coeffs_lasso = {}  #double 'f': distinct from the metric_coefs_lasso (regularization paths) defined above
metric_coeffs_enet = {}
metric_intercept_lasso = {}
metric_intercept_enet = {}
def print_coeffs(intercept, coeffs):
    st_l=[str(intercept)]
    for ind in np.nonzero(coeffs)[0]:
        st_l.append(f' + ({coeffs[ind]:0.4f})*x[{ind}]')
    print(''.join(st_l))
for metrique in metric_arrays:
    y=metric_coeffs[metrique]*metric_arrays[metrique]
    model_lasso = Lasso(alpha=metric_alpha_lasso_custom[metrique], fit_intercept=True)
    model_lasso.fit(XT, y)
    metric_intercept_lasso[metrique] = model_lasso.intercept_/metric_coeffs[metrique]
    metric_coeffs_lasso[metrique] = model_lasso.coef_/metric_coeffs[metrique]  #undo the target scaling: coefficients are back in original metric units
    if normalizeFeatures:
        metric_coeffs_lasso[metrique] /= X.std(axis=1)
    print(f'Lasso {metrique} : {np.count_nonzero(model_lasso.coef_)} non-zero coefficients')
    #print_coeffs(model_lasso.intercept_, model_lasso.coef_)
    model_enet = ElasticNet(alpha=metric_alpha_enet_custom[metrique], l1_ratio=l1_ratio, fit_intercept=True)
    model_enet.fit(XT, y)
    metric_intercept_enet[metrique] = model_enet.intercept_/metric_coeffs[metrique]
    metric_coeffs_enet[metrique] = model_enet.coef_/metric_coeffs[metrique]
    if normalizeFeatures:
        metric_coeffs_enet[metrique] /= X.std(axis=1)
    print(f'ElasticNet {metrique} : {np.count_nonzero(model_enet.coef_)} non-zero coefficients')
    #print_coeffs(model_enet.intercept_, model_enet.coef_)
Lasso deltV : 1 non-zero coefficients
ElasticNet deltV : 80 non-zero coefficients
Lasso deltC : 11 non-zero coefficients
ElasticNet deltC : 37 non-zero coefficients
Lasso propV : 26 non-zero coefficients
ElasticNet propV : 74 non-zero coefficients
Lasso varco_V : 8 non-zero coefficients
ElasticNet varco_V : 38 non-zero coefficients
Lasso varco_C : 150 non-zero coefficients
ElasticNet varco_C : 0 non-zero coefficients
Lasso nPVI_V : 5 non-zero coefficients
ElasticNet nPVI_V : 55 non-zero coefficients
Lasso rPVI_C : 7 non-zero coefficients
ElasticNet rPVI_C : 34 non-zero coefficients
/home/fdeloche/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:647: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: nan, tolerance: 1.530e-02
  model = cd_fast.enet_coordinate_descent(
Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
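The commented-out print_coeffs calls above would render each sparse model as an explicit formula; with hypothetical values:

print_coeffs(0.5, np.array([0.0, 1.25, 0.0]))  #prints: 0.5 + (1.2500)*x[1]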
Scatter plots
from matplotlib.lines import Line2D  #HACK for the legend: proxy artists, one color per language
for metrique in metric_arrays:
    for (coeffs, intercept, name) in [(metric_coeffs_lasso[metrique], metric_intercept_lasso[metrique], 'Lasso'),
                                      (metric_coeffs_enet[metrique], metric_intercept_enet[metrique], 'ElasticNet')]:
        c=[]
        met_arr=metric_arrays[metrique]
        for file in list_files:
            c.append(c_lang[d_lang[file]])
        y=np.dot(X.T, coeffs)
        y+=intercept
        custom_lines = [Line2D([0], [0], c=f'C{i}') for i in range(len(c_lang))]
        plt.figure(figsize=(8,6))
        plt.scatter(met_arr, y, c=c)
        plt.xlabel(metrique)
        plt.legend(custom_lines, c_lang)
        plt.title(f'Regression with activations ({name}) for {metrique}')
Save coeffs to JSON file
if saveCoeffs:
    coeffs_dic={}
    for metrique in metric_arrays:
        dic={}
        dic['mult_factor']=metric_coeffs[metrique]
        dic2={}
        dic2['alpha']=metric_alpha_lasso_custom[metrique]
        dic2['coeffs']=metric_coeffs_lasso[metrique].tolist()  #plain Python floats for JSON
        dic2['intercept']=metric_intercept_lasso[metrique]
        dic['lasso']=dic2
        dic2={}
        dic2['alpha']=metric_alpha_enet_custom[metrique]
        dic2['coeffs']=metric_coeffs_enet[metrique].tolist()
        dic2['intercept']=metric_intercept_enet[metrique]
        dic['enet']=dic2
        coeffs_dic[metrique]=dic
    #activations_name=activations_folder.split('/')[-1]
    json_filename=activations_name
    if conservative:
        json_filename+='_conservative'
    with open(f'./corpus_ramus/coeffs/{json_filename}.json', 'w') as f:
        json.dump(coeffs_dic, f, sort_keys=True, indent=4)
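To reuse a saved model, the JSON file can be read back and applied directly to an activation matrix, since the stored coefficients are already rescaled to the original metric units; a minimal sketch, where X_new is a hypothetical (n_cells, n_files) matrix of activations for the reference layer and cell type:

with open(f'./corpus_ramus/coeffs/{json_filename}.json') as f:
    coeffs_dic=json.load(f)
entry=coeffs_dic['propV']['lasso']
y_pred=np.dot(entry['coeffs'], X_new) + entry['intercept']  #predicted propV, original units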