We try to recover the dimension of a lattice polytope from its Ehrhart series using machine learning.
import pandas as pd
import ast
import numpy as np
from pcas import keyvalue
from sklearn import svm
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
# Connect to the table
conn = keyvalue.Connection('ml_polydim', address='localhost:12356')
t = conn.connect_to_table('deduplicated_dimension')
# Prepare to read the data
template = {'ehrhart':'', 'log_ehrhart':'', 'dimension':0}
all_data = []
# Read in the data and convert 'ehrhart' and 'log_ehrhart' to Python sequences
for d in t.select(template):
    all_data.append({
        'ehrhart': np.array(ast.literal_eval(d['ehrhart'])),
        'log_ehrhart': np.array(ast.literal_eval(d['log_ehrhart'])),
        'dimension': d['dimension']
    })
# Clean up
t.close()
conn.close()
Deduplication seems to have mostly removed low-dimensional examples, which makes intuitive sense as there are fewer distinct polytopes in low dimensions.
for n in range(9):
    print(f"{n}: {len([x for x in all_data if x['dimension'] == n])}")
0: 0
1: 0
2: 431
3: 787
4: 812
5: 399
6: 181
7: 195
8: 113
X = np.array([x['log_ehrhart'] for x in all_data])
y = np.array([x['dimension'] for x in all_data])
pca = PCA()
pca.fit(X)
print(pca.explained_variance_ratio_)
[9.99503545e-01 4.95662126e-04 6.95420696e-07 ... 9.97724247e-33 9.96032466e-33 4.64393603e-34]
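The first principal component already accounts for almost all of the variance. As a quick check before truncating to two co-ordinates, one could inspect the cumulative explained variance; a minimal sketch, not part of the original analysis:
# Sketch (not in the original run): cumulative explained variance of the leading
# principal components of the log Ehrhart data.
cumulative = np.cumsum(pca.explained_variance_ratio_)
for k in [1, 2, 3, 5]:
    print(f'First {k} components: {cumulative[k - 1]:.8f} of the variance')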
Visualise the data in the plane of the first two principal components.
X_new = pca.fit_transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1], c=[x['dimension'] for x in all_data])
Make a more beautiful seaborn scatter plot
df = pd.DataFrame(X_new[:,:2])
df = df.rename(columns={0:'PC1', 1:'PC2'})
df['Dimension'] = y
g = sns.scatterplot(data=df, x='PC1', y='PC2', hue='Dimension', palette="flare")
g.legend(loc='right', bbox_to_anchor=(1.25, 0.5), ncol=1, title='Dimension')
for dpi in [300,600,1200]:
    plt.savefig(f'../images_{dpi}/dim_pca.png', dpi=dpi, bbox_inches='tight')
plt.show()
for i in [0,1]:
    a = pca.components_[i]
    plt.scatter(x=range(len(a)), y=a, label=f'PC{i+1}')
plt.xlabel(r'Component $m$ of the logarithmic Ehrhart vector $(\log y_m)$')
plt.ylabel('Coefficient in the PCA component')
plt.legend(loc='right', bbox_to_anchor=(1.25, 0.5), ncol=1)
for dpi in [300,600,1200]:
    plt.savefig(f'../images_{dpi}/pca_dependence.png', dpi=dpi, bbox_inches='tight')
plt.show()
We use 50% of the data for training and 50% for testing, stratifying the split by dimension.
X_test, X_train, y_test, y_train = train_test_split(X_new[:,:2], y, test_size=0.5, random_state=1, stratify=y)
# Make pipeline with SVM and standard scaler
pipe_SVM = Pipeline([
('scaler', StandardScaler()),
('SVM', svm.SVC(random_state = 42, kernel='linear'))
])
# Parameter grid for SVM
grid_params_SVM = [{'SVM__C': [0.001,0.01,0.1,1,5,10,50,100]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 0.1}
Accuracy: 1.0
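An accuracy of 1.0 means every polytope in the test set is classified correctly. A hedged sketch of a per-class check, using the confusion_matrix and seaborn imports from the top of the notebook (not part of the original run):
# Sketch (not in the original run): confusion matrix for the SVM predictions above.
labels = sorted(set(y_test))
cm = confusion_matrix(y_test, predictions, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted dimension')
plt.ylabel('True dimension')
plt.show()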
We do a similar SVM analysis for the log Ehrhart data, without PCA or dimensionality reduction.
X_test, X_train, y_test, y_train = train_test_split(X[:,:], y, test_size=0.5, random_state=1, stratify=y)
# Parameter grid for SVM
grid_params_SVM = [{'SVM__C': [0.001,0.01,0.1,1,5,10,50,100]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 0.01}
Accuracy: 1.0
Dimensionality reduction without PCA: truncate the log Ehrhart vector to its first five entries.
X_test, X_train, y_test, y_train = train_test_split(X[:,:5], y, test_size=0.5, random_state=1, stratify=y)
# Parameter grid for SVM
grid_params_SVM = [{'SVM__C': [0.1,1,10,100,1000,5000,6000,7000,8000,9000,10000]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 1000}
Accuracy: 0.9972583961617546
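One could also ask how quickly the accuracy degrades as fewer log Ehrhart coefficients are kept. A minimal sketch of such a sweep, not part of the original analysis; fixing C=1000 (the value found above) at every truncation length is an assumption made for speed rather than re-running the grid search each time:
# Sketch (not in the original run): accuracy versus the number of leading
# log Ehrhart coefficients kept; C=1000 is borrowed from the grid search above.
for k in [2, 3, 5, 10, 20]:
    Xk_train, Xk_test, yk_train, yk_test = train_test_split(
        X[:, :k], y, test_size=0.5, random_state=1, stratify=y)
    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('SVM', svm.SVC(random_state=42, kernel='linear', C=1000))
    ])
    clf.fit(Xk_train, yk_train)
    print(f'{k} coefficients: accuracy {accuracy_score(yk_test, clf.predict(Xk_test)):.4f}')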
We repeat the analysis for the original Ehrhart data, without taking log, doing PCA, or reducing dimensionality.
X = np.array([x['ehrhart'] for x in all_data])
y = np.array([x['dimension'] for x in all_data])
X_test, X_train, y_test, y_train = train_test_split(X[:,:], y, test_size=0.5, random_state=1, stratify=y)
# Parameter grid for SVM
grid_params_SVM = [{'SVM__C': [500,1000,10000,50000]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 50000}
Accuracy: 0.9869773817683345
We repeat the analysis for the original Ehrhart data, but reducing dimensionality by moving to truncated PCA co-ordinates.
pca = PCA()
pca.fit(X)
print(pca.explained_variance_ratio_)
X_new = pca.fit_transform(X)
[9.99999972e-01 2.75877888e-08 4.12886380e-16 ... 3.66337442e-33 2.00335990e-33 4.89241993e-34]
Project to the first 30 principal components.
X_test, X_train, y_test, y_train = train_test_split(X_new[:,:30], y, test_size=0.5, random_state=1, stratify=y)
# Parameter grid for SVM
grid_params_SVM = [{'SVM__C': [0.1,1,10,20,30,40,50,60,70,100]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 20}
Accuracy: 0.9369431117203564
What happens if we project to just two dimensions?
X_test, X_train, y_test, y_train = train_test_split(X_new[:,:2], y, test_size=0.5, random_state=1, stratify=y)
grid_params_SVM = [{'SVM__C': [0.1,1,10,100,1000,10000,100000]}]
# Cross validation to find the best parameter
CV_SVM = GridSearchCV(
estimator = pipe_SVM,
param_grid = grid_params_SVM,
cv = 5,
return_train_score=True,
verbose=0,
n_jobs=8)
# Train
CV_SVM.fit(X_train, y_train)
# Predict (using the best parameters)
predictions=CV_SVM.predict(X_test)
print(f'The best parameters are: {CV_SVM.best_params_}')
a=accuracy_score(y_test, predictions)
print(f'Accuracy: {a}')
The best parameters are: {'SVM__C': 100000}
Accuracy: 0.43385880740233035
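For comparison, a non-linear classifier could be tried on the same two co-ordinates. A minimal sketch using the RandomForestClassifier import from the top of the notebook (not part of the original analysis; the hyperparameters are scikit-learn defaults, not tuned values from this study):
# Sketch (not in the original run): random forest baseline on the first two PCA
# co-ordinates of the (non-log) Ehrhart data, with default hyperparameters.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print(f'Random forest accuracy: {accuracy_score(y_test, rf_predictions)}')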