Feature selection

In this tutorial we explore the concept of feature selection while considering three different methods

These methods solve the over or under-determined linear problem

\[\boldsymbol{A}\boldsymbol{x} = \boldsymbol{b}.\]

where \(\boldsymbol{A}\) is the fit matrix, \(\boldsymbol{x}\) represents the parameters to be determined, and \(\boldsymbol{b}\) are the target values.

The present tutorial has the objective to showcase the different algorithms and is not indented as a comprehensive introduction. The reader is rather strongly encouraged to turn to the existing literature on feature selection in machine learning. Here, the links include in the list above provide useful starting points. It should also be noted that for instance scikit-learn, which underpins the algorithms employed here, provides further interesting feature selection techniques, such as the recursive feature elimination, which can be used with hiPhive with little additional effort.


The least absolute shrinkage and selection operator (LASSO) minimizes the objective function

\[\frac{1}{2 N} || \boldsymbol{A}\boldsymbol{x} - \boldsymbol{b} ||^2_2 + \alpha ||\boldsymbol{x}||_1,\]

where \(\alpha\) is the regularization parameter which determines the sparsity of the solution and \(N\) is the number of rows in \(\boldsymbol{A}\).

Lasso thus aims to achieve sparse solutions by directly minimizing the \(\ell_1\)-norm of the model parameter vector

\[||\boldsymbol{x}||_1 = n^{-1} \sum_i |x_i|,\]

where \(n\) is the number of parameters (or elements of \(\boldsymbol{x}\)).


Automatic relevance detection regression (ARDR) is a Bayesian regression techniques and thus seeks to find a solution, which assigns priors to both the model parameters and the hyperparameters. It is also known as sparse Bayesian learning (SBL) or relevance vector machine (RVM).

In ARDR the \(\lambda\)-threshold parameter can be used for removing (pruning) weights with high precision from the computation.


Bootstrapping (Bootstrap Aggregating) can be used for pruning models via removing parameters according to the criterion

\[\begin{split}\forall x_i \in \boldsymbol{x} \quad x_i = \begin{cases} \left<x_i\right> & \text{if}\qquad \left|\left<x_i\right>\right| > f \sigma(x_i) \\ 0 & \text{else} \end{cases}\end{split}\]

where mean \(\left<x_i\right>\) and standard deviation \(\sigma(x_i)\) are defined over an ensemble of models.


The feature selection process looks like


The obtained force constant models for a few selection parameters are shown below.


Source code

Structure preparation
import os
from ase.io import write
from ase.cluster import Icosahedron
from ase.optimize import BFGS
from ase.calculators.emt import EMT
from hiphive.structure_generation import generate_rattled_structures

# parameters
structures_fname = 'structures/rattled_structures.extxyz'
number_of_structures = 10
particle_size = 3
a0 = 4.05
rattle_std = 0.02

# setup
atoms_ideal = Icosahedron('Al', particle_size, a0)
calc = EMT()

# center atoms and add pbc
atoms_ideal.pbc = True

# relax particle
dyn = BFGS(atoms_ideal)
converged = dyn.run(fmax=0.0001, steps=1000)

# generate structures
structures = generate_rattled_structures(
    atoms_ideal, number_of_structures, rattle_std)
for structure in structures:
    forces = structure.get_forces()

    displacements = structure.positions - atoms_ideal.get_positions()
    structure.new_array('displacements', displacements)
    structure.new_array('forces', forces)

    structure.positions = atoms_ideal.get_positions()
    structure.calc = None

# save structures
if not os.path.isdir(os.path.dirname(structures_fname)):
write(structures_fname, structures)
Setup of structure container
from ase.io import read
from hiphive import ClusterSpace, StructureContainer

structures = read('structures/rattled_structures.extxyz@:')
cutoffs = [12]

cs = ClusterSpace(structures[0], cutoffs)
sc = StructureContainer(cs)
for structure in structures:

Feature selection
Feature selection

Runs in 10 minutes on an Intel Core i5-4670K CPU.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hiphive import StructureContainer, ForceConstantPotential
from hiphive.fitting import Optimizer, EnsembleOptimizer

def compute_rmse(A, x, y):
    y_predicted = np.dot(A, x)
    delta_y = y_predicted - y
    rmse = np.sqrt(np.mean(delta_y**2))
    return rmse

def test_pruned_model(features):
    opt = Optimizer((A_train[:, features], y_train), train_size=1.0)
    pruned_parameters = np.zeros(A.shape[1])
    pruned_parameters[features] = opt.parameters
    summary = dict()
    summary['parameters'] = pruned_parameters
    summary['rmse_test'] = compute_rmse(A_test, pruned_parameters, y_test)
    summary['nzp'] = len(np.nonzero(pruned_parameters)[0])
    return summary

# parameters
alphas = np.logspace(-6, -3.5, 20)
lambdas = np.logspace(3.5, 6, 20)
fvals = np.linspace(0, 4, 20)
train_size = 1200
max_iter = 25000
ensemble_size = 500

# indices for which fcs will be plotted
alpha_inds = [0, 15]
lambda_inds = [4, 19]
f_inds = [0, 8]

# load data
sc = StructureContainer.read('structure_container')
cs = sc.cluster_space
A, y = sc.get_fit_data()
test_size = A.shape[0] - train_size

opt = Optimizer((A, y), train_size=train_size)
A_train, y_train = A[opt.train_set, :], y[opt.train_set]
A_test, y_test = A[opt.test_set, :], y[opt.test_set]

# Lasso feature selection
summary_list = []
for alpha in alphas:

    # select features
    opt = Optimizer((A_train, y_train), fit_method='lasso', alpha=alpha,
                    max_iter=max_iter, tol=1e-6)
    features = opt.parameters != 0

    # test pruned model
    summary = test_pruned_model(features)
    summary['alpha'] = alpha
df_lasso = pd.DataFrame(summary_list)

# Automatic Relevance Detection
summary_list = []
for threshold_lambda in lambdas:

    # select features
    opt = Optimizer((A_train, y_train), fit_method='ardr',
    features = opt.parameters != 0

    # test pruned model
    summary = test_pruned_model(features)
    summary['threshold_lambda'] = threshold_lambda
df_ardr = pd.DataFrame(summary_list)

# Bootstrap feature selection
eopt = EnsembleOptimizer((A_train, y_train), bootstrap=True, train_size=1.0,
summary_list = []
for f in fvals:
    # select features
    params, std = eopt.parameters, eopt.parameters_std
    features = np.abs(params) > f * std

    # test pruned model
    summary = test_pruned_model(features)
    summary['f'] = f
df_ensemble = pd.DataFrame(summary_list)

# analyze the predicted force constant models for a selection parameters
lasso_orbits = dict()
for ind in alpha_inds:
    fcp = ForceConstantPotential(cs, df_lasso.parameters[ind])
    lasso_orbits[ind] = pd.DataFrame(fcp.orbit_data)

ardr_orbits = dict()
for ind in lambda_inds:
    fcp = ForceConstantPotential(cs, df_ardr.parameters[ind])
    ardr_orbits[ind] = pd.DataFrame(fcp.orbit_data)

ensemble_orbits = dict()
for ind in f_inds:
    fcp = ForceConstantPotential(cs, df_ensemble.parameters[ind])
    ensemble_orbits[ind] = pd.DataFrame(fcp.orbit_data)

# plotting feature selection
figsize = (18, 4.8)
fs = 14
ms = 8
lw = 1.5
color1 = '#1f77b4'
color2 = '#ff7f0e'

fig = plt.figure(figsize=figsize)
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)

ylim1 = [0.0065, 0.01]
ylim2 = [0, 600]

ax1_y2 = ax1.twinx()
ax2_y2 = ax2.twinx()
ax3_y2 = ax3.twinx()

kwargs_rmse = dict(lw=lw, ms=ms, color=color1, label='RMSE Test')
kwargs_nzp = dict(lw=lw, ms=ms, color=color2, label='N')

ax1.semilogx(df_lasso.alpha, df_lasso.rmse_test, '-o', **kwargs_rmse)
ax1_y2.semilogx(df_lasso.alpha, df_lasso.nzp, '-o', **kwargs_nzp)

ax2.semilogx(df_ardr.threshold_lambda, df_ardr.rmse_test, '-o', **kwargs_rmse)
ax2_y2.semilogx(df_ardr.threshold_lambda, df_ardr.nzp, '-o', **kwargs_nzp)

ax3.plot(df_ensemble.f, df_ensemble.rmse_test, '-o', **kwargs_rmse)
ax3_y2.plot(df_ensemble.f, df_ensemble.nzp, '-o', **kwargs_nzp)

ax1.set_xlim([np.min(alphas), np.max(alphas)])
ax2.set_xlim([np.min(lambdas), np.max(lambdas)])
ax3.set_xlim([np.min(fvals), np.max(fvals)])

ax1.set_xlabel('Regularization parameter $\\alpha$', fontsize=fs)
ax2.set_xlabel(r'$\lambda$ - Threshold', fontsize=fs)
ax3.set_xlabel('Pruning parameter f', fontsize=fs)

ax1.set_title('Lasso', fontsize=fs)
ax2.set_title('Automatic Relevance Detection', fontsize=fs)
ax3.set_title('Bootstrap', fontsize=fs)

for ax in [ax1, ax2, ax3]:
    ax.set_ylabel('RMSE test (eV/Å)', color=color1, fontsize=fs)
    ax.tick_params(axis='y', labelcolor=color1)

for ax in [ax1_y2, ax2_y2, ax3_y2]:
    ax.set_ylabel('Number of non zero parameters', color=color2, fontsize=fs)
    ax.tick_params(labelsize=fs, labelcolor=color2)


# plot resulting models
ms = 10
alpha = 0.8
ylim = [2e-3, 20]
fig = plt.figure(figsize=figsize)
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)

for ind, df in lasso_orbits.items():
    ax1.semilogy(df.radius, df.force_constant_norm, 'o', ms=ms, alpha=alpha,
                 label='alpha = {:.1e}'.format(alphas[ind]))

for ind, df in ardr_orbits.items():
    ax2.semilogy(df.radius, df.force_constant_norm, 'o', ms=ms, alpha=alpha,
                 label='f = {:.1e}'.format(lambdas[ind]))

for ind, df in ensemble_orbits.items():
    ax3.semilogy(df.radius, df.force_constant_norm, 'o', ms=ms, alpha=alpha,
                 label='f = {:.2f}'.format(fvals[ind]))

ax1.set_title('Lasso', fontsize=fs)
ax2.set_title('Automatic Relevance Detection', fontsize=fs)
ax3.set_title('Bootstrap', fontsize=fs)

for ax in [ax1, ax2, ax3]:
    ax.set_xlabel('Pair radius (A)', fontsize=fs)
ax1.set_ylabel('Force constant norm (eV/A$^2$)', fontsize=fs)