Learning curve¶

This tutorial demonstrates the convergence behavior of two common optimization algorithms with respect to training set size. Specifically, it considers standard least-squares regression and automated relevance detection regression (ARDR).

Following the definition of basic parameters and the generation of a set of reference structures (10), a fourth-order model for EMT Ni is prepared. Based on these data fit matrix and target vector are constructed. Subsequently, the CrossValidationEstimator is used to obtain the evoluation of the cross validation (CV) score as a function of training set size and the results are plotted.

Warning

Please note that calling functions that rely on the generation of pseudo- random numbers repeatedly with the same seed (i.e., repeatedly falling back to the default value) is strongly discouraged as it will lead to correlation. To circumvent this problem one can for example seed a sequence of random numbers and then use these numbers in turn as seeds.

Source code¶

The complete source code is available in
tutorial/advanced/learning_curve/learning_curve.py
"""
Creates a convergence plot comparing L2 and ardr

Runs in approximately 600 seconds on an Intel i7-950
"""

import os
from ase.io import write
from ase.build import bulk
from ase.calculators.emt import EMT
from hiphive.structure_generation import generate_mc_rattled_structures
from hiphive import ClusterSpace, StructureContainer
from hiphive.fitting import CrossValidationEstimator as CVE
from matplotlib import pyplot as plt
from matplotlib.patches import ConnectionPatch
import numpy as np

# parameters
structures_fname = 'structures/rattled_structures.extxyz'
number_of_structures = 10
cell_size = 4
rattle_std = 0.03
minimum_distance = 2.3

# setup
atoms_ideal = bulk('Ni', cubic=True).repeat(cell_size)
calc = EMT()

# generate structures
structures = generate_mc_rattled_structures(
atoms_ideal, number_of_structures, rattle_std, minimum_distance)
for structure in structures:
structure.set_calculator(calc)
forces = structure.get_forces()

displacements = structure.positions - atoms_ideal.get_positions()
structure.new_array('displacements', displacements)
structure.new_array('forces', forces)

structure.positions = atoms_ideal.get_positions()
structure.calc = None

# save structures
if not os.path.isdir(os.path.dirname(structures_fname)):
os.mkdir(os.path.dirname(structures_fname))
write(structures_fname, structures)

# read structures containing displacements and forces

# set up cluster space
cutoffs = [6.0, 5.0, 4.0]
cs = ClusterSpace(structures[0], cutoffs)
print(cs)
cs.print_orbits()

# ... and structure container
sc = StructureContainer(cs)
for structure in structures:
print(sc)

# First we generate a lot of models for each training size by chosing from the
# pool (the complete set of force components). Then they are validated against
# the remaining pool

# The complete data set
M, f = sc.get_fit_data()

# some setup for the plots
colors = plt.get_cmap('tab10').colors
fit_methods = ['least-squares', 'ardr']
# the first is for L2 the second for ardr
training_sizes = [np.logspace(np.log10(30), np.log10(2000), 30),
np.logspace(np.log10(30), np.log10(400), 10)]
plt.clf()
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
plt.title('Learning curve')
plt.xlabel('Training size [# force components]')
plt.ylabel('Validation rms error [eV/Å]')

# Using the CVE we can construct the plot by looping over different traning
# sizes. We plot the mean and std for all the models produces at each traning
# size.
for i, fit_method in enumerate(fit_methods):
mean = []
std = []
x = [int(ts) for ts in training_sizes[i]]
for train_size in x:
opt = CVE((M, f),
fit_method=fit_method,
validation_method='shuffle-split',
number_of_splits=10,
train_size=train_size,
test_size=5000,
)
opt.validate()
mean.append(opt.rmse_validation)
std.append(np.std(opt.rmse_validation_splits))
# Plot the result for the current method
mean = np.array(mean)
std = np.array(std)
ax.loglog(x, mean, color=colors[i], label=fit_method)
ax.loglog(x, mean + std, '--', color=colors[i])
ax.loglog(x, mean - std, '--', color=colors[i])
fig.savefig('convergence.svg')
ax.set_ylim(0.01, 1)

# Now we run a cv for one sample at 1000 force components
size = 1000
test_size = 100
fit_method = 'least-squares'
training_sizes_cv = training_sizes[0][training_sizes[0] <= (size-test_size)]
sample = np.random.choice(range(len(f)), size=size, replace=False)
M_partial = M[sample, :]
f_partial = f[sample]
rmse_list = []
for train_size in (int(ts) for ts in training_sizes_cv):
opt = CVE((M_partial, f_partial),
fit_method=fit_method,
validation_method='shuffle-split',
train_size=train_size,
test_size=100,
number_of_splits=100,
)
opt.validate()
rmse_list.append(opt.rmse_validation)
opt.train()
rmse = np.sqrt(np.mean((f-np.dot(M, opt.parameters))**2))
ax.loglog(training_sizes_cv, rmse_list, color=colors[3],
label=fit_method+' CV')
ax.loglog([size], [rmse], marker='x', color=colors[3])
plt.savefig('convergence.svg')

ax.legend(loc='lower left')

# Ad some scatter plots for different training sizes
scatters = []
method = 'least-squares'
sizes = [200, 250, 1000]
for size in sizes:
sample = np.random.choice(range(len(f)), size=size, replace=False)
opt = CVE((M[sample, :], f[sample]),
fit_method=method,
validation_method='shuffle-split',
number_of_splits=10,
train_size=train_size,
test_size=5000,
)
opt.train()
parameters = opt.parameters
# save the predicted forces
scatters.append(np.dot(M, opt.parameters))
axes = [fig.add_axes([0.55, 0.70, 0.2, 0.18]),
for ax, f_pred, size in zip(axes, scatters, sizes):
rmse = np.mean((f-f_pred)**2)**0.5
ax.scatter(f[::10], f_pred[::10], s=0.1)
ax.set_aspect('equal', anchor='SW')
ax.axis([-5, 5, -5, 5])
ax.plot([-3, 3], [-3, 3], 'k')
ax.tick_params(labelsize='small')
ax.set_xticks([-3, 0, 3])
ax.set_yticks([-3, 0, 3])
cp = ConnectionPatch((0, 0), (size, rmse),
coordsA='axes fraction', coordsB='data',
axesA=ax, axesB=fig.axes[0],
arrowstyle='->')