Source code for hiphive.fitting.ensemble_optimizer

"""
Ensemble Optimizer

https://en.wikipedia.org/wiki/Bootstrap_aggregating
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html  # NOQA
"""

import numpy as np
from .base_optimizer import BaseOptimizer
from .optimizer import Optimizer


class EnsembleOptimizer(BaseOptimizer):
    """Ensemble optimizer that carries out a series of single optimization
    runs using the :class:`Optimizer` class and then provides access to
    various ensemble averaged quantities including, e.g., errors and
    parameters.

    Warning
    -------
    Repeatedly setting up an EnsembleOptimizer and training *without*
    changing the seed for the random number generator will yield identical
    or correlated results. To avoid this, please specify a different seed
    when setting up multiple EnsembleOptimizer instances.

    Parameters
    ----------
    fit_data : tuple of (N, M) NumPy array and (N) NumPy array
        the first element of the tuple represents the fit matrix `A`
        whereas the second element represents the vector of target values
        `y`; here `N` (=rows of `A`, elements of `y`) equals the number of
        target values and `M` (=columns of `A`) equals the number of
        parameters
    fit_method : string
        method to be used for training; possible choices are
        "least-squares", "lasso", "elasticnet", "bayesian-ridge", "ardr"
    ensemble_size : int
        number of fits in the ensemble
    train_size : float or int
        if float, represents the fraction of rows in `fit_data` to be used
        for training; if int, represents the absolute number of rows to be
        used for training
    bootstrap : boolean
        if True sampling will be carried out with replacement
    seed : int
        seed for pseudo random number generator
    """

    def __init__(self, fit_data, fit_method='least-squares',
                 ensemble_size=50, train_size=1.0, bootstrap=True,
                 seed=42, **kwargs):

        BaseOptimizer.__init__(self, fit_data, fit_method, seed)

        # set training size
        if isinstance(train_size, float):
            self._train_size = int(
                np.round(train_size * self.number_of_target_values))
        elif isinstance(train_size, int):
            self._train_size = train_size
        else:
            raise TypeError('Training size must be int or float')

        self._ensemble_size = ensemble_size
        self._bootstrap = bootstrap
        self._kwargs = kwargs

        self._train_set_list = None
        self._test_set_list = None
        self._parameter_vectors = None
        self._parameters_std = None
        self._rmse_train_ensemble = None
        self._rmse_test_ensemble = None
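    # A hypothetical illustration of the seed warning in the class
    # docstring; `A` and `y` stand for any valid fit data:
    #
    #   opt1 = EnsembleOptimizer((A, y), seed=42)
    #   opt2 = EnsembleOptimizer((A, y), seed=42)  # same splits as opt1
    #   opt3 = EnsembleOptimizer((A, y), seed=43)  # independent splits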
    def train(self):
        """
        Carry out ensemble training and construct the final model by
        averaging over all models in the ensemble.
        """
        self._run_ensemble()
        self._construct_final_model()
    def _run_ensemble(self):
        """ Construct an ensemble of models. """

        rs = np.random.RandomState(self.seed)
        optimizers = []
        for _ in range(self.ensemble_size):
            # construct training and test sets
            train_set = rs.choice(np.arange(self.number_of_target_values),
                                  self.train_size, replace=self.bootstrap)
            test_set = np.setdiff1d(
                range(self.number_of_target_values), train_set)
            # train
            opt = Optimizer((self._A, self._y), self.fit_method,
                            train_set=train_set, test_set=test_set,
                            **self._kwargs)
            opt.train()
            optimizers.append(opt)

        # collect data from each fit
        self._parameter_vectors = np.array(
            [opt.parameters for opt in optimizers])
        self._train_set_list = [opt.train_set for opt in optimizers]
        self._test_set_list = [opt.test_set for opt in optimizers]
        self._rmse_train_ensemble = np.array(
            [opt.rmse_train for opt in optimizers])
        self._rmse_test_ensemble = np.array(
            [opt.rmse_test for opt in optimizers])

    def _construct_final_model(self):
        """
        Construct the final model by averaging over all models in the
        ensemble.
        """
        self._fit_results['parameters'] = np.mean(
            self.parameter_vectors, axis=0)
        self._parameters_std = np.std(self.parameter_vectors, axis=0)
    def predict(self, A, return_std=False):
        """
        Predict data given an input matrix `A`, i.e., `Ax`, where `x` is
        the vector of the fitted parameters.

        By using all parameter vectors in the ensemble a standard deviation
        of the prediction can be obtained.

        Parameters
        ----------
        A : NumPy (N, M) array
            fit matrix where `N` (=rows of `A`, elements of `y`) equals the
            number of target values and `M` (=columns of `A`) equals the
            number of parameters
        return_std : bool
            whether or not to return the standard deviation of the
            prediction

        Returns
        -------
        (NumPy (N) array, NumPy (N) array) or (float, float)
            vector of predicted values, vector of standard deviations
        """
        prediction = np.dot(A, self.parameters)
        if return_std:
            predictions = np.dot(A, self.parameter_vectors.T)
            if len(predictions.shape) == 1:
                # A was a single row; shape is (ensemble_size, )
                std = np.std(predictions)
            else:
                # shape is (N, ensemble_size)
                std = np.std(predictions, axis=1)
            return prediction, std
        else:
            return prediction
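    # Usage sketch (hypothetical names): given a trained instance `ensopt`
    # and a fit matrix `A_new` with the same number of columns as the
    # training matrix, predictions and their ensemble spread are obtained
    # via
    #
    #   y_pred, y_std = ensopt.predict(A_new, return_std=True)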
    @property
    def error_matrix(self):
        """
        NumPy (N, M) array : matrix of fit errors where `N` is the number
        of target values and `M` is the number of fits (i.e., the size of
        the ensemble)
        """
        error_matrix = np.zeros((self._Nrows, self.ensemble_size))
        for i, parameters in enumerate(self.parameter_vectors):
            error_matrix[:, i] = np.dot(self._A, parameters) - self._y
        return error_matrix

    @property
    def summary(self):
        """ dict : comprehensive information about the optimizer. """
        info = super().summary

        # add class specific data
        info['parameters_std'] = self.parameters_std
        info['ensemble_size'] = self.ensemble_size
        info['rmse_train'] = self.rmse_train
        info['rmse_train_ensemble'] = self.rmse_train_ensemble
        info['rmse_test'] = self.rmse_test
        info['rmse_test_ensemble'] = self.rmse_test_ensemble
        info['train_size'] = self.train_size
        info['bootstrap'] = self.bootstrap

        # add kwargs used for fitting
        info = {**info, **self._kwargs}

        return info

    def __repr__(self):
        kwargs = dict()
        kwargs['fit_method'] = self.fit_method
        kwargs['ensemble_size'] = self.ensemble_size
        kwargs['train_size'] = self.train_size
        kwargs['bootstrap'] = self.bootstrap
        kwargs['seed'] = self.seed
        kwargs = {**kwargs, **self._kwargs}
        return 'EnsembleOptimizer((A, y), {})'.format(
            ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()))

    @property
    def parameters_std(self):
        """ NumPy array : standard deviation for each parameter. """
        return self._parameters_std

    @property
    def parameter_vectors(self):
        """ list : all parameter vectors in the ensemble. """
        return self._parameter_vectors

    @property
    def ensemble_size(self):
        """ int : number of fits (models) in the ensemble. """
        return self._ensemble_size

    @property
    def rmse_train(self):
        """
        float : ensemble average of the root mean squared error over train
        sets.
        """
        if self.rmse_train_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_train_ensemble**2))

    @property
    def rmse_train_ensemble(self):
        """
        list : root mean squared train errors obtained for each fit in the
        ensemble.
        """
        return self._rmse_train_ensemble

    @property
    def rmse_test(self):
        """
        float : ensemble average of the root mean squared error over test
        sets.
        """
        if self.rmse_test_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_test_ensemble**2))

    @property
    def rmse_test_ensemble(self):
        """
        list : root mean squared test errors obtained for each fit in the
        ensemble.
        """
        return self._rmse_test_ensemble

    @property
    def train_size(self):
        """
        int : number of rows included in train sets; note that this will
        differ from the number of unique rows when bootstrapping.
        """
        return self._train_size

    @property
    def train_fraction(self):
        """
        float : fraction of input data used for training; this value can
        differ slightly from the value set during initialization due to
        rounding.
        """
        return self.train_size / self._Nrows

    @property
    def bootstrap(self):
        """ boolean : True if sampling is carried out with replacement. """
        return self._bootstrap
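The following is a minimal, self-contained usage sketch based on the
interface above; the synthetic data, shapes, and variable names (`A`, `y`,
`x_true`, `ensopt`) are illustrative assumptions rather than part of the
hiphive API.

if __name__ == '__main__':
    # synthetic fit data: 50 target values, 5 parameters (assumed sizes)
    rng = np.random.RandomState(0)
    A = rng.normal(size=(50, 5))
    x_true = rng.normal(size=5)
    y = A.dot(x_true) + 0.01 * rng.normal(size=50)

    # train an ensemble of 20 least-squares fits on 80% subsets
    ensopt = EnsembleOptimizer((A, y), fit_method='least-squares',
                               ensemble_size=20, train_size=0.8, seed=7)
    ensopt.train()

    # ensemble-averaged parameters with standard deviations, and
    # predictions with an ensemble-based uncertainty estimate
    print(ensopt.parameters, ensopt.parameters_std)
    y_pred, y_std = ensopt.predict(A, return_std=True)
    print(ensopt.rmse_train, ensopt.rmse_test)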