""" 

Ensemble Optimizer 

 

https://en.wikipedia.org/wiki/Bootstrap_aggregating 

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html # NOQA 

""" 

 

import numpy as np 

from .base_optimizer import BaseOptimizer 

from .optimizer import Optimizer 

 

 

class EnsembleOptimizer(BaseOptimizer):
    """Ensemble optimizer that carries out a series of single optimization
    runs using the :class:`Optimizer` class and then provides access to
    various ensemble averaged quantities, including, e.g., errors and
    parameters.

    Warning
    -------
    Repeatedly setting up an EnsembleOptimizer and training *without*
    changing the seed for the random number generator will yield identical
    or correlated results. To avoid this, specify a different seed when
    setting up multiple EnsembleOptimizer instances.

    Parameters
    ----------
    fit_data : tuple of (N, M) NumPy array and (N) NumPy array
        the first element of the tuple represents the fit matrix `A`
        whereas the second element represents the vector of target
        values `y`; here `N` (=rows of `A`, elements of `y`) equals the
        number of target values and `M` (=columns of `A`) equals the
        number of parameters
    fit_method : string
        method to be used for training; possible choices are
        "least-squares", "lasso", "elasticnet", "bayesian-ridge", "ardr"
    standardize : bool
        whether or not to standardize the fit matrix before fitting
    ensemble_size : int
        number of fits in the ensemble
    train_size : float or int
        if float, represents the fraction of `fit_data` (rows) to be used
        for training; if int, represents the absolute number of rows to be
        used for training
    bootstrap : boolean
        if True sampling will be carried out with replacement
    seed : int
        seed for pseudo random number generator
    """

    def __init__(self, fit_data, fit_method='least-squares', standardize=True,
                 ensemble_size=50, train_size=1.0, bootstrap=True, seed=42,
                 **kwargs):

        super().__init__(fit_data, fit_method, standardize, seed)

        # set training size
        if isinstance(train_size, float):
            self._train_size = int(
                np.round(train_size * self.number_of_target_values))
        elif isinstance(train_size, int):
            self._train_size = train_size
        else:
            raise TypeError('Training size must be int or float')

        self._ensemble_size = ensemble_size
        self._bootstrap = bootstrap
        self._kwargs = kwargs
        self._train_set_list = None
        self._test_set_list = None
        self._parameter_vectors = None
        self._parameters_std = None
        self._rmse_train_ensemble = None
        self._rmse_test_ensemble = None

    def train(self):
        """
        Carry out ensemble training and construct the final model by
        averaging over all models in the ensemble.
        """
        self._run_ensemble()
        self._construct_final_model()

    def _run_ensemble(self):
        """ Construct an ensemble of models. """

        rs = np.random.RandomState(self.seed)
        optimizers = []
        for _ in range(self.ensemble_size):
            # construct training and test sets
            train_set = rs.choice(np.arange(self.number_of_target_values),
                                  self.train_size, replace=self.bootstrap)
            test_set = np.setdiff1d(
                range(self.number_of_target_values), train_set)

            # train
            opt = Optimizer(
                (self._A, self._y), self.fit_method, train_set=train_set,
                test_set=test_set, **self._kwargs)
            opt.train()
            optimizers.append(opt)

        # collect data from each fit
        self._parameter_vectors = np.array(
            [opt.parameters for opt in optimizers])
        self._train_set_list = [opt.train_set for opt in optimizers]
        self._test_set_list = [opt.test_set for opt in optimizers]
        self._rmse_train_ensemble = np.array(
            [opt.rmse_train for opt in optimizers])
        self._rmse_test_ensemble = np.array(
            [opt.rmse_test for opt in optimizers])

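    # A minimal sketch (not part of the class) of the sampling step above,
    # assuming 10 target values, bootstrap=True, and seed=42:
    #
    #   rs = np.random.RandomState(42)
    #   train_set = rs.choice(np.arange(10), 10, replace=True)
    #   # e.g. array([6, 3, 7, 4, 6, 9, 2, 6, 7, 4]); duplicates allowed
    #   test_set = np.setdiff1d(range(10), train_set)
    #   # the rows never drawn make up the out-of-bag test set: [0, 1, 5, 8]
    #
    # With bootstrap=False the training set is instead sampled without
    # replacement and the test set holds the remaining rows.
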
    def _construct_final_model(self):
        """
        Construct final model by averaging over all models in the ensemble.
        """
        self._fit_results['parameters'] = np.mean(
            self.parameter_vectors, axis=0)
        self._parameters_std = np.std(self.parameter_vectors, axis=0)

    def predict(self, A, return_std=False):
        """
        Predict data given an input matrix `A`, i.e., `Ax`, where `x` is
        the vector of fitted parameters.

        By using all parameter vectors in the ensemble a standard deviation
        of the prediction can be obtained.

        Parameters
        ----------
        A : NumPy (N, M) array
            fit matrix where `N` (=rows of `A`, elements of `y`) equals the
            number of target values and `M` (=columns of `A`) equals the
            number of parameters
        return_std : bool
            whether or not to return the standard deviation of the
            prediction

        Returns
        -------
        NumPy (N) array or (NumPy (N) array, NumPy (N) array)
            vector of predicted values; if `return_std` is True, the vector
            of standard deviations is returned as well (both reduce to
            floats if `A` is a single row)
        """
        prediction = np.dot(A, self.parameters)
        if return_std:
            predictions = np.dot(A, self.parameter_vectors.T)
            if len(predictions.shape) == 1:  # shape is (N, )
                std = np.std(predictions)
            else:  # shape is (N, M)
                std = np.std(predictions, axis=1)
            return prediction, std
        else:
            return prediction

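    # Shape sketch for predict (hypothetical sizes): with A of shape (N, M)
    # and parameter_vectors of shape (S, M), np.dot(A, parameter_vectors.T)
    # has shape (N, S), so np.std(..., axis=1) yields one standard
    # deviation per row of A. A single 1-D row of length M instead yields
    # S scalar predictions and a scalar std.
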
    @property
    def error_matrix(self):
        """
        NumPy (N, M) array : matrix of fit errors where `N` is the number
        of target values and `M` is the number of fits (i.e., the size of
        the ensemble)
        """
        if self.parameter_vectors is None:
            return None
        error_matrix = np.zeros((self._n_rows, self.ensemble_size))
        for i, parameters in enumerate(self.parameter_vectors):
            error_matrix[:, i] = np.dot(self._A, parameters) - self._y
        return error_matrix

    @property
    def summary(self):
        """ dict : comprehensive information about the optimizer. """
        info = super().summary

        # add class specific data
        info['parameters_std'] = self.parameters_std
        info['ensemble_size'] = self.ensemble_size
        info['rmse_train'] = self.rmse_train
        info['rmse_train_ensemble'] = self.rmse_train_ensemble
        info['rmse_test'] = self.rmse_test
        info['rmse_test_ensemble'] = self.rmse_test_ensemble
        info['train_size'] = self.train_size
        info['bootstrap'] = self.bootstrap

        # add kwargs used for fitting
        info = {**info, **self._kwargs}
        return info

    def __repr__(self):
        kwargs = dict()
        kwargs['fit_method'] = self.fit_method
        kwargs['ensemble_size'] = self.ensemble_size
        kwargs['train_size'] = self.train_size
        kwargs['bootstrap'] = self.bootstrap
        kwargs['seed'] = self.seed
        kwargs = {**kwargs, **self._kwargs}
        return 'EnsembleOptimizer((A, y), {})'.format(
            ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()))

    @property
    def parameters_std(self):
        """ NumPy array : standard deviation for each parameter. """
        return self._parameters_std

    @property
    def parameter_vectors(self):
        """ NumPy (ensemble_size, M) array : all parameter vectors in the
        ensemble. """
        return self._parameter_vectors

    @property
    def ensemble_size(self):
        """ int : number of fits in the ensemble. """
        return self._ensemble_size

    @property
    def rmse_train(self):
        """
        float : ensemble average of the root mean squared error over the
        train sets.
        """
        if self.rmse_train_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_train_ensemble**2))

    @property
    def rmse_train_ensemble(self):
        """ NumPy array : root mean squared train error obtained for each
        fit in the ensemble. """
        return self._rmse_train_ensemble

    @property
    def rmse_test(self):
        """
        float : ensemble average of the root mean squared error over the
        test sets.
        """
        if self.rmse_test_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_test_ensemble**2))

    @property
    def rmse_test_ensemble(self):
        """ NumPy array : root mean squared test error obtained for each
        fit in the ensemble. """
        return self._rmse_test_ensemble

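    # Note: rmse_train and rmse_test above average the per-fit errors in
    # the quadratic mean, e.g. for per-fit RMSEs [1.0, 2.0] they give
    # np.sqrt(np.mean(np.array([1.0, 2.0])**2)) ~= 1.58, not the
    # arithmetic mean 1.5.
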
    @property
    def train_size(self):
        """ int : number of rows included in the train sets; note that this
        differs from the number of unique rows when bootstrapping. """
        return self._train_size

    @property
    def train_fraction(self):
        """ float : fraction of input data used for training; this value
        can differ slightly from the value set during initialization due
        to rounding. """
        return self.train_size / self._n_rows

    @property
    def bootstrap(self):
        """ boolean : True if sampling is carried out with replacement. """
        return self._bootstrap
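
# A minimal usage sketch (hypothetical data; assumes the module is imported
# as part of its package so the relative imports resolve):
#
#   import numpy as np
#
#   M = 10
#   A = np.random.normal(size=(200, M))                # fit matrix
#   y = A @ np.arange(M) + np.random.normal(size=200)  # target values
#   eopt = EnsembleOptimizer((A, y), ensemble_size=50, seed=100)
#   eopt.train()
#   y_pred, y_std = eopt.predict(A, return_std=True)   # ensemble mean, std
#   print(eopt.rmse_test, eopt.parameters_std)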