Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

""" 

Optimizer 

""" 

import numpy as np 

from sklearn.model_selection import train_test_split 

from .base_optimizer import BaseOptimizer 

from .fit_methods import fit 

from .tools import ScatterData 

 

 

class Optimizer(BaseOptimizer):
    """
    Optimizer for a single `Ax = y` fit.

    One has to specify either `train_size`/`test_size` or
    `train_set`/`test_set`. If either `train_set` or `test_set` (or both)
    is specified the fractions will be ignored.

    Warning
    -------
    Repeatedly setting up an Optimizer and training
    *without* changing the seed for the random number generator will yield
    identical or correlated results, to avoid this please specify a different
    seed when setting up multiple Optimizer instances.

    Parameters
    ----------
    fit_data : tuple of NumPy (N, M) array and NumPy (N) array
        the first element of the tuple represents the fit matrix `A`
        whereas the second element represents the vector of target
        values `y`; here `N` (=rows of `A`, elements of `y`) equals the number
        of target values and `M` (=columns of `A`) equals the number of
        parameters
    fit_method : string
        method to be used for training; possible choices are
        "least-squares", "lasso", "elasticnet", "bayesian-ridge", "ardr"
    standardize : bool
        whether or not to standardize the fit matrix before fitting
    train_size : float or int
        If float represents the fraction of `fit_data` (rows) to be used for
        training. If int, represents the absolute number of rows to be used for
        training.
    test_size : float or int
        If float represents the fraction of `fit_data` (rows) to be used for
        testing. If int, represents the absolute number of rows to be used for
        testing.
    train_set : tuple/list of ints
        indices of rows of `A`/`y` to be used for training
    test_set : tuple/list of ints
        indices of rows of `A`/`y` to be used for testing
    seed : int
        seed for pseudo random number generator
    **kwargs
        additional keyword arguments that are forwarded unchanged to the
        fit function when `train` is called

    Attributes
    ----------
    train_scatter_data : ScatterData object (namedtuple)
        target and predicted value for each row in the training set
    test_scatter_data : ScatterData object (namedtuple)
        target and predicted value for each row in the test set
    """

    def __init__(self, fit_data, fit_method='least-squares', standardize=True,
                 train_size=0.75, test_size=None, train_set=None,
                 test_set=None, seed=42, **kwargs):

        super().__init__(fit_data, fit_method, standardize, seed)

        # extra options are stored and passed on to `fit` in `train`
        self._kwargs = kwargs

        # setup train and test sets
        self._setup_rows(train_size, test_size, train_set, test_set)

        # will be populated once `train` has been run
        self._rmse_train = None
        self._rmse_test = None
        self._contributions_train = None
        self._contributions_test = None
        self.train_scatter_data = None
        self.test_scatter_data = None

    def train(self):
        """
        Carry out training.

        Fits the rows of the training set, then stores the RMSE, the
        parameter contributions and the target/predicted scatter data for
        the training set and, if a test set exists, for the test set.
        """

        # select training data
        A_train = self._A[self.train_set, :]
        y_train = self._y[self.train_set]

        # perform training
        self._fit_results = fit(A_train, y_train, self.fit_method,
                                self.standardize, **self._kwargs)
        self._rmse_train = self.compute_rmse(A_train, y_train)
        self._contributions_train = self.get_contributions(A_train)
        self.train_scatter_data = ScatterData(y_train, self.predict(A_train))

        # perform testing
        if self.test_set is not None:
            A_test = self._A[self.test_set, :]
            y_test = self._y[self.test_set]
            self._rmse_test = self.compute_rmse(A_test, y_test)
            self._contributions_test = self.get_contributions(A_test)
            self.test_scatter_data = ScatterData(y_test, self.predict(A_test))
        else:
            # no test set: make sure no test results are reported
            self._rmse_test = None
            self._contributions_test = None
            self.test_scatter_data = None

    def _setup_rows(self, train_size, test_size, train_set, test_set):
        """
        Set up train and test rows depending on which arguments are
        specified.

        If `train_set` and `test_set` are `None` then `train_size` and
        `test_size` are used.

        Raises
        ------
        ValueError
            if no training rows are selected or if the training and test
            sets share at least one row
        """

        if train_set is None and test_set is None:
            train_set, test_set = self._get_rows_via_sizes(
                train_size, test_size)
        else:
            train_set, test_set = self._get_rows_from_indices(
                train_set, test_set)

        if len(train_set) == 0:
            raise ValueError('No training rows selected from fit_data')

        if test_set is not None:  # then check overlap between train and test
            if len(np.intersect1d(train_set, test_set)):
                raise ValueError('Overlap between training and test set')
            # an empty test set is represented by None
            if len(test_set) == 0:
                test_set = None

        self._train_set = train_set
        self._test_set = test_set

    def _get_rows_via_sizes(self, train_size, test_size):
        """
        Gets train and test rows via sizes.

        Returns
        -------
        tuple
            training row indices and test row indices; the latter is `None`
            if all rows are used for training

        Raises
        ------
        ValueError
            if both sizes are `None` or if the sizes leave the training set
            empty
        """

        # Handle special cases
        if test_size is None and train_size is None:
            raise ValueError('Training and test set sizes are None (empty).')
        elif train_size is None and abs(test_size - 1.0) < 1e-10:
            raise ValueError('Training set is empty.')

        elif test_size is None:
            # All rows are requested for training, hence there is nothing
            # left to split off for testing.
            # NOTE(review): an integer `train_size` of 1 also satisfies the
            # second condition and is treated as "all rows" -- confirm that
            # this is intended.
            if train_size == self._n_rows or abs(train_size-1.0) < 1e-10:
                train_set = np.arange(self._n_rows)
                test_set = None
                return train_set, test_set

        # split
        train_set, test_set = train_test_split(np.arange(self._n_rows),
                                               train_size=train_size,
                                               test_size=test_size,
                                               random_state=self.seed)

        return train_set, test_set

    def _get_rows_from_indices(self, train_set, test_set):
        """
        Gets rows via indices.

        If only one of the two sets is given, the other one is made up of
        all remaining rows (in ascending order).

        Returns
        -------
        tuple of NumPy arrays
            training row indices and test row indices

        Raises
        ------
        ValueError
            if both `train_set` and `test_set` are `None`
        """
        if train_set is None and test_set is None:
            raise ValueError('Training and test sets are None (empty)')
        elif test_set is None:
            # complement of the training set; setdiff1d avoids a linear
            # membership scan of `train_set` for every single row
            test_set = np.setdiff1d(np.arange(self._n_rows), train_set)
        elif train_set is None:
            train_set = np.setdiff1d(np.arange(self._n_rows), test_set)
        return np.array(train_set), np.array(test_set)

    @property
    def summary(self):
        """ dict : Comprehensive information about the optimizer. """
        info = super().summary

        # Add class specific data
        info['rmse_train'] = self.rmse_train
        info['rmse_test'] = self.rmse_test
        info['train_size'] = self.train_size
        info['train_set'] = self.train_set
        info['test_size'] = self.test_size
        info['test_set'] = self.test_set
        info['contributions_train'] = self.contributions_train
        info['contributions_test'] = self.contributions_test
        info['train_scatter_data'] = self.train_scatter_data
        info['test_scatter_data'] = self.test_scatter_data

        # add kwargs used for fitting; on a key clash the kwarg value wins
        info = {**info, **self._kwargs}
        return info

    def __repr__(self):
        kwargs = dict()
        kwargs['fit_method'] = self.fit_method
        kwargs['train_size'] = self.train_size
        kwargs['test_size'] = self.test_size
        kwargs['train_set'] = self.train_set
        kwargs['test_set'] = self.test_set
        kwargs['seed'] = self.seed
        kwargs = {**kwargs, **self._kwargs}
        return 'Optimizer((A, y), {})'.format(
            ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()))

    @property
    def rmse_train(self):
        """ float : root mean squared error for training set. """
        return self._rmse_train

    @property
    def rmse_test(self):
        """ float : root mean squared error for test set. """
        return self._rmse_test

    @property
    def contributions_train(self):
        """ NumPy array : the average contribution to the predicted values for
        the train set from each parameter."""
        return self._contributions_train

    @property
    def contributions_test(self):
        """ NumPy array : the average contribution to the predicted values for
        the test set from each parameter."""
        return self._contributions_test

    @property
    def train_set(self):
        """ list : indices of the rows included in the training set. """
        return self._train_set

    @property
    def test_set(self):
        """ list : indices of the rows included in the test set
        (`None` if there is no test set). """
        return self._test_set

    @property
    def train_size(self):
        """ int : number of rows included in training set. """
        return len(self.train_set)

    @property
    def train_fraction(self):
        """ float : fraction of rows included in training set. """
        return self.train_size / self._n_rows

    @property
    def test_size(self):
        """ int : number of rows included in test set. """
        if self.test_set is None:
            return 0
        return len(self.test_set)

    @property
    def test_fraction(self):
        """ float : fraction of rows included in test set. """
        if self.test_set is None:
            return 0.0
        return self.test_size / self._n_rows