# -*- coding: utf-8 -*-
"""
Created on Mon Nov 7 17:28:48 2022
@author: dboateng
This module contains the regression routines. There are three layers for
bootstrapped forward selection regression:
- The ``BootstrappedRegression`` class is the outer layer. It implements the
  bootstrapping loop. This class has a ``regressor`` member that implements
  the single regression step (i.e. a fit and a predict method). This can be a
  ``ForwardSelection`` object, but also any similar routine; note that ``fit``
  is called as ``fit(X_train, y_train, X_test, y_test)``, so sklearn
  estimators such as ``Lasso`` need a thin wrapper around ``fit``.
- The ``ForwardSelection`` class is the next layer. This class implements a
  forward selection loop. It again has a regressor object that has to
  implement ``get_coefs``, ``set_coefs``, and ``average_coefs``. Additionally
  the regressor object has to implement ``fit_active``, ``fit``, and
  ``predict``.
An example of such a regressor object is ``MultipleLSRegression``.
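
Example
-------
A minimal sketch of how the three layers are composed by nesting (``cv=5``
is only an illustrative choice; see ``BootstrappedForwardSelection`` below
for a shortcut)::

    model = BootstrappedRegression(
        regressor=ForwardSelection(MultipleLSRegression()),
        cv=5,
    )
    model.fit(X, y)           # X: pd.DataFrame of predictors, y: pd.Series
    y_hat = model.predict(X)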
"""
from copy import copy
import numpy as np
import pandas as pd
#import statsmodels.api as sm
from sklearn.model_selection import check_cv
from sklearn.linear_model import LinearRegression
__all__ = ["BootstrappedRegression",
"ForwardSelection",
"BootstrappedForwardSelection",
"MultipleLSRegression",
]
class MetaEstimator:
"""
Meta estimators are classes that implement ``fit`` and ``predict``, but don't perform the
regression themselves. They all have a member variable ``regressor`` that performs this task.
    This class contains getters and setters that simply delegate to the ``regressor`` object.
"""
def get_coefs(self):
return self.regressor.get_coefs()
def set_coefs(self, coefs):
self.regressor.set_coefs(coefs)
# set coef_ and intercept_ for ease of use
if hasattr(self.regressor, 'coef_'):
self.coef_ = self.regressor.coef_
if hasattr(self.regressor, 'intercept_'):
self.intercept_ = self.regressor.intercept_
if hasattr(self.regressor, 'coefs_'):
self.coefs_ = self.regressor.coefs_
if hasattr(self.regressor, 'intercepts_'):
self.intercepts_ = self.regressor.intercepts_
def get_params(self):
return self.regressor.get_params()
def set_params(self, **params):
self.regressor.set_params(**params)
###############################################################################################
# BootstrappedRegression and ForwardSelection
###############################################################################################
class BootstrappedRegression(MetaEstimator):
"""
Performs a regression in a bootstrapping loop.
    This splits the data multiple times into training and test data and
    performs a regression for each split. In each loop
    the calculated parameters are stored. The final model uses the average of
    the coefficients over all splits.
    If the model is a ``LinearModel`` from sklearn (i.e. it has the attributes
    ``coef_`` and ``intercept_``), the averaging routine does not have to be
    implemented. However, it can be implemented if something other than an
    arithmetic mean should be used (e.g. if only the average of robust
    predictors should be taken and everything else should be set to zero).
    Since this class mimics the sklearn estimator interface, it can to some
    extent be used interchangeably with other sklearn regressors.
Parameters
----------
regressor : regression object
This should be an object similar to sklearn-like regressors that
provides the methods ``fit(self, X_train, y_train, X_test, y_test)``
and ``predict(self, X)``.
This must also provide the methods ``get_coefs(self)``,
``set_coefs(self, coefs)``, and ``average_coefs(self, list_of_coefs)``.
An example of this is ``ForwardSelection`` below.
The regressor can also have a member variable ``additional_results``,
which should be a dictionary of parameters that are calculated during
fitting but not needed for predicting, for example metrics like the
explained variance of predictors. In this case the regressor also needs
the method ``average_additional_results(self, list_of_dicts)`` and
``set_additional_results(self, mean_additional_results)``.
cv : integer or cross-validation generator (optional, default: None)
This determines how the data are split:
        * If ``cv=None``, sklearn's default cross-validation will be used
          (5-fold ``KFold`` in recent sklearn versions).
* If ``cv=n`` where ``n`` is an integer, n-fold cross-validation will be used.
* If ``cv=some_object``, where ``some_object`` implements a
``some_object.split(X, y)`` method that returns indices for training
and test set, this will be used. It is recommended to use
``YearlyBootstrapper()`` from ``stat_downscaling_tools.bootstrap``.
Attributes
----------
    mean_coefs : type and shape depend on regressor (only after fitting)
        Fitted coefficients (mean over all models in which the coefficients
        were nonzero).
    cv_error : float (only after fitting)
        Mean of the root-mean-square errors on the test sets during the
        bootstrapping loop.
If the regressor object has the attributes ``intercept_`` and ``coef_``,
these will also be set here.
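
    Example
    -------
    A minimal sketch using ``ForwardSelection`` as the inner regressor (an
    integer ``cv`` is only an illustrative choice; any object with a
    ``split(X, y)`` method works as well)::

        model = BootstrappedRegression(
            ForwardSelection(MultipleLSRegression()), cv=5)
        model.fit(X, y)           # X: pd.DataFrame, y: pd.Series
        y_hat = model.predict(X)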
"""
def __init__(self, regressor, cv=None):
self.regressor = regressor
self.cv = cv
        # the checked attribute must match the name used in fit()
        self.has_additional_results = hasattr(self.regressor, 'additional_results')
    def fit(self, X, y):
"""
Fits a model in a bootstrapping loop.
Parameters
----------
X : pd.DataFrame
DataFrame of predictors
y : pd.Series
Series of predictand
"""
        # assign cross-validation scheme
        cv = check_cv(self.cv, classifier=False)
        # number of splits in cross-validation:
        # when using cross-validation procedures from sklearn, there is sometimes
        # no attribute cv.n_splits, therefore n_splits has to be obtained with the
        # built-in method cv.get_n_splits()
        if not hasattr(cv, 'n_splits'):
            n_splits = cv.get_n_splits(X)
        # otherwise n_splits is just taken directly:
        else:
            n_splits = cv.n_splits
cv_error = np.zeros(n_splits)
coefs = []
if self.has_additional_results:
additional_results = []
for k, (train, test) in enumerate(cv.split(X, y)):
            # split the data into training and test parts
X_train = X.values[train]
X_test = X.values[test]
y_train = y.values[train]
y_test = y.values[test]
# Regression
self.regressor.fit(X_train, y_train, X_test, y_test)
# get coefficients and error
coefs.append(self.regressor.get_coefs())
if self.has_additional_results:
additional_results.append(copy(self.regressor.additional_results))
cv_error[k] = np.sqrt(np.mean((y_test - self.regressor.predict(X_test))**2))
        # average the coefficients and the additional results
self.mean_coefs = self.regressor.average_coefs(coefs)
self.regressor.set_coefs(self.mean_coefs)
self.cv_error = np.mean(cv_error)
if self.has_additional_results:
mean_add_results = self.regressor.average_additional_results(additional_results)
self.regressor.set_additional_results(mean_add_results)
# set coef_ and intercept_ for ease of use
if hasattr(self.regressor, 'coef_'):
self.coef_ = self.regressor.coef_
if hasattr(self.regressor, 'intercept_'):
self.intercept_ = self.regressor.intercept_
    def predict(self, X):
"""
Predicts values from previously fitted coefficients.
If the input X is a pandas DataFrame or Series, a Series is returned,
otherwise only a numpy array.
Parameters
----------
X : pd.DataFrame
Returns
-------
y : pd.Series
"""
y = self.regressor.predict(X)
if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
return pd.Series(data=y, index=X.index)
else:
return y
    def fit_predict(self, X, y):
        """Fits the model and returns the prediction on ``X``."""
        self.fit(X, y)
        return self.predict(X)
class ForwardSelection(MetaEstimator):
"""
Performs a forward selection regression.
    This selects, step by step, the next most promising candidate predictor
    and adds it to the model if it is good enough. The method is outlined in
    "Statistical Analysis in Climate Research" (von Storch and Zwiers, 1999).
Since this object is intended to be used in the BootstrappedRegression
class, it implements all necessary methods.
Parameters
----------
regressor : regression object
This should be an object similar to sklearn-like regressors that
provides the methods fit and predict. Furthermore, it must also provide
the methods ``get_coefs``, ``set_coefs``, ``average_coefs``, and ``fit_active``.
An example of this is ``MultipleLSRegression`` below.
    min_explained_variance : float, optional (default: 0.02)
        If the inclusion of the staged candidate predictor doesn't improve the
        explained variance on the test set by at least this amount, the
        selection process stops.
Attributes
----------
    explained_variances : numpy array (only after fitting)
        Explained variance contributed by each predictor.
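
    Example
    -------
    A minimal sketch on synthetic numpy arrays (in practice this class is
    wrapped in ``BootstrappedRegression``, which provides the train/test
    splits; note the remark in ``fit`` about the feature-count check of
    recent sklearn versions)::

        rng = np.random.default_rng(0)
        X = rng.standard_normal((80, 3))
        y = 2.0 * X[:, 0] + 0.1 * rng.standard_normal(80)
        fs = ForwardSelection(MultipleLSRegression())
        fs.fit(X[:60], y[:60], X[60:], y[60:])
        y_hat = fs.predict(X[60:])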
"""
def __init__(self, regressor, min_explained_variance=0.02):
self.regressor = regressor
self.min_explained_variance = min_explained_variance
self.additional_results = {}
    def fit(self, X_train, y_train, X_test, y_test):
"""
Cross-validated forward selection. This fits a regression model
according to the following algorithm:
1) Start with yhat = mean(y), res = y - yhat, active = []
2) for each predictor in inactive set:
- add to active set
- perform regression
- get error and uncertainty of error (standard deviation)
- remove from active set
3) add predictor with lowest error on test set to active set
4) if improvement was not good enough, abort and use previous model.
Parameters
----------
X_train : numpy array of shape #samples x #predictors
Array that holds the values of the predictors (columns) at
different times (rows) for the training dataset.
y_train : numpy array of length #samples
Training predictand data
X_test : numpy array of shape #samples x #predictors
Test predictor data
y_test : numpy array of length #samples
Test predictand data
        Notes
        -----
        Nothing is returned; the explained variance of each predictor is
        stored in ``self.additional_results['explained variances']``.
        """
# get some memory
n_samples, n_predictors = X_train.shape
X_train = np.array(X_train) # deep copy
X_test = np.array(X_test) # deep copy
        # The candidate predictor time series are stored in a dictionary, so
        # the ones that are already selected can easily be removed while each
        # predictor keeps its original column index, which is needed to set
        # the correct coefficient.
X_inactive = {i:X_train[:,i] for i in range(n_predictors)}
active = []
# initial model: no active predictor
self.fit_active(X_train, y_train, active)
        residual_test = y_test - self.regressor.predict(X_test)
        # baseline mean squared error of the intercept-only model, used as the
        # reference when computing the explained variance below
        SST = np.mean(residual_test**2)
explained_variance = 0
exp_var_predictors = np.zeros(n_predictors)
old_coefs = self.regressor.get_coefs()
for k in range(n_predictors):
            # try adding each predictor from the inactive set in turn
inactive_mse_test = []
inactive_mse_train = []
inactive_coefs = []
inactive = []
for idx in X_inactive:
inactive.append(idx)
active.append(idx)
self.fit_active(X_train, y_train, active)
inactive_coefs.append(self.regressor.get_coefs())
                # note: with recent sklearn versions, predicting with all
                # columns can require turning `reset` in `_validate_data` to
                # False in sklearn's base classes
                residual_test = y_test - self.regressor.predict(X_test)
                residual_train = y_train - self.regressor.predict(X_train)
                inactive_mse_test.append(np.mean(residual_test**2))
                inactive_mse_train.append(np.mean(residual_train**2))
active.pop()
            # find the best predictor (lowest MSE on the test set, cf. step 3
            # in the docstring) and move it from the inactive to the active set
            imax = np.argmin(inactive_mse_test)
idxmax = inactive[imax]
SSE = inactive_mse_test[imax]
new_explained_variance = 1 - SSE/SST
delta_exp_var = new_explained_variance - explained_variance
if delta_exp_var < self.min_explained_variance:
# abort
self.regressor.set_coefs(old_coefs)
break
else:
# set the current best parameters as old parameters and start again
old_coefs = inactive_coefs[imax]
self.regressor.set_coefs(old_coefs)
active.append(idxmax)
del X_inactive[idxmax]
exp_var_predictors[idxmax] = delta_exp_var
explained_variance = new_explained_variance
        # done; we only need to store the explained variances
self.additional_results['explained variances'] = exp_var_predictors
    def fit_active(self, X, y, active):
"""
Fits using only the columns of X whose index is in ``active``.
"""
X_active = _get_active(X, active)
self.regressor.fit(X_active, y)
self.regressor.set_expand_coefs(active, X.shape[1])
    def predict(self, X):
return self.regressor.predict(X)
    def predict_active(self, X, active):
X_active = _get_active(X, active)
return self.regressor.predict(X_active)
    def set_additional_results(self, add_results):
self.additional_results = copy(add_results)
self.explained_variances = add_results['explained variances']
    def average_additional_results(self, list_of_params):
n_params = len(list_of_params)
n_predictors = len(list_of_params[0]['explained variances'])
exp_var = np.zeros((n_params, n_predictors))
for i, p in enumerate(list_of_params):
exp_var[i,:] = p['explained variances']
add_results = {'explained variances':robust_average(exp_var)}
return add_results
    def average_coefs(self, list_of_coefs):
return self.regressor.average_coefs(list_of_coefs)
###############################################################################################
# Regressors for Forward Selection
###############################################################################################
class LinearCoefsHandlerMixin:
def get_coefs(self):
"""
Returns all fitted coefficients in the same order as ``set_coefs`` needs them.
"""
coefs = np.zeros(len(self.coef_) + 1)
coefs[0:-1] = self.coef_
coefs[-1] = self.intercept_
return coefs
def set_coefs(self, coefs):
self.coef_ = coefs[0:-1]
self.intercept_ = coefs[-1]
    def average_coefs(self, list_of_coefs):
        """
        Calculates the average of robust predictors, i.e. of those that are
        nonzero in more than 50% of the cases.
        """
coefs = np.asarray(list_of_coefs)
return robust_average(coefs)
def get_params(self):
return {}
def set_params(self, **params):
pass
def fit_active(self, X, y, active):
"""
Fits using only the columns of X whose index is in ``active``.
"""
X_active = _get_active(X, active)
self.fit(X_active, y)
self.set_expand_coefs(active, X.shape[1])
def predict_active(self, X, active):
"""
Predict using only the columns of X whose index is in ``active``.
"""
X_active = _get_active(X, active)
        return self.predict(X_active)
class MultipleLSRegression(LinearCoefsHandlerMixin):
"""
Implementation of multiple linear OLS regression to be used with
ForwardSelection and BootstrappedRegression.
The following methods are implemented:
- ``fit``
- ``predict``
- ``get_coefs``
- ``set_coefs``
- ``average_coefs``
- ``fit_active``
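
    Example
    -------
    A minimal doctest-style sketch of standalone use (normally this class is
    wrapped in ``ForwardSelection``):

    >>> reg = MultipleLSRegression()
    >>> X = np.array([[0.0], [1.0], [2.0]])
    >>> y = np.array([1.0, 3.0, 5.0])
    >>> reg.fit_active(X, y, [0])
    >>> np.round(reg.predict(X), 2)
    array([1., 3., 5.])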
"""
def __init__(self):
self.lm = LinearRegression()
    def fit(self, X, y):
if X.shape[1] == 0:
# only intercept model
self.intercept_ = np.mean(y)
else:
self.lm.fit(X, y)
    def predict(self, X):
n, m = X.shape
self.lm.coef_ = self.coef_
self.lm.intercept_ = self.intercept_
return self.lm.predict(X)
    def set_expand_coefs(self, active, n_predictors):
        """
        This will be called after ``fit``, since fit will often be called with
        only some of the predictors. It pads the fitted coefficients with
        zeros such that ``predict`` can be called with all predictors.
        """
# get a full coefficient vector back
coefs = np.zeros(n_predictors)
for i, idx in enumerate(active):
coefs[idx] = self.lm.coef_[i]
self.coef_ = coefs
# get intercept_: if len(active) == 0, the intercept was already set
if len(active) != 0:
self.intercept_ = self.lm.intercept_
# class GammaRegression(LinearCoefsHandlerMixin):
# """
# Implementation of generalized linear gamma regression.
# """
# def __init__(self, family=sm.families.Gamma()):
# self.family = family
# def fit(self, X, y):
# n, m = X.shape
# G = np.zeros((n, m+1))
# G[:,0:-1] = X
# G[:,-1] = np.ones(n)
# self.glm = sm.GLM(y, G, family=self.family)
# gamma_results = self.glm.fit()
# self.coef_ = gamma_results.params[0:-1]
# self.intercept_ = gamma_results.params[-1]
# def predict(self, X):
# n, m = X.shape
# G = np.zeros((n, m+1))
# G[:,0:-1] = X
# G[:,-1] = np.ones(n)
# params = np.zeros(len(self.coef_) + 1)
# params[0:-1] = self.coef_
# params[-1] = self.intercept_
# return self.glm.predict(params, exog=G)
# def set_expand_coefs(self, active, n_predictors):
# # get a full coefficient vector back
# coefs = np.zeros(n_predictors)
# for i, idx in enumerate(active):
# coefs[idx] = self.coef_[i]
# self.coef_ = coefs
###############################################################################################
# Some functions
###############################################################################################
def _get_active(X, active):
"""
Returns a new matrix X_active with only the columns of X whose index is in ``active``.
"""
Xnew = np.empty((X.shape[0], len(active)))
for i, idx in enumerate(active):
Xnew[:,i] = X[:,idx]
return Xnew
def robust_average(coefs):
"""
Takes the robust average of a coefficient matrix.
Parameters
----------
coefs : numpy 2d-array, n_coefs x n_predictors
Returns
-------
mean_coefs : numpy array, length n_predictors
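
    Example
    -------
    The first column is nonzero in both rows and is therefore averaged; the
    second is nonzero in only one of two rows (not more than half) and is
    set to zero:

    >>> robust_average(np.array([[1.0, 0.0],
    ...                          [3.0, 2.0]]))
    array([2., 0.])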
"""
    n_coefs, n_predictors = coefs.shape
    mean_coefs = np.zeros(n_predictors)
    for i in range(n_predictors):
        c = coefs[:,i]  # one column, i.e. one coefficient across all fits
        # robust: only average coefficients that are nonzero in more than
        # half of the fits; all others are left at zero
        if len(c[c != 0]) > 0.5*len(c):
            mean_coefs[i] = np.mean(c)
    return mean_coefs
###############################################################################################
# Easy to use classes
###############################################################################################
class BootstrappedForwardSelection(BootstrappedRegression):
"""
    This is an easy-to-use interface for ``BootstrappedRegression`` with ``ForwardSelection``.
Parameters
----------
regressor : regression object
This should be an object similar to sklearn-like regressors that
provides the methods fit and predict. Furthermore, it must also provide
the methods ``get_coefs``, ``set_coefs``, ``average_coefs``, and ``fit_active``.
        An example of this is ``MultipleLSRegression`` above.
    min_explained_variance : float, optional (default: 0.02)
        If the inclusion of the staged candidate predictor doesn't improve the
        explained variance on the test set by at least this amount, the
        selection process stops.
cv : integer or cross-validation generator (optional, default: None)
This determines how the data are split:
        * If ``cv=None``, sklearn's default cross-validation will be used
          (5-fold ``KFold`` in recent sklearn versions).
* If ``cv=n`` where ``n`` is an integer, n-fold cross-validation will be used.
* If ``cv=some_object``, where ``some_object`` implements a
``some_object.split(X, y)`` method that returns indices for training
and test set, this will be used. It is recommended to use
``YearlyBootstrapper()`` from ``stat_downscaling_tools.bootstrap``.
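
    Example
    -------
    A minimal sketch with synthetic data (``cv=5`` is only an illustrative
    choice; a ``YearlyBootstrapper()`` instance could be passed instead)::

        rng = np.random.default_rng(0)
        X = pd.DataFrame(rng.standard_normal((100, 3)), columns=["a", "b", "c"])
        y = pd.Series(2.0 * X["a"] - X["b"] + 0.1 * rng.standard_normal(100))
        model = BootstrappedForwardSelection(MultipleLSRegression(), cv=5)
        model.fit(X, y)
        y_hat = model.predict(X)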
"""
def __init__(self, regressor, min_explained_variance=0.02, cv=None):
self.regressor = ForwardSelection(regressor, min_explained_variance)
self.cv = cv
self.has_additional_results = True