# Source code for this module (heading carried over from the generated documentation page).

# -*- coding:utf-8 -*-
__author__ = 'yangjian'

from collections import defaultdict

import joblib
import numpy as np
from sklearn.metrics import get_scorer
from sklearn.metrics._scorer import _PredictScorer

from .base_ensemble import BaseEnsemble
from ..cfg import TabularCfg as cfg

class AveragingEnsemble(BaseEnsemble):
    """Ensemble that combines predictions with an unweighted mean over axis 1.

    Axis 1 of ``predictions`` is presumably the per-estimator axis (one slice
    per base estimator) -- confirm against ``BaseEnsemble``.
    """

    def __init__(self, task, estimators, need_fit=False, n_folds=5, method='soft'):
        super(AveragingEnsemble, self).__init__(task, estimators, need_fit, n_folds, method)

    def fit_predictions(self, predictions, y_true):
        """Plain averaging has nothing to learn; return ``self`` unchanged."""
        return self

    def predictions2predict(self, predictions):
        """Average the predictions and convert the result to final predictions.

        Parameters
        ----------
        predictions : array-like
            Stacked predictions. For a ``'binary'`` task with a 3-D array only
            the last slice of the final axis is kept before averaging.

        Returns
        -------
        Whatever ``self.proba2predict`` returns for the averaged values.
        """
        if len(predictions.shape) == 3 and self.task == 'binary':
            predictions = predictions[:, :, -1]
        # NOTE: `np` must stay the module-level numpy import; assigning to a
        # local called `np` here would raise UnboundLocalError.
        proba = np.mean(predictions, axis=1)
        return self.proba2predict(proba)

    def predictions2predict_proba(self, predictions):
        """Average the predictions over axis 1 and return them as probabilities.

        Parameters
        ----------
        predictions : array-like
            Stacked predictions.

        Returns
        -------
        The raw mean for a ``'regression'`` task. Otherwise the mean clipped to
        ``[0, 1]``; a 1-D result is expanded to two columns ``[1 - p, p]``.

        Raises
        ------
        ValueError
            If the task is ``'multiclass'`` and the method is ``'hard'``.
        """
        if self.task == 'multiclass' and self.method == 'hard':
            raise ValueError('Multiclass task does not support `hard` method.')

        proba = np.mean(predictions, axis=1)
        if self.task == 'regression':
            return proba

        proba = np.clip(proba, 0, 1)
        if len(proba.shape) == 1:
            # Single column of values: build the two-column [1 - p, p] layout.
            proba = np.stack([1 - proba, proba], axis=1)
        return proba
class GreedyEnsemble(BaseEnsemble):
    """Greedy ensemble selection with replacement.

    At every step each estimator is tried as the next member of the stack; the
    one whose inclusion yields the highest score is added. An estimator may be
    chosen several times, and its final weight is its share of the picks.

    References
    ----------
    Caruana, Rich, et al. "Ensemble selection from libraries of models."
    Proceedings of the twenty-first international conference on Machine learning. 2004.
    """

    def __init__(self, task, estimators, need_fit=False, n_folds=5, method='soft', random_state=9527,
                 scoring='neg_log_loss', ensemble_size=0):
        super(GreedyEnsemble, self).__init__(task, estimators, need_fit, n_folds, method,
                                             random_state=random_state)
        self.scoring = scoring
        self.scorer = get_scorer(scoring)
        # Number of selection steps; values <= 0 mean "one step per estimator".
        self.ensemble_size = ensemble_size

        # fitted
        self.weights_ = None
        self.scores_ = None
        self.best_stack_ = None
        self.hits_ = None

    def __repr__(self) -> str:
        if self.estimators is None:
            return 'no estimators'
        if self.weights_ is None:
            return 'not fitted'
        return f'{type(self).__name__}(weight={self.weights_}, scores={self.scores_})'

    def _repr_html_(self):
        """Render the fitted attributes as an HTML table via pandas."""
        import pandas as pd
        df = pd.DataFrame([('weights', self.weights_),
                           ('scores', self.scores_),
                           ('best_stack', self.best_stack_),
                           ('hits', self.hits_),
                           ('ensemble_size', self.ensemble_size)])
        return df._repr_html_()

    def _score(self, y_ture, y_preds):
        """Score every candidate in ``y_preds`` against ``y_ture`` in parallel.

        Uses the scorer's underlying score function and keyword arguments, then
        multiplies each result by the scorer's sign. Returns a list with one
        score per candidate, in input order.
        """
        fn = joblib.delayed(self.scorer._score_func)
        paral = joblib.Parallel(n_jobs=cfg.joblib_njobs, **cfg.joblib_options)
        rs = paral(fn(y_ture, p, **self.scorer._kwargs) for p in y_preds)
        return [r * self.scorer._sign for r in rs]

    def fit_predictions(self, predictions, y_true):
        """Select estimators greedily and derive their weights.

        Parameters
        ----------
        predictions : array-like
            1-D, 2-D or 3-D array. Axis 1 is indexed per estimator; presumably
            axis 0 is samples and axis 2 (if present) is classes -- confirm
            against ``BaseEnsemble``.
        y_true : array-like
            Ground truth passed to the scorer.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``predictions`` has more than three dimensions.

        Notes
        -----
        Estimators that end up with zero weight are replaced by ``None`` in
        ``self.estimators``.
        """
        ndim = len(predictions.shape)
        if ndim == 1:
            self.weights_ = [1]
            return self
        if ndim == 2:
            sum_predictions = np.zeros(predictions.shape[0], dtype=np.float64)
        elif ndim == 3:
            sum_predictions = np.zeros((predictions.shape[0], predictions.shape[2]), dtype=np.float64)
        else:
            raise ValueError(f'Wrong shape of predictions. shape:{predictions.shape}')

        n_candidates = predictions.shape[1]
        size = n_candidates if self.ensemble_size <= 0 else self.ensemble_size
        use_labels = isinstance(self.scorer, _PredictScorer) \
            and self.classes_ is not None and len(self.classes_) > 0

        scores = []
        best_stack = []
        for _ in range(size):
            preds = []
            for j in range(n_candidates):
                # Mean prediction of the current stack if estimator j were added.
                mean_predictions = (sum_predictions + predictions[:, j]) / (len(best_stack) + 1)
                if use_labels:
                    # Label-based scorer: convert to predicted labels first.
                    mean_predictions = self._indices2predict(np.argmax(mean_predictions, axis=1))
                elif self.task == 'binary' and len(mean_predictions.shape) == 2 \
                        and mean_predictions.shape[1] == 2:
                    mean_predictions = mean_predictions[:, 1]
                preds.append(mean_predictions)

            stack_scores = self._score(y_true, preds)
            # First index with the highest score (ties keep the earliest one).
            best = max(range(len(stack_scores)), key=stack_scores.__getitem__)
            scores.append(stack_scores[best])
            best_stack.append(best)
            sum_predictions += predictions[:, best]

        # Sum up each estimator's hit count.
        val_steps = len(best_stack)
        hits = defaultdict(int)
        for picked in best_stack:
            hits[picked] += 1

        n_estimators = len(self.estimators)
        weights = np.zeros(n_estimators, dtype=np.float64)
        for idx in range(n_estimators):
            if idx in hits:
                weights[idx] = hits[idx] / val_steps

        # Drop estimators that were never selected.
        for index, weight in enumerate(weights):
            if weight == 0.0:
                self.estimators[index] = None

        self.weights_ = weights.tolist()
        self.scores_ = scores
        self.hits_ = hits
        self.best_stack_ = best_stack
        return self

    def predictions2predict(self, predictions):
        """Combine predictions with the fitted weights and convert to predictions.

        For a ``'binary'`` task with 3-D input only the last slice of the final
        axis is used. Returns whatever ``self.proba2predict`` returns for the
        weighted sum over axis 1.
        """
        assert len(self.weights_) == predictions.shape[1]

        weights = np.array(self.weights_)
        if len(predictions.shape) == 3 and self.task == 'binary':
            predictions = predictions[:, :, -1]
        if len(predictions.shape) == 3:
            # Column vector so the weights broadcast across the last axis.
            weights = weights[:, np.newaxis]
        proba = np.sum(predictions * weights, axis=1)
        return self.proba2predict(proba)

    def predictions2predict_proba(self, predictions):
        """Combine predictions with the fitted weights and return probabilities.

        Returns
        -------
        The weighted sum over axis 1 for a ``'regression'`` task. Otherwise a
        2-D array: a 1-D weighted sum is expanded to ``[1 - p, p]``, and a 2-D
        weighted sum is rescaled so every row sums to 1.

        Raises
        ------
        ValueError
            If the task is ``'multiclass'`` and the method is ``'hard'``.
        """
        assert len(self.weights_) == predictions.shape[1]
        if self.task == 'multiclass' and self.method == 'hard':
            raise ValueError('Multiclass task does not support `hard` method.')

        weights = np.array(self.weights_)
        if len(predictions.shape) == 3:
            # Column vector so the weights broadcast across the last axis.
            weights = weights[:, np.newaxis]
        proba = np.sum(predictions * weights, axis=1)

        if self.task == 'regression':
            return proba

        if len(proba.shape) == 1:
            # Must be handled before any row normalisation: a 1-D array has no
            # axis 1 to sum over.
            return np.stack([1 - proba, proba], axis=1)

        # guaranteed to sum to 1.0 over classes
        return proba * (1 / proba.sum(axis=1, keepdims=True))