"""Forest of trees-based ensemble methods
|
|
|
|
Those methods include random forests and extremely randomized trees.
|
|
|
|
The module structure is the following:
|
|
|
|
- The ``BaseForest`` base class implements a common ``fit`` method for all
|
|
  the estimators in the module. The ``fit`` method of the ``BaseForest``
  class calls the ``fit`` method of each sub-estimator on random samples
  (with replacement, a.k.a. bootstrap) of the training set.
|
|
|
|
The init of the sub-estimator is further delegated to the
|
|
``BaseEnsemble`` constructor.
|
|
|
|
- The ``ForestClassifier`` and ``ForestRegressor`` base classes further
|
|
implement the prediction logic by computing an average of the predicted
|
|
outcomes of the sub-estimators.
|
|
|
|
- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived
|
|
classes provide the user with concrete implementations of
|
|
the forest ensemble method using classical, deterministic
|
|
``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
|
|
classes provide the user with concrete implementations of the
|
|
forest ensemble method using the extremely randomized trees
|
|
``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
Single and multi-output problems are both handled.
|
|
|
|
"""
|
|
|
|
# Authors: Gilles Louppe <g.louppe@gmail.com>
|
|
# Brian Holt <bdholt1@gmail.com>
|
|
# Joly Arnaud <arnaud.v.joly@gmail.com>
|
|
# Fares Hedayati <fares.hedayati@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from __future__ import division
|
|
|
|
import copy
|
|
from warnings import warn
|
|
from abc import abstractmethod
|
|
|
|
import numpy as np
|
|
|
|
from utils import string_types, iteritems
|
|
from utils import ClassifierMixin, RegressorMixin
|
|
from utils import check_random_state, check_array, compute_sample_weight
|
|
from utils import DataConversionWarning, check_is_fitted
|
|
from utils import r2_score
|
|
from utils import bincount
|
|
from utils import OneHotEncoder
|
|
from utils import BaseEstimator, MetaEstimatorMixin
|
|
|
|
from .tree import (DecisionTreeClassifier, DecisionTreeRegressor,
|
|
ExtraTreeClassifier, ExtraTreeRegressor)
|
|
from ._tree import DTYPE, DOUBLE
|
|
|
|
__all__ = ["RandomForestClassifier",
|
|
"RandomForestRegressor",
|
|
"ExtraTreesClassifier",
|
|
"ExtraTreesRegressor",
|
|
"RandomTreesEmbedding"]
|
|
|
|
MAX_INT = np.iinfo(np.int32).max
|
|
|
|
|
|
def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
|
|
verbose=0, class_weight=None):
|
|
"""Private function used to fit a single tree in parallel."""
|
|
if verbose > 1:
|
|
print("building tree %d of %d" % (tree_idx + 1, n_trees))
|
|
|
|
if forest.bootstrap:
|
|
n_samples = X.shape[0]
|
|
if sample_weight is None:
|
|
curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
|
|
else:
|
|
curr_sample_weight = sample_weight.copy()
|
|
|
|
random_state = check_random_state(tree.random_state)
|
|
indices = random_state.randint(0, n_samples, n_samples)
|
|
sample_counts = bincount(indices, minlength=n_samples)
|
|
curr_sample_weight *= sample_counts
|
|
|
|
if class_weight == 'subsample':
|
|
curr_sample_weight *= compute_sample_weight('auto', y, indices)
|
|
|
|
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
|
|
|
|
tree.indices_ = sample_counts > 0.
|
|
|
|
else:
|
|
tree.fit(X, y, sample_weight=sample_weight, check_input=False)
|
|
|
|
return tree
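
# Illustration of the bootstrap-as-weights trick used above (a sketch, not
# executed at import): instead of materialising a resampled copy of X,
# _parallel_build_trees draws indices with replacement and multiplies the
# sample weights by how often each sample was drawn. With n_samples = 5 and,
# say, indices = [0, 0, 3, 4, 4]:
#
#     sample_counts = bincount([0, 0, 3, 4, 4], minlength=5)  # [2, 0, 0, 1, 2]
#     curr_sample_weight *= sample_counts        # out-of-bag rows get weight 0
#     tree.indices_ = sample_counts > 0.         # [True, False, False, True, True]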
|
|
|
|
|
|
def _parallel_helper(obj, methodname, *args, **kwargs):
|
|
"""Private helper to workaround Python 2 pickle limitations"""
|
|
return getattr(obj, methodname)(*args, **kwargs)
|
|
|
|
def _partition_estimators(n_estimators, n_jobs):
|
|
"""Private function used to partition estimators between jobs."""
|
|
# Compute the number of jobs
|
|
# ISAAC change: single-threaded to minimize dependencies
|
|
n_jobs = min(n_jobs, n_estimators)
|
|
|
|
# Partition estimators between jobs
|
|
n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
|
|
dtype=np.int)
|
|
n_estimators_per_job[:n_estimators % n_jobs] += 1
|
|
starts = np.cumsum(n_estimators_per_job)
|
|
|
|
return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
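
# Worked example (illustrative): partitioning 10 estimators over 3 jobs gives
#
#     _partition_estimators(10, 3)  ->  (3, [4, 3, 3], [0, 4, 7, 10])
#
# i.e. job i owns the slice starts[i]:starts[i + 1]. With the single-threaded
# change noted above, callers still compute this partition but then evaluate
# all estimators in one plain loop.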
|
|
|
|
|
|
|
|
###############################################################################
|
|
def clone(estimator, safe=True):
|
|
"""Constructs a new estimator with the same parameters.
|
|
|
|
Clone does a deep copy of the model in an estimator
|
|
without actually copying attached data. It yields a new estimator
|
|
with the same parameters that has not been fit on any data.
|
|
|
|
Parameters
|
|
----------
|
|
estimator: estimator object, or list, tuple or set of objects
|
|
The estimator or group of estimators to be cloned
|
|
|
|
safe: boolean, optional
|
|
If safe is false, clone will fall back to a deepcopy on objects
|
|
that are not estimators.
|
|
|
|
"""
|
|
estimator_type = type(estimator)
|
|
# XXX: not handling dictionaries
|
|
if estimator_type in (list, tuple, set, frozenset):
|
|
return estimator_type([clone(e, safe=safe) for e in estimator])
|
|
elif not hasattr(estimator, 'get_params'):
|
|
if not safe:
|
|
return copy.deepcopy(estimator)
|
|
else:
|
|
            raise TypeError("Cannot clone object '%s' (type %s): "
                            "it does not seem to be a scikit-learn estimator, "
                            "as it does not implement a 'get_params' method."
                            % (repr(estimator), type(estimator)))
|
|
klass = estimator.__class__
|
|
new_object_params = estimator.get_params(deep=False)
|
|
for name, param in iteritems(new_object_params):
|
|
new_object_params[name] = clone(param, safe=False)
|
|
new_object = klass(**new_object_params)
|
|
params_set = new_object.get_params(deep=False)
|
|
|
|
# quick sanity check of the parameters of the clone
|
|
for name in new_object_params:
|
|
param1 = new_object_params[name]
|
|
param2 = params_set[name]
|
|
if isinstance(param1, np.ndarray):
|
|
# For most ndarrays, we do not test for complete equality
|
|
if not isinstance(param2, type(param1)):
|
|
equality_test = False
|
|
elif (param1.ndim > 0
|
|
and param1.shape[0] > 0
|
|
and isinstance(param2, np.ndarray)
|
|
and param2.ndim > 0
|
|
and param2.shape[0] > 0):
|
|
equality_test = (
|
|
param1.shape == param2.shape
|
|
and param1.dtype == param2.dtype
|
|
# We have to use '.flat' for 2D arrays
|
|
and param1.flat[0] == param2.flat[0]
|
|
and param1.flat[-1] == param2.flat[-1]
|
|
)
|
|
else:
|
|
equality_test = np.all(param1 == param2)
|
|
else:
|
|
equality_test = new_object_params[name] == params_set[name]
|
|
if not equality_test:
|
|
raise RuntimeError('Cannot clone object %s, as the constructor '
|
|
'does not seem to set parameter %s' %
|
|
(estimator, name))
|
|
|
|
return new_object
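
# Usage sketch (illustrative, not executed at import): clone() returns an
# unfitted copy that shares only the constructor parameters, e.g.
#
#     est = DecisionTreeClassifier(max_depth=3)
#     new = clone(est)                      # distinct object, never fitted
#     new.get_params()['max_depth']         # -> 3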
|
|
|
|
class BaseEnsemble(BaseEstimator, MetaEstimatorMixin):
|
|
"""Base class for all ensemble classes.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
|
|
Parameters
|
|
----------
|
|
base_estimator : object, optional (default=None)
|
|
The base estimator from which the ensemble is built.
|
|
|
|
n_estimators : integer
|
|
The number of estimators in the ensemble.
|
|
|
|
estimator_params : list of strings
|
|
The list of attributes to use as parameters when instantiating a
|
|
new base estimator. If none are given, default parameters are used.
|
|
|
|
Attributes
|
|
----------
|
|
    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.
|
|
|
|
estimators_ : list of estimators
|
|
The collection of fitted base estimators.
|
|
"""
|
|
|
|
def __init__(self, base_estimator, n_estimators=10,
|
|
estimator_params=tuple()):
|
|
# Set parameters
|
|
self.base_estimator = base_estimator
|
|
self.n_estimators = n_estimators
|
|
self.estimator_params = estimator_params
|
|
|
|
# Don't instantiate estimators now! Parameters of base_estimator might
|
|
# still change. Eg., when grid-searching with the nested object syntax.
|
|
# This needs to be filled by the derived classes.
|
|
self.estimators_ = []
|
|
|
|
def _validate_estimator(self, default=None):
|
|
"""Check the estimator and the n_estimator attribute, set the
|
|
`base_estimator_` attribute."""
|
|
if self.n_estimators <= 0:
|
|
raise ValueError("n_estimators must be greater than zero, "
|
|
"got {0}.".format(self.n_estimators))
|
|
|
|
if self.base_estimator is not None:
|
|
self.base_estimator_ = self.base_estimator
|
|
else:
|
|
self.base_estimator_ = default
|
|
|
|
if self.base_estimator_ is None:
|
|
raise ValueError("base_estimator cannot be None")
|
|
|
|
def _make_estimator(self, append=True):
|
|
"""Make and configure a copy of the `base_estimator_` attribute.
|
|
|
|
Warning: This method should be used to properly instantiate new
|
|
sub-estimators.
|
|
"""
|
|
estimator = clone(self.base_estimator_)
|
|
estimator.set_params(**dict((p, getattr(self, p))
|
|
for p in self.estimator_params))
|
|
|
|
if append:
|
|
self.estimators_.append(estimator)
|
|
|
|
return estimator
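
    # Sketch of how estimator_params propagate (illustrative): for a forest
    # whose estimator_params include "max_depth", _make_estimator clones
    # base_estimator_ and then copies the ensemble's own attribute onto the
    # new tree, roughly:
    #
    #     tree = clone(self.base_estimator_)
    #     tree.set_params(max_depth=self.max_depth, ...)
    #
    # so per-tree hyperparameters are always read from the ensemble object.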
|
|
|
|
def __len__(self):
|
|
"""Returns the number of estimators in the ensemble."""
|
|
return len(self.estimators_)
|
|
|
|
def __getitem__(self, index):
|
|
"""Returns the index'th estimator in the ensemble."""
|
|
return self.estimators_[index]
|
|
|
|
def __iter__(self):
|
|
"""Returns iterator over estimators in the ensemble."""
|
|
return iter(self.estimators_)
|
|
|
|
class BaseForest(BaseEnsemble):
|
|
"""Base class for forests of trees.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=10,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None):
|
|
super(BaseForest, self).__init__(
|
|
base_estimator=base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params)
|
|
|
|
self.bootstrap = bootstrap
|
|
self.oob_score = oob_score
|
|
self.n_jobs = n_jobs
|
|
self.random_state = random_state
|
|
self.verbose = verbose
|
|
self.warm_start = warm_start
|
|
self.class_weight = class_weight
|
|
|
|
def apply(self, X):
|
|
"""Apply trees in the forest to X, return leaf indices.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix, shape = [n_samples, n_features]
|
|
The input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
X_leaves : array_like, shape = [n_samples, n_estimators]
|
|
For each datapoint x in X and for each tree in the forest,
|
|
return the index of the leaf x ends up in.
|
|
"""
|
|
check_is_fitted(self, 'n_outputs_')
|
|
|
|
X = check_array(X, dtype=DTYPE, accept_sparse="csr")
|
|
|
|
results = [tree.tree_.apply(X) for tree in self.estimators_]
|
|
|
|
return np.array(results).T
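
    # Shape sketch (illustrative): for a fitted forest with 3 trees and an X
    # of 2 samples, apply(X) returns a (2, 3) array such as
    #
    #     [[5, 2, 7],
    #      [1, 2, 4]]
    #
    # where entry [i, j] is the leaf node id that sample i reaches in tree j
    # (the ids shown here are made up).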
|
|
|
|
def fit(self, X, y, sample_weight=None):
|
|
"""Build a forest of trees from the training set (X, y).
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix of shape = [n_samples, n_features]
|
|
The training input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csc_matrix``.
|
|
|
|
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
|
|
The target values (class labels in classification, real numbers in
|
|
regression).
|
|
|
|
sample_weight : array-like, shape = [n_samples] or None
|
|
Sample weights. If None, then samples are equally weighted. Splits
|
|
that would create child nodes with net zero or negative weight are
|
|
ignored while searching for a split in each node. In the case of
|
|
classification, splits are also ignored if they would result in any
|
|
single class carrying a negative weight in either child node.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
Returns self.
|
|
"""
|
|
# Validate or convert input data
|
|
X = check_array(X, dtype=DTYPE)
|
|
|
|
# Remap output
|
|
n_samples, self.n_features_ = X.shape
|
|
|
|
y = np.atleast_1d(y)
|
|
if y.ndim == 2 and y.shape[1] == 1:
|
|
warn("A column-vector y was passed when a 1d array was"
|
|
" expected. Please change the shape of y to "
|
|
"(n_samples,), for example using ravel().",
|
|
DataConversionWarning, stacklevel=2)
|
|
|
|
if y.ndim == 1:
|
|
            # reshape is necessary to preserve the data contiguity, which
            # indexing with [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))
|
|
|
|
self.n_outputs_ = y.shape[1]
|
|
|
|
y, expanded_class_weight = self._validate_y_class_weight(y)
|
|
|
|
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
|
|
y = np.ascontiguousarray(y, dtype=DOUBLE)
|
|
|
|
if expanded_class_weight is not None:
|
|
if sample_weight is not None:
|
|
sample_weight = sample_weight * expanded_class_weight
|
|
else:
|
|
sample_weight = expanded_class_weight
|
|
|
|
# Check parameters
|
|
self._validate_estimator()
|
|
|
|
if not self.bootstrap and self.oob_score:
|
|
raise ValueError("Out of bag estimation only available"
|
|
" if bootstrap=True")
|
|
|
|
random_state = check_random_state(self.random_state)
|
|
|
|
if not self.warm_start:
|
|
# Free allocated memory, if any
|
|
self.estimators_ = []
|
|
|
|
n_more_estimators = self.n_estimators - len(self.estimators_)
|
|
|
|
if n_more_estimators < 0:
|
|
raise ValueError('n_estimators=%d must be larger or equal to '
|
|
'len(estimators_)=%d when warm_start==True'
|
|
% (self.n_estimators, len(self.estimators_)))
|
|
|
|
elif n_more_estimators == 0:
|
|
warn("Warm-start fitting without increasing n_estimators does not "
|
|
"fit new trees.")
|
|
else:
|
|
if self.warm_start and len(self.estimators_) > 0:
|
|
# We draw from the random state to get the random state we
|
|
# would have got if we hadn't used a warm_start.
|
|
random_state.randint(MAX_INT, size=len(self.estimators_))
|
|
|
|
trees = []
|
|
for i in range(n_more_estimators):
|
|
tree = self._make_estimator(append=False)
|
|
tree.set_params(random_state=random_state.randint(MAX_INT))
|
|
trees.append(tree)
|
|
|
|
            # Build the trees sequentially. (Upstream scikit-learn runs this
            # loop in parallel with a threading backend; this single-threaded
            # variant keeps the dependencies minimal.)
            trees = [_parallel_build_trees(
                t, self, X, y, sample_weight, i, len(trees),
                verbose=self.verbose, class_weight=self.class_weight)
                for i, t in enumerate(trees)]
|
|
|
|
# Collect newly grown trees
|
|
self.estimators_.extend(trees)
|
|
|
|
if self.oob_score:
|
|
self._set_oob_score(X, y)
|
|
|
|
# Decapsulate classes_ attributes
|
|
if hasattr(self, "classes_") and self.n_outputs_ == 1:
|
|
self.n_classes_ = self.n_classes_[0]
|
|
self.classes_ = self.classes_[0]
|
|
|
|
return self
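
    # Warm-start sketch (illustrative; assumes X, y are available): refitting
    # with warm_start keeps the existing trees and only adds the missing ones:
    #
    #     forest = RandomForestClassifier(n_estimators=10, warm_start=True)
    #     forest.fit(X, y)                     # builds 10 trees
    #     forest.set_params(n_estimators=15)
    #     forest.fit(X, y)                     # builds 5 more, reusing the first 10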
|
|
|
|
@abstractmethod
|
|
def _set_oob_score(self, X, y):
|
|
"""Calculate out of bag predictions and score."""
|
|
|
|
def _validate_y_class_weight(self, y):
|
|
# Default implementation
|
|
return y, None
|
|
|
|
@property
|
|
def feature_importances_(self):
|
|
"""Return the feature importances (the higher, the more important the
|
|
feature).
|
|
|
|
Returns
|
|
-------
|
|
feature_importances_ : array, shape = [n_features]
|
|
"""
|
|
check_is_fitted(self, 'n_outputs_')
|
|
|
|
if self.estimators_ is None or len(self.estimators_) == 0:
|
|
raise ValueError("Estimator not fitted, "
|
|
"call `fit` before `feature_importances_`.")
|
|
|
|
|
|
all_importances = [tree.feature_importances_ for tree in self.estimators_]
|
|
|
|
return sum(all_importances) / len(self.estimators_)
|
|
|
|
|
|
class ForestClassifier(BaseForest, ClassifierMixin):
|
|
"""Base class for forest of trees-based classifiers.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=10,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None):
|
|
|
|
super(ForestClassifier, self).__init__(
|
|
base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params,
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight)
|
|
|
|
def _set_oob_score(self, X, y):
|
|
"""Compute out-of-bag score"""
|
|
n_classes_ = self.n_classes_
|
|
n_samples = y.shape[0]
|
|
|
|
oob_decision_function = []
|
|
oob_score = 0.0
|
|
predictions = []
|
|
|
|
for k in range(self.n_outputs_):
|
|
predictions.append(np.zeros((n_samples, n_classes_[k])))
|
|
|
|
sample_indices = np.arange(n_samples)
|
|
for estimator in self.estimators_:
|
|
mask = np.ones(n_samples, dtype=np.bool)
|
|
mask[estimator.indices_] = False
|
|
mask_indices = sample_indices[mask]
|
|
p_estimator = estimator.predict_proba(X[mask_indices, :],
|
|
check_input=False)
|
|
|
|
if self.n_outputs_ == 1:
|
|
p_estimator = [p_estimator]
|
|
|
|
for k in range(self.n_outputs_):
|
|
predictions[k][mask_indices, :] += p_estimator[k]
|
|
|
|
for k in range(self.n_outputs_):
|
|
if (predictions[k].sum(axis=1) == 0).any():
|
|
warn("Some inputs do not have OOB scores. "
|
|
"This probably means too few trees were used "
|
|
"to compute any reliable oob estimates.")
|
|
|
|
decision = (predictions[k] /
|
|
predictions[k].sum(axis=1)[:, np.newaxis])
|
|
oob_decision_function.append(decision)
|
|
oob_score += np.mean(y[:, k] ==
|
|
np.argmax(predictions[k], axis=1), axis=0)
|
|
|
|
if self.n_outputs_ == 1:
|
|
self.oob_decision_function_ = oob_decision_function[0]
|
|
else:
|
|
self.oob_decision_function_ = oob_decision_function
|
|
|
|
self.oob_score_ = oob_score / self.n_outputs_
|
|
|
|
def _validate_y_class_weight(self, y):
|
|
y = np.copy(y)
|
|
expanded_class_weight = None
|
|
|
|
if self.class_weight is not None:
|
|
y_original = np.copy(y)
|
|
|
|
self.classes_ = []
|
|
self.n_classes_ = []
|
|
|
|
for k in range(self.n_outputs_):
|
|
classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
|
|
self.classes_.append(classes_k)
|
|
self.n_classes_.append(classes_k.shape[0])
|
|
|
|
if self.class_weight is not None:
|
|
valid_presets = ('auto', 'subsample')
|
|
if isinstance(self.class_weight, string_types):
|
|
if self.class_weight not in valid_presets:
|
|
raise ValueError('Valid presets for class_weight include '
|
|
'"auto" and "subsample". Given "%s".'
|
|
% self.class_weight)
|
|
if self.warm_start:
|
|
warn('class_weight presets "auto" or "subsample" are '
|
|
'not recommended for warm_start if the fitted data '
|
|
'differs from the full dataset. In order to use '
|
|
'"auto" weights, use compute_class_weight("auto", '
|
|
'classes, y). In place of y you can use a large '
|
|
'enough sample of the full training set target to '
|
|
'properly estimate the class frequency '
|
|
'distributions. Pass the resulting weights as the '
|
|
'class_weight parameter.')
|
|
|
|
if self.class_weight != 'subsample' or not self.bootstrap:
|
|
if self.class_weight == 'subsample':
|
|
class_weight = 'auto'
|
|
else:
|
|
class_weight = self.class_weight
|
|
expanded_class_weight = compute_sample_weight(class_weight,
|
|
y_original)
|
|
|
|
return y, expanded_class_weight
|
|
|
|
def predict(self, X):
|
|
"""Predict class for X.
|
|
|
|
The predicted class of an input sample is computed as the majority
|
|
prediction of the trees in the forest.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix of shape = [n_samples, n_features]
|
|
The input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
y : array of shape = [n_samples] or [n_samples, n_outputs]
|
|
The predicted classes.
|
|
"""
|
|
check_is_fitted(self, 'n_outputs_')
|
|
|
|
        # ensure_2d=False because there are actually unit tests checking
        # that we fail for 1d input.
        X = check_array(X, ensure_2d=False, accept_sparse="csr")
|
|
proba = self.predict_proba(X)
|
|
|
|
if self.n_outputs_ == 1:
|
|
return self.classes_.take(np.argmax(proba, axis=1), axis=0)
|
|
|
|
else:
|
|
n_samples = proba[0].shape[0]
|
|
predictions = np.zeros((n_samples, self.n_outputs_))
|
|
|
|
for k in range(self.n_outputs_):
|
|
predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
|
|
axis=1),
|
|
axis=0)
|
|
|
|
return predictions
|
|
|
|
def predict_proba(self, X):
|
|
"""Predict class probabilities for X.
|
|
|
|
        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest.
        The class probability of a single tree is the fraction of samples of
        the same class in a leaf.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix of shape = [n_samples, n_features]
|
|
The input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
p : array of shape = [n_samples, n_classes], or a list of n_outputs
|
|
such arrays if n_outputs > 1.
|
|
The class probabilities of the input samples. The order of the
|
|
classes corresponds to that in the attribute `classes_`.
|
|
"""
|
|
check_is_fitted(self, 'n_outputs_')
|
|
|
|
# Check data
|
|
X = check_array(X, dtype=DTYPE, accept_sparse="csr")
|
|
|
|
# Assign chunk of trees to jobs
|
|
n_jobs, n_trees, starts = _partition_estimators(self.n_estimators,
|
|
self.n_jobs)
|
|
|
|
# Parallel loop
|
|
all_proba = [e.predict_proba(X, check_input=False) for e in self.estimators_]
|
|
|
|
# Reduce
|
|
proba = all_proba[0]
|
|
|
|
if self.n_outputs_ == 1:
|
|
for j in range(1, len(all_proba)):
|
|
proba += all_proba[j]
|
|
|
|
proba /= len(self.estimators_)
|
|
|
|
else:
|
|
for j in range(1, len(all_proba)):
|
|
for k in range(self.n_outputs_):
|
|
proba[k] += all_proba[j][k]
|
|
|
|
for k in range(self.n_outputs_):
|
|
proba[k] /= self.n_estimators
|
|
|
|
return proba
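
    # Worked example (single output, two classes, three trees): if the
    # per-tree probabilities for one sample are [1.0, 0.0], [0.5, 0.5] and
    # [0.75, 0.25], predict_proba returns their mean, [0.75, 0.25], and
    # predict then picks the class with the largest averaged probability.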
|
|
|
|
def predict_log_proba(self, X):
|
|
"""Predict class log-probabilities for X.
|
|
|
|
        The predicted class log-probabilities of an input sample are computed
        as the log of the mean predicted class probabilities of the trees in
        the forest.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix of shape = [n_samples, n_features]
|
|
The input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
p : array of shape = [n_samples, n_classes], or a list of n_outputs
|
|
such arrays if n_outputs > 1.
|
|
The class probabilities of the input samples. The order of the
|
|
classes corresponds to that in the attribute `classes_`.
|
|
"""
|
|
proba = self.predict_proba(X)
|
|
|
|
if self.n_outputs_ == 1:
|
|
return np.log(proba)
|
|
|
|
else:
|
|
for k in range(self.n_outputs_):
|
|
proba[k] = np.log(proba[k])
|
|
|
|
return proba
|
|
|
|
|
|
class ForestRegressor(BaseForest, RegressorMixin):
|
|
"""Base class for forest of trees-based regressors.
|
|
|
|
Warning: This class should not be used directly. Use derived classes
|
|
instead.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self,
|
|
base_estimator,
|
|
n_estimators=10,
|
|
estimator_params=tuple(),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False):
|
|
super(ForestRegressor, self).__init__(
|
|
base_estimator,
|
|
n_estimators=n_estimators,
|
|
estimator_params=estimator_params,
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start)
|
|
|
|
def predict(self, X):
|
|
"""Predict regression target for X.
|
|
|
|
The predicted regression target of an input sample is computed as the
|
|
mean predicted regression targets of the trees in the forest.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix of shape = [n_samples, n_features]
|
|
The input samples. Internally, it will be converted to
|
|
``dtype=np.float32`` and if a sparse matrix is provided
|
|
to a sparse ``csr_matrix``.
|
|
|
|
Returns
|
|
-------
|
|
y : array of shape = [n_samples] or [n_samples, n_outputs]
|
|
The predicted values.
|
|
"""
|
|
check_is_fitted(self, 'n_outputs_')
|
|
|
|
# Check data
|
|
X = check_array(X, dtype=DTYPE)
|
|
|
|
|
|
# Assign chunk of trees to jobs
|
|
n_jobs, n_trees, starts = _partition_estimators(self.n_estimators,
|
|
self.n_jobs)
|
|
|
|
# Parallel loop
|
|
all_y_hat = [e.predict(X, check_input=False) for e in self.estimators_]
|
|
|
|
# Reduce
|
|
y_hat = sum(all_y_hat) / len(self.estimators_)
|
|
|
|
return y_hat
|
|
|
|
def _set_oob_score(self, X, y):
|
|
"""Compute out-of-bag scores"""
|
|
n_samples = y.shape[0]
|
|
|
|
predictions = np.zeros((n_samples, self.n_outputs_))
|
|
n_predictions = np.zeros((n_samples, self.n_outputs_))
|
|
|
|
sample_indices = np.arange(n_samples)
|
|
for estimator in self.estimators_:
|
|
mask = np.ones(n_samples, dtype=np.bool)
|
|
mask[estimator.indices_] = False
|
|
mask_indices = sample_indices[mask]
|
|
p_estimator = estimator.predict(X[mask_indices, :], check_input=False)
|
|
|
|
if self.n_outputs_ == 1:
|
|
p_estimator = p_estimator[:, np.newaxis]
|
|
|
|
predictions[mask_indices, :] += p_estimator
|
|
n_predictions[mask_indices, :] += 1
|
|
|
|
if (n_predictions == 0).any():
|
|
warn("Some inputs do not have OOB scores. "
|
|
"This probably means too few trees were used "
|
|
"to compute any reliable oob estimates.")
|
|
n_predictions[n_predictions == 0] = 1
|
|
|
|
predictions /= n_predictions
|
|
self.oob_prediction_ = predictions
|
|
|
|
if self.n_outputs_ == 1:
|
|
self.oob_prediction_ = \
|
|
self.oob_prediction_.reshape((n_samples, ))
|
|
|
|
self.oob_score_ = 0.0
|
|
|
|
for k in range(self.n_outputs_):
|
|
self.oob_score_ += r2_score(y[:, k],
|
|
predictions[:, k])
|
|
|
|
self.oob_score_ /= self.n_outputs_
|
|
|
|
|
|
class RandomForestClassifier(ForestClassifier):
|
|
"""A random forest classifier.
|
|
|
|
    A random forest is a meta estimator that fits a number of decision tree
    classifiers on various sub-samples of the dataset and uses averaging to
    improve the predictive accuracy and control over-fitting.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : integer, optional (default=10)
|
|
The number of trees in the forest.
|
|
|
|
criterion : string, optional (default="gini")
|
|
The function to measure the quality of a split. Supported criteria are
|
|
"gini" for the Gini impurity and "entropy" for the information gain.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_features : int, float, string or None, optional (default="auto")
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a percentage and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=sqrt(n_features)`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_depth : integer or None, optional (default=None)
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
Ignored if ``max_leaf_nodes`` is not None.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_samples_split : integer, optional (default=2)
|
|
The minimum number of samples required to split an internal node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
    min_samples_leaf : integer, optional (default=1)
        The minimum number of samples in newly created leaves. A split is
        discarded if, after the split, one of the leaves would contain fewer
        than ``min_samples_leaf`` samples.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the input samples required to be at a
|
|
leaf node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
If not None then ``max_depth`` will be ignored.
|
|
Note: this parameter is tree-specific.
|
|
|
|
bootstrap : boolean, optional (default=True)
|
|
Whether bootstrap samples are used when building trees.
|
|
|
|
oob_score : bool
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization error.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
class_weight : dict, list of dicts, "auto", "subsample" or None, optional
|
|
|
|
Weights associated with classes in the form ``{class_label: weight}``.
|
|
If not given, all classes are supposed to have weight one. For
|
|
multi-output problems, a list of dicts can be provided in the same
|
|
order as the columns of y.
|
|
|
|
The "auto" mode uses the values of y to automatically adjust
|
|
weights inversely proportional to class frequencies in the input data.
|
|
|
|
The "subsample" mode is the same as "auto" except that weights are
|
|
computed based on the bootstrap sample for every tree grown.
|
|
|
|
For multi-output, the weights of each column of y will be multiplied.
|
|
|
|
Note that these weights will be multiplied with sample_weight (passed
|
|
through the fit method) if sample_weight is specified.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
    classes_ : array of shape = [n_classes] or a list of such arrays
        The class labels (single output problem), or a list of arrays of
        class labels (multi-output problem).
|
|
|
|
n_classes_ : int or list
|
|
The number of classes (single output problem), or a list containing the
|
|
number of classes for each output (multi-output problem).
|
|
|
|
feature_importances_ : array of shape = [n_features]
|
|
The feature importances (the higher, the more important the feature).
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
|
|
oob_decision_function_ : array of shape = [n_samples, n_classes]
|
|
Decision function computed with out-of-bag estimate on the training
|
|
set. If n_estimators is small it might be possible that a data point
|
|
was never left out during the bootstrap. In this case,
|
|
`oob_decision_function_` might contain NaN.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
|
|
|
See also
|
|
--------
|
|
DecisionTreeClassifier, ExtraTreesClassifier
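
    Examples
    --------
    A minimal usage sketch (illustrative only; the exact prediction depends
    on the data and on ``random_state``)::

        clf = RandomForestClassifier(n_estimators=25, random_state=0)
        clf.fit([[0, 0], [1, 1], [0, 1], [1, 0]], [0, 0, 1, 1])
        clf.predict([[0.8, 0.2]])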
|
|
"""
|
|
def __init__(self,
|
|
n_estimators=10,
|
|
criterion="gini",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
bootstrap=True,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None):
|
|
super(RandomForestClassifier, self).__init__(
|
|
base_estimator=DecisionTreeClassifier(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"random_state"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
|
|
|
|
class RandomForestRegressor(ForestRegressor):
|
|
"""A random forest regressor.
|
|
|
|
    A random forest is a meta estimator that fits a number of decision tree
    regressors on various sub-samples of the dataset and uses averaging
    to improve the predictive accuracy and control over-fitting.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : integer, optional (default=10)
|
|
The number of trees in the forest.
|
|
|
|
criterion : string, optional (default="mse")
|
|
The function to measure the quality of a split. The only supported
|
|
criterion is "mse" for the mean squared error.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_features : int, float, string or None, optional (default="auto")
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a percentage and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=n_features`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_depth : integer or None, optional (default=None)
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
Ignored if ``max_leaf_nodes`` is not None.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_samples_split : integer, optional (default=2)
|
|
The minimum number of samples required to split an internal node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
    min_samples_leaf : integer, optional (default=1)
        The minimum number of samples in newly created leaves. A split is
        discarded if, after the split, one of the leaves would contain fewer
        than ``min_samples_leaf`` samples.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the input samples required to be at a
|
|
leaf node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
If not None then ``max_depth`` will be ignored.
|
|
Note: this parameter is tree-specific.
|
|
|
|
bootstrap : boolean, optional (default=True)
|
|
Whether bootstrap samples are used when building trees.
|
|
|
|
    oob_score : bool
        Whether to use out-of-bag samples to estimate
        the generalization error.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeRegressor
|
|
The collection of fitted sub-estimators.
|
|
|
|
feature_importances_ : array of shape = [n_features]
|
|
The feature importances (the higher, the more important the feature).
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
|
|
oob_prediction_ : array of shape = [n_samples]
|
|
Prediction computed with out-of-bag estimate on the training set.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
|
|
|
See also
|
|
--------
|
|
DecisionTreeRegressor, ExtraTreesRegressor
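
    Examples
    --------
    A minimal usage sketch (illustrative only; the exact value returned
    depends on the data and on ``random_state``)::

        reg = RandomForestRegressor(n_estimators=25, random_state=0)
        reg.fit([[0, 0], [1, 1], [2, 2], [3, 3]], [0.0, 1.0, 2.0, 3.0])
        reg.predict([[1.5, 1.5]])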
|
|
"""
|
|
def __init__(self,
|
|
n_estimators=10,
|
|
criterion="mse",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
bootstrap=True,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False):
|
|
super(RandomForestRegressor, self).__init__(
|
|
base_estimator=DecisionTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"random_state"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
|
|
|
|
class ExtraTreesClassifier(ForestClassifier):
|
|
"""An extra-trees classifier.
|
|
|
|
    This class implements a meta estimator that fits a number of
    randomized decision trees (a.k.a. extra-trees) on various sub-samples
    of the dataset and uses averaging to improve the predictive accuracy
    and control over-fitting.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : integer, optional (default=10)
|
|
The number of trees in the forest.
|
|
|
|
criterion : string, optional (default="gini")
|
|
The function to measure the quality of a split. Supported criteria are
|
|
"gini" for the Gini impurity and "entropy" for the information gain.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_features : int, float, string or None, optional (default="auto")
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a percentage and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=sqrt(n_features)`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_depth : integer or None, optional (default=None)
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
Ignored if ``max_leaf_nodes`` is not None.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_samples_split : integer, optional (default=2)
|
|
The minimum number of samples required to split an internal node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
    min_samples_leaf : integer, optional (default=1)
        The minimum number of samples in newly created leaves. A split is
        discarded if, after the split, one of the leaves would contain fewer
        than ``min_samples_leaf`` samples.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the input samples required to be at a
|
|
leaf node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
If not None then ``max_depth`` will be ignored.
|
|
Note: this parameter is tree-specific.
|
|
|
|
bootstrap : boolean, optional (default=False)
|
|
Whether bootstrap samples are used when building trees.
|
|
|
|
oob_score : bool
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization error.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
class_weight : dict, list of dicts, "auto", "subsample" or None, optional
|
|
|
|
Weights associated with classes in the form ``{class_label: weight}``.
|
|
If not given, all classes are supposed to have weight one. For
|
|
multi-output problems, a list of dicts can be provided in the same
|
|
order as the columns of y.
|
|
|
|
The "auto" mode uses the values of y to automatically adjust
|
|
weights inversely proportional to class frequencies in the input data.
|
|
|
|
The "subsample" mode is the same as "auto" except that weights are
|
|
computed based on the bootstrap sample for every tree grown.
|
|
|
|
For multi-output, the weights of each column of y will be multiplied.
|
|
|
|
Note that these weights will be multiplied with sample_weight (passed
|
|
through the fit method) if sample_weight is specified.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
    classes_ : array of shape = [n_classes] or a list of such arrays
        The class labels (single output problem), or a list of arrays of
        class labels (multi-output problem).
|
|
|
|
n_classes_ : int or list
|
|
The number of classes (single output problem), or a list containing the
|
|
number of classes for each output (multi-output problem).
|
|
|
|
feature_importances_ : array of shape = [n_features]
|
|
The feature importances (the higher, the more important the feature).
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
|
|
oob_decision_function_ : array of shape = [n_samples, n_classes]
|
|
Decision function computed with out-of-bag estimate on the training
|
|
set. If n_estimators is small it might be possible that a data point
|
|
was never left out during the bootstrap. In this case,
|
|
`oob_decision_function_` might contain NaN.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
|
|
Machine Learning, 63(1), 3-42, 2006.
|
|
|
|
See also
|
|
--------
|
|
sklearn.tree.ExtraTreeClassifier : Base classifier for this ensemble.
|
|
RandomForestClassifier : Ensemble Classifier based on trees with optimal
|
|
splits.
|
|
"""
|
|
def __init__(self,
|
|
n_estimators=10,
|
|
criterion="gini",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False,
|
|
class_weight=None):
|
|
super(ExtraTreesClassifier, self).__init__(
|
|
base_estimator=ExtraTreeClassifier(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes", "random_state"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start,
|
|
class_weight=class_weight)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
|
|
|
|
class ExtraTreesRegressor(ForestRegressor):
|
|
"""An extra-trees regressor.
|
|
|
|
    This class implements a meta estimator that fits a number of
    randomized decision trees (a.k.a. extra-trees) on various sub-samples
    of the dataset and uses averaging to improve the predictive accuracy
    and control over-fitting.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : integer, optional (default=10)
|
|
The number of trees in the forest.
|
|
|
|
criterion : string, optional (default="mse")
|
|
The function to measure the quality of a split. The only supported
|
|
criterion is "mse" for the mean squared error.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_features : int, float, string or None, optional (default="auto")
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a percentage and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=n_features`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)`.
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_depth : integer or None, optional (default=None)
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
Ignored if ``max_leaf_nodes`` is not None.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_samples_split : integer, optional (default=2)
|
|
The minimum number of samples required to split an internal node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
    min_samples_leaf : integer, optional (default=1)
        The minimum number of samples in newly created leaves. A split is
        discarded if, after the split, one of the leaves would contain fewer
        than ``min_samples_leaf`` samples.
|
|
Note: this parameter is tree-specific.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the input samples required to be at a
|
|
leaf node.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
If not None then ``max_depth`` will be ignored.
|
|
Note: this parameter is tree-specific.
|
|
|
|
    bootstrap : boolean, optional (default=False)
        Whether bootstrap samples are used when building trees.
|
|
|
|
oob_score : bool
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization error.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeRegressor
|
|
The collection of fitted sub-estimators.
|
|
|
|
feature_importances_ : array of shape = [n_features]
|
|
The feature importances (the higher, the more important the feature).
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
|
|
oob_prediction_ : array of shape = [n_samples]
|
|
Prediction computed with out-of-bag estimate on the training set.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
|
|
Machine Learning, 63(1), 3-42, 2006.
|
|
|
|
See also
|
|
--------
|
|
sklearn.tree.ExtraTreeRegressor: Base estimator for this ensemble.
|
|
RandomForestRegressor: Ensemble regressor using trees with optimal splits.
|
|
"""
|
|
def __init__(self,
|
|
n_estimators=10,
|
|
criterion="mse",
|
|
max_depth=None,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_features="auto",
|
|
max_leaf_nodes=None,
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False):
|
|
super(ExtraTreesRegressor, self).__init__(
|
|
base_estimator=ExtraTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"random_state"),
|
|
bootstrap=bootstrap,
|
|
oob_score=oob_score,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start)
|
|
|
|
self.criterion = criterion
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = max_features
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
|
|
|
|
class RandomTreesEmbedding(BaseForest):
|
|
"""An ensemble of totally random trees.
|
|
|
|
An unsupervised transformation of a dataset to a high-dimensional
|
|
sparse representation. A datapoint is coded according to which leaf of
|
|
each tree it is sorted into. Using a one-hot encoding of the leaves,
|
|
this leads to a binary coding with as many ones as there are trees in
|
|
the forest.
|
|
|
|
The dimensionality of the resulting representation is
|
|
``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,
|
|
the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : int
|
|
Number of trees in the forest.
|
|
|
|
max_depth : int
|
|
The maximum depth of each tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
Ignored if ``max_leaf_nodes`` is not None.
|
|
|
|
min_samples_split : integer, optional (default=2)
|
|
The minimum number of samples required to split an internal node.
|
|
|
|
    min_samples_leaf : integer, optional (default=1)
        The minimum number of samples in newly created leaves. A split is
        discarded if, after the split, one of the leaves would contain fewer
        than ``min_samples_leaf`` samples.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the input samples required to be at a
|
|
leaf node.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
If not None then ``max_depth`` will be ignored.
|
|
|
|
sparse_output : bool, optional (default=True)
|
|
Whether or not to return a sparse CSR matrix, as default behavior,
|
|
or to return a dense array compatible with dense pipeline operators.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
References
|
|
----------
|
|
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
|
|
Machine Learning, 63(1), 3-42, 2006.
|
|
.. [2] Moosmann, F. and Triggs, B. and Jurie, F. "Fast discriminative
|
|
visual codebooks using randomized clustering forests"
|
|
NIPS 2007
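
    Examples
    --------
    A minimal usage sketch (illustrative; ``X`` is any 2-d array of samples
    and the number of output columns depends on the leaves actually grown)::

        hasher = RandomTreesEmbedding(n_estimators=10, random_state=0)
        X_transformed = hasher.fit_transform(X)
        # X_transformed is a sparse one-hot matrix of shape (n_samples, n_out)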
|
|
|
|
"""
|
|
|
|
def __init__(self,
|
|
n_estimators=10,
|
|
max_depth=5,
|
|
min_samples_split=2,
|
|
min_samples_leaf=1,
|
|
min_weight_fraction_leaf=0.,
|
|
max_leaf_nodes=None,
|
|
sparse_output=True,
|
|
n_jobs=1,
|
|
random_state=None,
|
|
verbose=0,
|
|
warm_start=False):
|
|
super(RandomTreesEmbedding, self).__init__(
|
|
base_estimator=ExtraTreeRegressor(),
|
|
n_estimators=n_estimators,
|
|
estimator_params=("criterion", "max_depth", "min_samples_split",
|
|
"min_samples_leaf", "min_weight_fraction_leaf",
|
|
"max_features", "max_leaf_nodes",
|
|
"random_state"),
|
|
bootstrap=False,
|
|
oob_score=False,
|
|
n_jobs=n_jobs,
|
|
random_state=random_state,
|
|
verbose=verbose,
|
|
warm_start=warm_start)
|
|
|
|
self.criterion = 'mse'
|
|
self.max_depth = max_depth
|
|
self.min_samples_split = min_samples_split
|
|
self.min_samples_leaf = min_samples_leaf
|
|
self.min_weight_fraction_leaf = min_weight_fraction_leaf
|
|
self.max_features = 1
|
|
self.max_leaf_nodes = max_leaf_nodes
|
|
self.sparse_output = sparse_output
|
|
|
|
def _set_oob_score(self, X, y):
|
|
raise NotImplementedError("OOB score not supported by tree embedding")
|
|
|
|
def fit(self, X, y=None, sample_weight=None):
|
|
"""Fit estimator.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix, shape=(n_samples, n_features)
|
|
The input samples. Use ``dtype=np.float32`` for maximum
|
|
efficiency. Sparse matrices are also supported, use sparse
|
|
``csc_matrix`` for maximum efficiency.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
Returns self.
|
|
|
|
"""
|
|
self.fit_transform(X, y, sample_weight=sample_weight)
|
|
return self
|
|
|
|
def fit_transform(self, X, y=None, sample_weight=None):
|
|
"""Fit estimator and transform dataset.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix, shape=(n_samples, n_features)
|
|
Input data used to build forests. Use ``dtype=np.float32`` for
|
|
maximum efficiency.
|
|
|
|
Returns
|
|
-------
|
|
X_transformed : sparse matrix, shape=(n_samples, n_out)
|
|
Transformed dataset.
|
|
"""
|
|
        # ensure_2d=False because there are actually unit tests checking
        # that we fail for 1d input.
        X = check_array(X, ensure_2d=False)
|
|
|
|
rnd = check_random_state(self.random_state)
|
|
y = rnd.uniform(size=X.shape[0])
|
|
super(RandomTreesEmbedding, self).fit(X, y,
|
|
sample_weight=sample_weight)
|
|
|
|
self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
|
|
return self.one_hot_encoder_.fit_transform(self.apply(X))
|
|
|
|
def transform(self, X):
|
|
"""Transform dataset.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like or sparse matrix, shape=(n_samples, n_features)
|
|
Input data to be transformed. Use ``dtype=np.float32`` for maximum
|
|
efficiency. Sparse matrices are also supported, use sparse
|
|
``csr_matrix`` for maximum efficiency.
|
|
|
|
Returns
|
|
-------
|
|
X_transformed : sparse matrix, shape=(n_samples, n_out)
|
|
Transformed dataset.
|
|
"""
|
|
return self.one_hot_encoder_.transform(self.apply(X))
|