Files
triton/python/isaac/external/tree.py
2015-08-12 21:59:59 -07:00

1366 lines
51 KiB
Python

"""
This module gathers tree-based methods, including decision, regression and
randomized trees. Single and multi-output problems are both handled.
"""
# Authors: Gilles Louppe <g.louppe@gmail.com>
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Brian Holt <bdholt1@gmail.com>
# Noel Dawe <noel@dawe.me>
# Satrajit Gosh <satrajit.ghosh@gmail.com>
# Joly Arnaud <arnaud.v.joly@gmail.com>
# Fares Hedayati <fares.hedayati@gmail.com>
#
# Licence: BSD 3 clause
# MODIFICATIONS:
# - Removed base classes
# - Incorporated required functions from six, utils
from __future__ import division
import sys
import numbers
from abc import abstractmethod
import numpy as np
from ._tree import Criterion
from ._tree import Splitter
from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
from ._tree import Tree
from . import _tree
__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
"ExtraTreeClassifier", "ExtraTreeRegressor"]
################################ six ########################################
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
################################ utils ########################################
class NotFittedError(ValueError, AttributeError):
"""Exception class to raise if estimator is used before fitting
This class inherits from both ValueError and AttributeError to help with
exception handling and backward compatibility.
"""
def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
"""Perform is_fitted validation for estimator.
Checks if the estimator is fitted by verifying the presence of
"all_or_any" of the passed attributes and raises a NotFittedError with the
given message.
Parameters
----------
estimator : estimator instance.
estimator instance for which the check is performed.
attributes : attribute name(s) given as string or a list/tuple of strings
Eg. : ["coef_", "estimator_", ...], "coef_"
msg : string
The default error message is, "This %(name)s instance is not fitted
yet. Call 'fit' with appropriate arguments before using this method."
For custom messages if "%(name)s" is present in the message string,
it is substituted for the estimator name.
Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
all_or_any : callable, {all, any}, default all
Specify whether all or any of the given attributes must exist.
"""
if msg is None:
msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
"appropriate arguments before using this method.")
if not hasattr(estimator, 'fit'):
raise TypeError("%s is not an estimator instance." % (estimator))
if not isinstance(attributes, (list, tuple)):
attributes = [attributes]
if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
raise NotFittedError(msg % {'name': type(estimator).__name__})
def compute_sample_weight(class_weight, y, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
----------
class_weight : dict, list of dicts, "auto", or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.
For multi-output, the weights of each column of y will be multiplied.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Array of original class labels per sample.
indices : array-like, shape (n_subsample,), or None
Array of indices to be used in a subsample. Can be of length less than
n_samples in the case of a subsample, or equal to n_samples in the
case of a bootstrap subsample with repeated indices. If None, the
sample weight will be calculated over the full sample. Only "auto" is
supported for class_weight if this is provided.
Returns
-------
sample_weight_vect : ndarray, shape (n_samples,)
Array with sample weights as applied to the original y
"""
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
if isinstance(class_weight, six.string_types):
if class_weight != 'auto':
raise ValueError('The only valid preset for class_weight is '
'"auto". Given "%s".' % class_weight)
elif (indices is not None and
not isinstance(class_weight, six.string_types)):
raise ValueError('The only valid class_weight for subsampling is '
'"auto". Given "%s".' % class_weight)
elif n_outputs > 1:
if (not hasattr(class_weight, "__iter__") or
isinstance(class_weight, dict)):
raise ValueError("For multi-output, class_weight should be a "
"list of dicts, or a valid string.")
if len(class_weight) != n_outputs:
raise ValueError("For multi-output, number of elements in "
"class_weight should match number of outputs.")
expanded_class_weight = []
for k in range(n_outputs):
y_full = y[:, k]
classes_full = np.unique(y_full)
classes_missing = None
if class_weight == 'auto' or n_outputs == 1:
class_weight_k = class_weight
else:
class_weight_k = class_weight[k]
if indices is not None:
# Get class weights for the subsample, covering all classes in
# case some labels that were present in the original data are
# missing from the sample.
y_subsample = y[indices, k]
classes_subsample = np.unique(y_subsample)
weight_k = np.choose(np.searchsorted(classes_subsample,
classes_full),
compute_class_weight(class_weight_k,
classes_subsample,
y_subsample),
mode='clip')
classes_missing = set(classes_full) - set(classes_subsample)
else:
weight_k = compute_class_weight(class_weight_k,
classes_full,
y_full)
weight_k = weight_k[np.searchsorted(classes_full, y_full)]
if classes_missing:
# Make missing classes' weight zero
weight_k[in1d(y_full, list(classes_missing))] = 0.
expanded_class_weight.append(weight_k)
expanded_class_weight = np.prod(expanded_class_weight,
axis=0,
dtype=np.float64)
return expanded_class_weight
def _assert_all_finite(X):
"""Like assert_all_finite, but only for ndarray."""
X = np.asanyarray(X)
# First try an O(n) time, O(1) space solution for the common case that
# everything is finite; fall back to O(n) space np.isfinite to prevent
# false positives from overflow in sum method.
if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
and not np.isfinite(X).all()):
raise ValueError("Input contains NaN, infinity"
" or a value too large for %r." % X.dtype)
def check_array(array, accept_sparse=None, dtype="numeric", order=None,
copy=False, force_all_finite=True, ensure_2d=True,
allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
"""Input validation on an array, list, sparse matrix or similar.
By default, the input is converted to an at least 2nd numpy array.
If the dtype of the array is object, attempt converting to float,
raising on failure.
Parameters
----------
array : object
Input object to check / convert.
accept_sparse : string, list of string or None (default=None)
String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. None means that sparse matrix input will raise an error.
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.
dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
If "numeric", dtype is preserved unless array.dtype is object.
order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
copy : boolean (default=False)
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.
force_all_finite : boolean (default=True)
Whether to raise an error on np.inf and np.nan in X.
ensure_2d : boolean (default=True)
Whether to make X at least 2d.
allow_nd : boolean (default=False)
Whether to allow X.ndim > 2.
ensure_min_samples : int (default=1)
Make sure that the array has a minimum number of samples in its first
axis (rows for a 2D array). Setting to 0 disables this check.
ensure_min_features : int (default=1)
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when the input data has effectively 2
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
disables this check.
Returns
-------
X_converted : object
The converted and validated X.
"""
if isinstance(accept_sparse, str):
accept_sparse = [accept_sparse]
# store whether originally we wanted numeric dtype
dtype_numeric = dtype == "numeric"
if ensure_2d:
array = np.atleast_2d(array)
if dtype_numeric:
if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
# if input is object, convert to float.
dtype = np.float64
else:
dtype = None
array = np.array(array, dtype=dtype, order=order, copy=copy)
# make sure we actually converted to numeric:
if dtype_numeric and array.dtype.kind == "O":
array = array.astype(np.float64)
if not allow_nd and array.ndim >= 3:
raise ValueError("Found array with dim %d. Expected <= 2" %
array.ndim)
if force_all_finite:
_assert_all_finite(array)
shape_repr = _shape_repr(array.shape)
if ensure_min_samples > 0:
n_samples = _num_samples(array)
if n_samples < ensure_min_samples:
raise ValueError("Found array with %d sample(s) (shape=%s) while a"
" minimum of %d is required."
% (n_samples, shape_repr, ensure_min_samples))
if ensure_min_features > 0 and array.ndim == 2:
n_features = array.shape[1]
if n_features < ensure_min_features:
raise ValueError("Found array with %d feature(s) (shape=%s) while"
" a minimum of %d is required."
% (n_features, shape_repr, ensure_min_features))
return array
def check_random_state(seed):
"""Turn seed into a np.random.RandomState instance
If seed is None, return the RandomState singleton used by np.random.
If seed is an int, return a new RandomState instance seeded with seed.
If seed is already a RandomState instance, return it.
Otherwise raise ValueError.
"""
if seed is None or seed is np.random:
return np.random.mtrand._rand
if isinstance(seed, (numbers.Integral, np.integer)):
return np.random.RandomState(seed)
if isinstance(seed, np.random.RandomState):
return seed
raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
' instance' % seed)
def _shape_repr(shape):
"""Return a platform independent reprensentation of an array shape
Under Python 2, the `long` type introduces an 'L' suffix when using the
default %r format for tuples of integers (typically used to store the shape
of an array).
Under Windows 64 bit (and Python 2), the `long` type is used by default
in numpy shapes even when the integer dimensions are well below 32 bit.
The platform specific type causes string messages or doctests to change
from one platform to another which is not desirable.
Under Python 3, there is no more `long` type so the `L` suffix is never
introduced in string representation.
>>> _shape_repr((1, 2))
'(1, 2)'
>>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2
>>> _shape_repr((one, 2 * one))
'(1, 2)'
>>> _shape_repr((1,))
'(1,)'
>>> _shape_repr(())
'()'
"""
if len(shape) == 0:
return "()"
joined = ", ".join("%d" % e for e in shape)
if len(shape) == 1:
# special notation for singleton tuples
joined += ','
return "(%s)" % joined
def _num_samples(x):
"""Return number of samples in array-like x."""
if hasattr(x, 'fit'):
# Don't get num_samples from an ensembles length!
raise TypeError('Expected sequence or array-like, got '
'estimator %s' % x)
if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
if hasattr(x, '__array__'):
x = np.asarray(x)
else:
raise TypeError("Expected sequence or array-like, got %s" %
type(x))
if hasattr(x, 'shape'):
if len(x.shape) == 0:
raise TypeError("Singleton array %r cannot be considered"
" a valid collection." % x)
return x.shape[0]
else:
return len(x)
################################ metrics ########################################
def _weighted_sum(sample_score, sample_weight, normalize=False):
if normalize:
return np.average(sample_score, weights=sample_weight)
elif sample_weight is not None:
return np.dot(sample_score, sample_weight)
else:
return sample_score.sum()
def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
the set of labels predicted for a sample must *exactly* match the
corresponding set of labels in y_true.
Parameters
----------
y_true : 1d array-like, or label indicator array / sparse matrix
Ground truth (correct) labels.
y_pred : 1d array-like, or label indicator array / sparse matrix
Predicted labels, as returned by a classifier.
normalize : bool, optional (default=True)
If ``False``, return the number of correctly classified samples.
Otherwise, return the fraction of correctly classified samples.
sample_weight : array-like of shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
If ``normalize == True``, return the correctly classified samples
(float), else it returns the number of correctly classified samples
(int).
The best performance is 1 with ``normalize == True`` and the number
of samples with ``normalize == False``.
See also
--------
jaccard_similarity_score, hamming_loss, zero_one_loss
Notes
-----
In binary and multiclass classification, this function is equal
to the ``jaccard_similarity_score`` function.
Examples
--------
>>> import numpy as np
>>> from sklearn.metrics import accuracy_score
>>> y_pred = [0, 2, 1, 3]
>>> y_true = [0, 1, 2, 3]
>>> accuracy_score(y_true, y_pred)
0.5
>>> accuracy_score(y_true, y_pred, normalize=False)
2
In the multilabel case with binary label indicators:
>>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
0.5
"""
# Compute accuracy for each possible representation
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
if y_type.startswith('multilabel'):
differing_labels = count_nonzero(y_true - y_pred, axis=1)
score = differing_labels == 0
else:
score = y_true == y_pred
return _weighted_sum(score, sample_weight, normalize)
def r2_score(y_true, y_pred, sample_weight=None):
"""R^2 (coefficient of determination) regression score function.
Best possible score is 1.0, lower values are worse.
Parameters
----------
y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
Ground truth (correct) target values.
y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
Estimated target values.
sample_weight : array-like of shape = [n_samples], optional
Sample weights.
Returns
-------
z : float
The R^2 score.
Notes
-----
This is not a symmetric function.
Unlike most other scores, R^2 score may be negative (it need not actually
be the square of a quantity R).
References
----------
.. [1] `Wikipedia entry on the Coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
Examples
--------
>>> from sklearn.metrics import r2_score
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS
0.948...
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS
0.938...
"""
y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
if sample_weight is not None:
sample_weight = column_or_1d(sample_weight)
weight = sample_weight[:, np.newaxis]
else:
weight = 1.
numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
denominator = (weight * (y_true - np.average(
y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)
if denominator == 0.0:
if numerator == 0.0:
return 1.0
else:
# arbitrary set to zero to avoid -inf scores, having a constant
# y_true is not interesting for scoring a regression anyway
return 0.0
return 1 - numerator / denominator
################################ base #########################################
class ClassifierMixin(object):
"""Mixin class for all classifiers in scikit-learn."""
def score(self, X, y, sample_weight=None):
"""Returns the mean accuracy on the given test data and labels.
In multi-label classification, this is the subset accuracy
which is a harsh metric since you require for each sample that
each label set be correctly predicted.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True labels for X.
sample_weight : array-like, shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
Mean accuracy of self.predict(X) wrt. y.
"""
return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
class RegressorMixin(object):
"""Mixin class for all regression estimators in scikit-learn."""
def score(self, X, y, sample_weight=None):
"""Returns the coefficient of determination R^2 of the prediction.
The coefficient R^2 is defined as (1 - u/v), where u is the regression
sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual
sum of squares ((y_true - y_true.mean()) ** 2).sum().
Best possible score is 1.0, lower values are worse.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True values for X.
sample_weight : array-like, shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
R^2 of self.predict(X) wrt. y.
"""
return r2_score(y, self.predict(X), sample_weight=sample_weight)
# =============================================================================
# Types and constants
# =============================================================================
DTYPE = _tree.DTYPE
DOUBLE = _tree.DOUBLE
CRITERIA_CLF = {"gini": _tree.Gini, "entropy": _tree.Entropy}
CRITERIA_REG = {"mse": _tree.MSE, "friedman_mse": _tree.FriedmanMSE}
DENSE_SPLITTERS = {"best": _tree.BestSplitter,
"presort-best": _tree.PresortBestSplitter,
"random": _tree.RandomSplitter}
SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
"random": _tree.RandomSparseSplitter}
# =============================================================================
# Base decision tree
# =============================================================================
class BaseDecisionTree:
"""Base class for decision trees.
Warning: This class should not be used directly.
Use derived classes instead.
"""
@abstractmethod
def __init__(self,
criterion,
splitter,
max_depth,
min_samples_split,
min_samples_leaf,
min_weight_fraction_leaf,
max_features,
max_leaf_nodes,
random_state,
class_weight=None):
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.class_weight = class_weight
self.n_features_ = None
self.n_outputs_ = None
self.classes_ = None
self.n_classes_ = None
self.tree_ = None
self.max_features_ = None
def fit(self, X, y, sample_weight=None, check_input=True):
"""Build a decision tree from the training set (X, y).
Parameters
----------
X : array-like or sparse matrix, shape = [n_samples, n_features]
The training input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csc_matrix``.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
The target values (class labels in classification, real numbers in
regression). In the regression case, use ``dtype=np.float64`` and
``order='C'`` for maximum efficiency.
sample_weight : array-like, shape = [n_samples] or None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node. In the case of
classification, splits are also ignored if they would result in any
single class carrying a negative weight in either child node.
check_input : boolean, (default=True)
Allow to bypass several input checking.
Don't use this parameter unless you know what you do.
Returns
-------
self : object
Returns self.
"""
random_state = check_random_state(self.random_state)
if check_input:
X = check_array(X, dtype=DTYPE)
# Determine output settings
n_samples, self.n_features_ = X.shape
is_classification = isinstance(self, ClassifierMixin)
y = np.atleast_1d(y)
expanded_class_weight = None
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
# [:, np.newaxis] that does not.
y = np.reshape(y, (-1, 1))
self.n_outputs_ = y.shape[1]
if is_classification:
y = np.copy(y)
self.classes_ = []
self.n_classes_ = []
if self.class_weight is not None:
y_original = np.copy(y)
for k in range(self.n_outputs_):
classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
self.classes_.append(classes_k)
self.n_classes_.append(classes_k.shape[0])
if self.class_weight is not None:
expanded_class_weight = compute_sample_weight(
self.class_weight, y_original)
else:
self.classes_ = [None] * self.n_outputs_
self.n_classes_ = [1] * self.n_outputs_
self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)
# Check parameters
max_depth = ((2 ** 31) - 1 if self.max_depth is None
else self.max_depth)
max_leaf_nodes = (-1 if self.max_leaf_nodes is None
else self.max_leaf_nodes)
if isinstance(self.max_features, str if PY3 else basestring):
if self.max_features == "auto":
if is_classification:
max_features = max(1, int(np.sqrt(self.n_features_)))
else:
max_features = self.n_features_
elif self.max_features == "sqrt":
max_features = max(1, int(np.sqrt(self.n_features_)))
elif self.max_features == "log2":
max_features = max(1, int(np.log2(self.n_features_)))
else:
raise ValueError(
'Invalid value for max_features. Allowed string '
'values are "auto", "sqrt" or "log2".')
elif self.max_features is None:
max_features = self.n_features_
elif isinstance(self.max_features, (numbers.Integral, np.integer)):
max_features = self.max_features
else: # float
if self.max_features > 0.0:
max_features = max(1, int(self.max_features * self.n_features_))
else:
max_features = 0
self.max_features_ = max_features
if len(y) != n_samples:
raise ValueError("Number of labels=%d does not match "
"number of samples=%d" % (len(y), n_samples))
if self.min_samples_split <= 0:
raise ValueError("min_samples_split must be greater than zero.")
if self.min_samples_leaf <= 0:
raise ValueError("min_samples_leaf must be greater than zero.")
if not 0 <= self.min_weight_fraction_leaf <= 0.5:
raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
if max_depth <= 0:
raise ValueError("max_depth must be greater than zero. ")
if not (0 < max_features <= self.n_features_):
raise ValueError("max_features must be in (0, n_features]")
if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
raise ValueError("max_leaf_nodes must be integral number but was "
"%r" % max_leaf_nodes)
if -1 < max_leaf_nodes < 2:
raise ValueError(("max_leaf_nodes {0} must be either smaller than "
"0 or larger than 1").format(max_leaf_nodes))
if sample_weight is not None:
if (getattr(sample_weight, "dtype", None) != DOUBLE or
not sample_weight.flags.contiguous):
sample_weight = np.ascontiguousarray(
sample_weight, dtype=DOUBLE)
if len(sample_weight.shape) > 1:
raise ValueError("Sample weights array has more "
"than one dimension: %d" %
len(sample_weight.shape))
if len(sample_weight) != n_samples:
raise ValueError("Number of weights=%d does not match "
"number of samples=%d" %
(len(sample_weight), n_samples))
if expanded_class_weight is not None:
if sample_weight is not None:
sample_weight = sample_weight * expanded_class_weight
else:
sample_weight = expanded_class_weight
# Set min_weight_leaf from min_weight_fraction_leaf
if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
min_weight_leaf = (self.min_weight_fraction_leaf *
np.sum(sample_weight))
else:
min_weight_leaf = 0.
# Set min_samples_split sensibly
min_samples_split = max(self.min_samples_split,
2 * self.min_samples_leaf)
# Build tree
criterion = self.criterion
if not isinstance(criterion, Criterion):
if is_classification:
criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
self.n_classes_)
else:
criterion = CRITERIA_REG[self.criterion](self.n_outputs_)
SPLITTERS = DENSE_SPLITTERS
splitter = self.splitter
if not isinstance(self.splitter, Splitter):
splitter = SPLITTERS[self.splitter](criterion,
self.max_features_,
self.min_samples_leaf,
min_weight_leaf,
random_state)
self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)
# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
if max_leaf_nodes < 0:
builder = DepthFirstTreeBuilder(splitter, min_samples_split,
self.min_samples_leaf,
min_weight_leaf,
max_depth)
else:
builder = BestFirstTreeBuilder(splitter, min_samples_split,
self.min_samples_leaf,
min_weight_leaf,
max_depth,
max_leaf_nodes)
builder.build(self.tree_, X, y, sample_weight)
if self.n_outputs_ == 1:
self.n_classes_ = self.n_classes_[0]
self.classes_ = self.classes_[0]
return self
def predict(self, X, check_input=True):
"""Predict class or regression value for X.
For a classification model, the predicted class for each sample in X is
returned. For a regression model, the predicted value based on X is
returned.
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
check_input : boolean, (default=True)
Allow to bypass several input checking.
Don't use this parameter unless you know what you do.
Returns
-------
y : array of shape = [n_samples] or [n_samples, n_outputs]
The predicted classes, or the predict values.
"""
if check_input:
X = check_array(X, dtype=DTYPE)
n_samples, n_features = X.shape
if self.tree_ is None:
raise NotFittedError("Tree not initialized. Perform a fit first")
if self.n_features_ != n_features:
raise ValueError("Number of features of the model must "
" match the input. Model n_features is %s and "
" input n_features is %s "
% (self.n_features_, n_features))
proba = self.tree_.predict(X)
# Classification
if isinstance(self, ClassifierMixin):
if self.n_outputs_ == 1:
return self.classes_.take(np.argmax(proba, axis=1), axis=0)
else:
predictions = np.zeros((n_samples, self.n_outputs_))
for k in range(self.n_outputs_):
predictions[:, k] = self.classes_[k].take(
np.argmax(proba[:, k], axis=1),
axis=0)
return predictions
# Regression
else:
if self.n_outputs_ == 1:
return proba[:, 0]
else:
return proba[:, :, 0]
@property
def feature_importances_(self):
"""Return the feature importances.
The importance of a feature is computed as the (normalized) total
reduction of the criterion brought by that feature.
It is also known as the Gini importance.
Returns
-------
feature_importances_ : array, shape = [n_features]
"""
if self.tree_ is None:
raise NotFittedError("Estimator not fitted, call `fit` before"
" `feature_importances_`.")
return self.tree_.compute_feature_importances()
# =============================================================================
# Public estimators
# =============================================================================
class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
"""A decision tree classifier.
Parameters
----------
criterion : string, optional (default="gini")
The function to measure the quality of a split. Supported criteria are
"gini" for the Gini impurity and "entropy" for the information gain.
splitter : string, optional (default="best")
The strategy used to choose the split at each node. Supported
strategies are "best" to choose the best split and "random" to choose
the best random split.
max_features : int, float, string or None, optional (default=None)
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_depth : int or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
Ignored if ``max_leaf_nodes`` is not None.
min_samples_split : int, optional (default=2)
The minimum number of samples required to split an internal node.
min_samples_leaf : int, optional (default=1)
The minimum number of samples required to be at a leaf node.
min_weight_fraction_leaf : float, optional (default=0.)
The minimum weighted fraction of the input samples required to be at a
leaf node.
max_leaf_nodes : int or None, optional (default=None)
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
If not None then ``max_depth`` will be ignored.
class_weight : dict, list of dicts, "auto" or None, optional (default=None)
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
tree_ : Tree object
The underlying Tree object.
max_features_ : int,
The inferred value of max_features.
classes_ : array of shape = [n_classes] or a list of such arrays
The classes labels (single output problem),
or a list of arrays of class labels (multi-output problem).
n_classes_ : int or list
The number of classes (for single output problems),
or a list containing the number of classes for each
output (for multi-output problems).
feature_importances_ : array of shape = [n_features]
The feature importances. The higher, the more important the
feature. The importance of a feature is computed as the (normalized)
total reduction of the criterion brought by that feature. It is also
known as the Gini importance [4]_.
See also
--------
DecisionTreeRegressor
References
----------
.. [1] http://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
and Regression Trees", Wadsworth, Belmont, CA, 1984.
.. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
Learning", Springer, 2009.
.. [4] L. Breiman, and A. Cutler, "Random Forests",
http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.tree import DecisionTreeClassifier
>>> clf = DecisionTreeClassifier(random_state=0)
>>> iris = load_iris()
>>> cross_val_score(clf, iris.data, iris.target, cv=10)
... # doctest: +SKIP
...
array([ 1. , 0.93..., 0.86..., 0.93..., 0.93...,
0.93..., 0.93..., 1. , 0.93..., 1. ])
"""
def __init__(self,
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features=None,
random_state=None,
max_leaf_nodes=None,
class_weight=None):
super(DecisionTreeClassifier, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
class_weight=class_weight,
random_state=random_state)
def predict_proba(self, X, check_input=True):
"""Predict class probabilities of the input samples X.
The predicted class probability is the fraction of samples of the same
class in a leaf.
check_input : boolean, (default=True)
Allow to bypass several input checking.
Don't use this parameter unless you know what you do.
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
p : array of shape = [n_samples, n_classes], or a list of n_outputs
such arrays if n_outputs > 1.
The class probabilities of the input samples. The order of the
classes corresponds to that in the attribute `classes_`.
"""
check_is_fitted(self, 'n_outputs_')
if check_input:
X = check_array(X, dtype=DTYPE)
n_samples, n_features = X.shape
if self.tree_ is None:
raise NotFittedError("Tree not initialized. Perform a fit first.")
if self.n_features_ != n_features:
raise ValueError("Number of features of the model must "
" match the input. Model n_features is %s and "
" input n_features is %s "
% (self.n_features_, n_features))
proba = self.tree_.predict(X)
if self.n_outputs_ == 1:
proba = proba[:, :self.n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
return proba
else:
all_proba = []
for k in range(self.n_outputs_):
proba_k = proba[:, k, :self.n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
all_proba.append(proba_k)
return all_proba
def predict_log_proba(self, X):
"""Predict class log-probabilities of the input samples X.
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csr_matrix``.
Returns
-------
p : array of shape = [n_samples, n_classes], or a list of n_outputs
such arrays if n_outputs > 1.
The class log-probabilities of the input samples. The order of the
classes corresponds to that in the attribute `classes_`.
"""
proba = self.predict_proba(X)
if self.n_outputs_ == 1:
return np.log(proba)
else:
for k in range(self.n_outputs_):
proba[k] = np.log(proba[k])
return proba
class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
"""A decision tree regressor.
Parameters
----------
criterion : string, optional (default="mse")
The function to measure the quality of a split. The only supported
criterion is "mse" for the mean squared error.
splitter : string, optional (default="best")
The strategy used to choose the split at each node. Supported
strategies are "best" to choose the best split and "random" to choose
the best random split.
max_features : int, float, string or None, optional (default=None)
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_depth : int or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
Ignored if ``max_leaf_nodes`` is not None.
min_samples_split : int, optional (default=2)
The minimum number of samples required to split an internal node.
min_samples_leaf : int, optional (default=1)
The minimum number of samples required to be at a leaf node.
min_weight_fraction_leaf : float, optional (default=0.)
The minimum weighted fraction of the input samples required to be at a
leaf node.
max_leaf_nodes : int or None, optional (default=None)
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
If not None then ``max_depth`` will be ignored.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
tree_ : Tree object
The underlying Tree object.
max_features_ : int,
The inferred value of max_features.
feature_importances_ : array of shape = [n_features]
The feature importances.
The higher, the more important the feature.
The importance of a feature is computed as the
(normalized) total reduction of the criterion brought
by that feature. It is also known as the Gini importance [4]_.
See also
--------
DecisionTreeClassifier
References
----------
.. [1] http://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
and Regression Trees", Wadsworth, Belmont, CA, 1984.
.. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
Learning", Springer, 2009.
.. [4] L. Breiman, and A. Cutler, "Random Forests",
http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
Examples
--------
>>> from sklearn.datasets import load_boston
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.tree import DecisionTreeRegressor
>>> boston = load_boston()
>>> regressor = DecisionTreeRegressor(random_state=0)
>>> cross_val_score(regressor, boston.data, boston.target, cv=10)
... # doctest: +SKIP
...
array([ 0.61..., 0.57..., -0.34..., 0.41..., 0.75...,
0.07..., 0.29..., 0.33..., -1.42..., -1.77...])
"""
def __init__(self,
criterion="mse",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features=None,
random_state=None,
max_leaf_nodes=None):
super(DecisionTreeRegressor, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)
class ExtraTreeClassifier(DecisionTreeClassifier):
"""An extremely randomized tree classifier.
Extra-trees differ from classic decision trees in the way they are built.
When looking for the best split to separate the samples of a node into two
groups, random splits are drawn for each of the `max_features` randomly
selected features and the best split among those is chosen. When
`max_features` is set 1, this amounts to building a totally random
decision tree.
Warning: Extra-trees should only be used within ensemble methods.
See also
--------
ExtraTreeRegressor, ExtraTreesClassifier, ExtraTreesRegressor
References
----------
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
Machine Learning, 63(1), 3-42, 2006.
"""
def __init__(self,
criterion="gini",
splitter="random",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features="auto",
random_state=None,
max_leaf_nodes=None,
class_weight=None):
super(ExtraTreeClassifier, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
class_weight=class_weight,
random_state=random_state)
class ExtraTreeRegressor(DecisionTreeRegressor):
"""An extremely randomized tree regressor.
Extra-trees differ from classic decision trees in the way they are built.
When looking for the best split to separate the samples of a node into two
groups, random splits are drawn for each of the `max_features` randomly
selected features and the best split among those is chosen. When
`max_features` is set 1, this amounts to building a totally random
decision tree.
Warning: Extra-trees should only be used within ensemble methods.
See also
--------
ExtraTreeClassifier, ExtraTreesClassifier, ExtraTreesRegressor
References
----------
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
Machine Learning, 63(1), 3-42, 2006.
"""
def __init__(self,
criterion="mse",
splitter="random",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features="auto",
random_state=None,
max_leaf_nodes=None):
super(ExtraTreeRegressor, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)