""" This module gathers tree-based methods, including decision, regression and randomized trees. Single and multi-output problems are both handled. """ # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt # Noel Dawe # Satrajit Gosh # Joly Arnaud # Fares Hedayati # # Licence: BSD 3 clause # MODIFICATIONS: # - Removed base classes # - Incorporated required functions from six, utils from __future__ import division import sys import numbers from abc import abstractmethod import numpy as np from ._tree import Criterion from ._tree import Splitter from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder from ._tree import Tree from . import _tree __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor"] ################################ six ######################################## PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] == 3 ################################ utils ######################################## class NotFittedError(ValueError, AttributeError): """Exception class to raise if estimator is used before fitting This class inherits from both ValueError and AttributeError to help with exception handling and backward compatibility. """ def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of "all_or_any" of the passed attributes and raises a NotFittedError with the given message. Parameters ---------- estimator : estimator instance. estimator instance for which the check is performed. attributes : attribute name(s) given as string or a list/tuple of strings Eg. : ["coef_", "estimator_", ...], "coef_" msg : string The default error message is, "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." For custom messages if "%(name)s" is present in the message string, it is substituted for the estimator name. Eg. : "Estimator, %(name)s, must be fitted before sparsifying". all_or_any : callable, {all, any}, default all Specify whether all or any of the given attributes must exist. """ if msg is None: msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) if not isinstance(attributes, (list, tuple)): attributes = [attributes] if not all_or_any([hasattr(estimator, attr) for attr in attributes]): raise NotFittedError(msg % {'name': type(estimator).__name__}) def compute_sample_weight(class_weight, y, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters ---------- class_weight : dict, list of dicts, "auto", or None, optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. For multi-output, the weights of each column of y will be multiplied. y : array-like, shape = [n_samples] or [n_samples, n_outputs] Array of original class labels per sample. indices : array-like, shape (n_subsample,), or None Array of indices to be used in a subsample. Can be of length less than n_samples in the case of a subsample, or equal to n_samples in the case of a bootstrap subsample with repeated indices. If None, the sample weight will be calculated over the full sample. Only "auto" is supported for class_weight if this is provided. Returns ------- sample_weight_vect : ndarray, shape (n_samples,) Array with sample weights as applied to the original y """ y = np.atleast_1d(y) if y.ndim == 1: y = np.reshape(y, (-1, 1)) n_outputs = y.shape[1] if isinstance(class_weight, six.string_types): if class_weight != 'auto': raise ValueError('The only valid preset for class_weight is ' '"auto". Given "%s".' % class_weight) elif (indices is not None and not isinstance(class_weight, six.string_types)): raise ValueError('The only valid class_weight for subsampling is ' '"auto". Given "%s".' % class_weight) elif n_outputs > 1: if (not hasattr(class_weight, "__iter__") or isinstance(class_weight, dict)): raise ValueError("For multi-output, class_weight should be a " "list of dicts, or a valid string.") if len(class_weight) != n_outputs: raise ValueError("For multi-output, number of elements in " "class_weight should match number of outputs.") expanded_class_weight = [] for k in range(n_outputs): y_full = y[:, k] classes_full = np.unique(y_full) classes_missing = None if class_weight == 'auto' or n_outputs == 1: class_weight_k = class_weight else: class_weight_k = class_weight[k] if indices is not None: # Get class weights for the subsample, covering all classes in # case some labels that were present in the original data are # missing from the sample. y_subsample = y[indices, k] classes_subsample = np.unique(y_subsample) weight_k = np.choose(np.searchsorted(classes_subsample, classes_full), compute_class_weight(class_weight_k, classes_subsample, y_subsample), mode='clip') classes_missing = set(classes_full) - set(classes_subsample) else: weight_k = compute_class_weight(class_weight_k, classes_full, y_full) weight_k = weight_k[np.searchsorted(classes_full, y_full)] if classes_missing: # Make missing classes' weight zero weight_k[in1d(y_full, list(classes_missing))] = 0. expanded_class_weight.append(weight_k) expanded_class_weight = np.prod(expanded_class_weight, axis=0, dtype=np.float64) return expanded_class_weight def _assert_all_finite(X): """Like assert_all_finite, but only for ndarray.""" X = np.asanyarray(X) # First try an O(n) time, O(1) space solution for the common case that # everything is finite; fall back to O(n) space np.isfinite to prevent # false positives from overflow in sum method. if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) and not np.isfinite(X).all()): raise ValueError("Input contains NaN, infinity" " or a value too large for %r." % X.dtype) def check_array(array, accept_sparse=None, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1): """Input validation on an array, list, sparse matrix or similar. By default, the input is converted to an at least 2nd numpy array. If the dtype of the array is object, attempt converting to float, raising on failure. Parameters ---------- array : object Input object to check / convert. accept_sparse : string, list of string or None (default=None) String[s] representing allowed sparse matrix formats, such as 'csc', 'csr', etc. None means that sparse matrix input will raise an error. If the input is sparse but not in the allowed format, it will be converted to the first listed format. dtype : string, type or None (default="numeric") Data type of result. If None, the dtype of the input is preserved. If "numeric", dtype is preserved unless array.dtype is object. order : 'F', 'C' or None (default=None) Whether an array will be forced to be fortran or c-style. copy : boolean (default=False) Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. force_all_finite : boolean (default=True) Whether to raise an error on np.inf and np.nan in X. ensure_2d : boolean (default=True) Whether to make X at least 2d. allow_nd : boolean (default=False) Whether to allow X.ndim > 2. ensure_min_samples : int (default=1) Make sure that the array has a minimum number of samples in its first axis (rows for a 2D array). Setting to 0 disables this check. ensure_min_features : int (default=1) Make sure that the 2D array has some minimum number of features (columns). The default value of 1 rejects empty datasets. This check is only enforced when the input data has effectively 2 dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 disables this check. Returns ------- X_converted : object The converted and validated X. """ if isinstance(accept_sparse, str): accept_sparse = [accept_sparse] # store whether originally we wanted numeric dtype dtype_numeric = dtype == "numeric" if ensure_2d: array = np.atleast_2d(array) if dtype_numeric: if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O": # if input is object, convert to float. dtype = np.float64 else: dtype = None array = np.array(array, dtype=dtype, order=order, copy=copy) # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind == "O": array = array.astype(np.float64) if not allow_nd and array.ndim >= 3: raise ValueError("Found array with dim %d. Expected <= 2" % array.ndim) if force_all_finite: _assert_all_finite(array) shape_repr = _shape_repr(array.shape) if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: raise ValueError("Found array with %d sample(s) (shape=%s) while a" " minimum of %d is required." % (n_samples, shape_repr, ensure_min_samples)) if ensure_min_features > 0 and array.ndim == 2: n_features = array.shape[1] if n_features < ensure_min_features: raise ValueError("Found array with %d feature(s) (shape=%s) while" " a minimum of %d is required." % (n_features, shape_repr, ensure_min_features)) return array def check_random_state(seed): """Turn seed into a np.random.RandomState instance If seed is None, return the RandomState singleton used by np.random. If seed is an int, return a new RandomState instance seeded with seed. If seed is already a RandomState instance, return it. Otherwise raise ValueError. """ if seed is None or seed is np.random: return np.random.mtrand._rand if isinstance(seed, (numbers.Integral, np.integer)): return np.random.RandomState(seed) if isinstance(seed, np.random.RandomState): return seed raise ValueError('%r cannot be used to seed a numpy.random.RandomState' ' instance' % seed) def _shape_repr(shape): """Return a platform independent reprensentation of an array shape Under Python 2, the `long` type introduces an 'L' suffix when using the default %r format for tuples of integers (typically used to store the shape of an array). Under Windows 64 bit (and Python 2), the `long` type is used by default in numpy shapes even when the integer dimensions are well below 32 bit. The platform specific type causes string messages or doctests to change from one platform to another which is not desirable. Under Python 3, there is no more `long` type so the `L` suffix is never introduced in string representation. >>> _shape_repr((1, 2)) '(1, 2)' >>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2 >>> _shape_repr((one, 2 * one)) '(1, 2)' >>> _shape_repr((1,)) '(1,)' >>> _shape_repr(()) '()' """ if len(shape) == 0: return "()" joined = ", ".join("%d" % e for e in shape) if len(shape) == 1: # special notation for singleton tuples joined += ',' return "(%s)" % joined def _num_samples(x): """Return number of samples in array-like x.""" if hasattr(x, 'fit'): # Don't get num_samples from an ensembles length! raise TypeError('Expected sequence or array-like, got ' 'estimator %s' % x) if not hasattr(x, '__len__') and not hasattr(x, 'shape'): if hasattr(x, '__array__'): x = np.asarray(x) else: raise TypeError("Expected sequence or array-like, got %s" % type(x)) if hasattr(x, 'shape'): if len(x.shape) == 0: raise TypeError("Singleton array %r cannot be considered" " a valid collection." % x) return x.shape[0] else: return len(x) ################################ metrics ######################################## def _weighted_sum(sample_score, sample_weight, normalize=False): if normalize: return np.average(sample_score, weights=sample_weight) elif sample_weight is not None: return np.dot(sample_score, sample_weight) else: return sample_score.sum() def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must *exactly* match the corresponding set of labels in y_true. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) labels. y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. normalize : bool, optional (default=True) If ``False``, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- score : float If ``normalize == True``, return the correctly classified samples (float), else it returns the number of correctly classified samples (int). The best performance is 1 with ``normalize == True`` and the number of samples with ``normalize == False``. See also -------- jaccard_similarity_score, hamming_loss, zero_one_loss Notes ----- In binary and multiclass classification, this function is equal to the ``jaccard_similarity_score`` function. Examples -------- >>> import numpy as np >>> from sklearn.metrics import accuracy_score >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] >>> accuracy_score(y_true, y_pred) 0.5 >>> accuracy_score(y_true, y_pred, normalize=False) 2 In the multilabel case with binary label indicators: >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) 0.5 """ # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type.startswith('multilabel'): differing_labels = count_nonzero(y_true - y_pred, axis=1) score = differing_labels == 0 else: score = y_true == y_pred return _weighted_sum(score, sample_weight, normalize) def r2_score(y_true, y_pred, sample_weight=None): """R^2 (coefficient of determination) regression score function. Best possible score is 1.0, lower values are worse. Parameters ---------- y_true : array-like of shape = [n_samples] or [n_samples, n_outputs] Ground truth (correct) target values. y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs] Estimated target values. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- z : float The R^2 score. Notes ----- This is not a symmetric function. Unlike most other scores, R^2 score may be negative (it need not actually be the square of a quantity R). References ---------- .. [1] `Wikipedia entry on the Coefficient of determination `_ Examples -------- >>> from sklearn.metrics import r2_score >>> y_true = [3, -0.5, 2, 7] >>> y_pred = [2.5, 0.0, 2, 8] >>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS 0.948... >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS 0.938... """ y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred) if sample_weight is not None: sample_weight = column_or_1d(sample_weight) weight = sample_weight[:, np.newaxis] else: weight = 1. numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64) denominator = (weight * (y_true - np.average( y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64) if denominator == 0.0: if numerator == 0.0: return 1.0 else: # arbitrary set to zero to avoid -inf scores, having a constant # y_true is not interesting for scoring a regression anyway return 0.0 return 1 - numerator / denominator ################################ base ######################################### class ClassifierMixin(object): """Mixin class for all classifiers in scikit-learn.""" def score(self, X, y, sample_weight=None): """Returns the mean accuracy on the given test data and labels. In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted. Parameters ---------- X : array-like, shape = (n_samples, n_features) Test samples. y : array-like, shape = (n_samples) or (n_samples, n_outputs) True labels for X. sample_weight : array-like, shape = [n_samples], optional Sample weights. Returns ------- score : float Mean accuracy of self.predict(X) wrt. y. """ return accuracy_score(y, self.predict(X), sample_weight=sample_weight) class RegressorMixin(object): """Mixin class for all regression estimators in scikit-learn.""" def score(self, X, y, sample_weight=None): """Returns the coefficient of determination R^2 of the prediction. The coefficient R^2 is defined as (1 - u/v), where u is the regression sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual sum of squares ((y_true - y_true.mean()) ** 2).sum(). Best possible score is 1.0, lower values are worse. Parameters ---------- X : array-like, shape = (n_samples, n_features) Test samples. y : array-like, shape = (n_samples) or (n_samples, n_outputs) True values for X. sample_weight : array-like, shape = [n_samples], optional Sample weights. Returns ------- score : float R^2 of self.predict(X) wrt. y. """ return r2_score(y, self.predict(X), sample_weight=sample_weight) # ============================================================================= # Types and constants # ============================================================================= DTYPE = _tree.DTYPE DOUBLE = _tree.DOUBLE CRITERIA_CLF = {"gini": _tree.Gini, "entropy": _tree.Entropy} CRITERIA_REG = {"mse": _tree.MSE, "friedman_mse": _tree.FriedmanMSE} DENSE_SPLITTERS = {"best": _tree.BestSplitter, "presort-best": _tree.PresortBestSplitter, "random": _tree.RandomSplitter} SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter, "random": _tree.RandomSparseSplitter} # ============================================================================= # Base decision tree # ============================================================================= class BaseDecisionTree: """Base class for decision trees. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, criterion, splitter, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, random_state, class_weight=None): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes self.class_weight = class_weight self.n_features_ = None self.n_outputs_ = None self.classes_ = None self.n_classes_ = None self.tree_ = None self.max_features_ = None def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree from the training set (X, y). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE) # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: y = np.copy(y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) for k in range(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2 ** 31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.max_features, str if PY3 else basestring): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if self.min_samples_split <= 0: raise ValueError("min_samples_split must be greater than zero.") if self.min_samples_leaf <= 0: raise ValueError("min_samples_leaf must be greater than zero.") if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray( sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. # Set min_samples_split sensibly min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, self.min_samples_leaf, min_weight_leaf, random_state) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self def predict(self, X, check_input=True): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ if check_input: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise NotFittedError("Tree not initialized. Perform a fit first") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) # Classification if isinstance(self, ClassifierMixin): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: predictions = np.zeros((n_samples, self.n_outputs_)) for k in range(self.n_outputs_): predictions[:, k] = self.classes_[k].take( np.argmax(proba[:, k], axis=1), axis=0) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0] @property def feature_importances_(self): """Return the feature importances. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Returns ------- feature_importances_ : array, shape = [n_features] """ if self.tree_ is None: raise NotFittedError("Estimator not fitted, call `fit` before" " `feature_importances_`.") return self.tree_.compute_feature_importances() # ============================================================================= # Public estimators # ============================================================================= class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): """A decision tree classifier. Parameters ---------- criterion : string, optional (default="gini") The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. splitter : string, optional (default="best") The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. max_features : int, float, string or None, optional (default=None) The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_depth : int or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Ignored if ``max_leaf_nodes`` is not None. min_samples_split : int, optional (default=2) The minimum number of samples required to split an internal node. min_samples_leaf : int, optional (default=1) The minimum number of samples required to be at a leaf node. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the input samples required to be at a leaf node. max_leaf_nodes : int or None, optional (default=None) Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. class_weight : dict, list of dicts, "auto" or None, optional (default=None) Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "auto" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Attributes ---------- tree_ : Tree object The underlying Tree object. max_features_ : int, The inferred value of max_features. classes_ : array of shape = [n_classes] or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem). n_classes_ : int or list The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems). feature_importances_ : array of shape = [n_features] The feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. See also -------- DecisionTreeRegressor References ---------- .. [1] http://en.wikipedia.org/wiki/Decision_tree_learning .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification and Regression Trees", Wadsworth, Belmont, CA, 1984. .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical Learning", Springer, 2009. .. [4] L. Breiman, and A. Cutler, "Random Forests", http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm Examples -------- >>> from sklearn.datasets import load_iris >>> from sklearn.cross_validation import cross_val_score >>> from sklearn.tree import DecisionTreeClassifier >>> clf = DecisionTreeClassifier(random_state=0) >>> iris = load_iris() >>> cross_val_score(clf, iris.data, iris.target, cv=10) ... # doctest: +SKIP ... array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ def __init__(self, criterion="gini", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None): super(DecisionTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state) def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. The predicted class probability is the fraction of samples of the same class in a leaf. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ check_is_fitted(self, 'n_outputs_') if check_input: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise NotFittedError("Tree not initialized. Perform a fit first.") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) if self.n_outputs_ == 1: proba = proba[:, :self.n_classes_] normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer return proba else: all_proba = [] for k in range(self.n_outputs_): proba_k = proba[:, k, :self.n_classes_[k]] normalizer = proba_k.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba_k /= normalizer all_proba.append(proba_k) return all_proba def predict_log_proba(self, X): """Predict class log-probabilities of the input samples X. Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csr_matrix``. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ proba = self.predict_proba(X) if self.n_outputs_ == 1: return np.log(proba) else: for k in range(self.n_outputs_): proba[k] = np.log(proba[k]) return proba class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): """A decision tree regressor. Parameters ---------- criterion : string, optional (default="mse") The function to measure the quality of a split. The only supported criterion is "mse" for the mean squared error. splitter : string, optional (default="best") The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split. max_features : int, float, string or None, optional (default=None) The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_depth : int or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Ignored if ``max_leaf_nodes`` is not None. min_samples_split : int, optional (default=2) The minimum number of samples required to split an internal node. min_samples_leaf : int, optional (default=1) The minimum number of samples required to be at a leaf node. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the input samples required to be at a leaf node. max_leaf_nodes : int or None, optional (default=None) Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Attributes ---------- tree_ : Tree object The underlying Tree object. max_features_ : int, The inferred value of max_features. feature_importances_ : array of shape = [n_features] The feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance [4]_. See also -------- DecisionTreeClassifier References ---------- .. [1] http://en.wikipedia.org/wiki/Decision_tree_learning .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification and Regression Trees", Wadsworth, Belmont, CA, 1984. .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical Learning", Springer, 2009. .. [4] L. Breiman, and A. Cutler, "Random Forests", http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm Examples -------- >>> from sklearn.datasets import load_boston >>> from sklearn.cross_validation import cross_val_score >>> from sklearn.tree import DecisionTreeRegressor >>> boston = load_boston() >>> regressor = DecisionTreeRegressor(random_state=0) >>> cross_val_score(regressor, boston.data, boston.target, cv=10) ... # doctest: +SKIP ... array([ 0.61..., 0.57..., -0.34..., 0.41..., 0.75..., 0.07..., 0.29..., 0.33..., -1.42..., -1.77...]) """ def __init__(self, criterion="mse", splitter="best", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features=None, random_state=None, max_leaf_nodes=None): super(DecisionTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state) class ExtraTreeClassifier(DecisionTreeClassifier): """An extremely randomized tree classifier. Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the `max_features` randomly selected features and the best split among those is chosen. When `max_features` is set 1, this amounts to building a totally random decision tree. Warning: Extra-trees should only be used within ensemble methods. See also -------- ExtraTreeRegressor, ExtraTreesClassifier, ExtraTreesRegressor References ---------- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. """ def __init__(self, criterion="gini", splitter="random", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", random_state=None, max_leaf_nodes=None, class_weight=None): super(ExtraTreeClassifier, self).__init__( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state) class ExtraTreeRegressor(DecisionTreeRegressor): """An extremely randomized tree regressor. Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the `max_features` randomly selected features and the best split among those is chosen. When `max_features` is set 1, this amounts to building a totally random decision tree. Warning: Extra-trees should only be used within ensemble methods. See also -------- ExtraTreeClassifier, ExtraTreesClassifier, ExtraTreesRegressor References ---------- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", Machine Learning, 63(1), 3-42, 2006. """ def __init__(self, criterion="mse", splitter="random", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", random_state=None, max_leaf_nodes=None): super(ExtraTreeRegressor, self).__init__( criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state)