Python: Added external random forest support

2015-08-13 01:17:35 -07:00
parent cc2d3416be
commit d49a541fd4
6 changed files with 1669 additions and 833 deletions
--- a/python/isaac/external/tree.py
+++ b/python/isaac/external/tree.py
@@ -19,12 +19,14 @@ randomized trees. Single and multi-output problems are both handled.

 from __future__ import division

-import sys
 import numbers
 from abc import  abstractmethod

 import numpy as np

+from utils import NotFittedError, check_is_fitted, compute_sample_weight, check_array, check_random_state, ClassifierMixin, RegressorMixin, string_types
+from utils import BaseEstimator
+
 from ._tree import Criterion
 from ._tree import Splitter
 from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
@@ -34,539 +36,7 @@ from . import _tree
 __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]

-################################ six   ########################################
-PY2 = sys.version_info[0] == 2
-PY3 = sys.version_info[0] == 3

-################################ utils ########################################
-
-class NotFittedError(ValueError, AttributeError):
-    """Exception class to raise if estimator is used before fitting
-
-    This class inherits from both ValueError and AttributeError to help with
-    exception handling and backward compatibility.
-    """
-
-def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
-    """Perform is_fitted validation for estimator.
-
-    Checks if the estimator is fitted by verifying the presence of
-    "all_or_any" of the passed attributes and raises a NotFittedError with the
-    given message.
-
-    Parameters
-    ----------
-    estimator : estimator instance.
-        estimator instance for which the check is performed.
-
-    attributes : attribute name(s) given as string or a list/tuple of strings
-        Eg. : ["coef_", "estimator_", ...], "coef_"
-
-    msg : string
-        The default error message is, "This %(name)s instance is not fitted
-        yet. Call 'fit' with appropriate arguments before using this method."
-
-        For custom messages if "%(name)s" is present in the message string,
-        it is substituted for the estimator name.
-
-        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
-
-    all_or_any : callable, {all, any}, default all
-        Specify whether all or any of the given attributes must exist.
-    """
-    if msg is None:
-        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
-               "appropriate arguments before using this method.")
-
-    if not hasattr(estimator, 'fit'):
-        raise TypeError("%s is not an estimator instance." % (estimator))
-
-    if not isinstance(attributes, (list, tuple)):
-        attributes = [attributes]
-
-    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
-        raise NotFittedError(msg % {'name': type(estimator).__name__})
-        
-def compute_sample_weight(class_weight, y, indices=None):
-    """Estimate sample weights by class for unbalanced datasets.
-
-    Parameters
-    ----------
-    class_weight : dict, list of dicts, "auto", or None, optional
-        Weights associated with classes in the form ``{class_label: weight}``.
-        If not given, all classes are supposed to have weight one. For
-        multi-output problems, a list of dicts can be provided in the same
-        order as the columns of y.
-
-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
-
-        For multi-output, the weights of each column of y will be multiplied.
-
-    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
-        Array of original class labels per sample.
-
-    indices : array-like, shape (n_subsample,), or None
-        Array of indices to be used in a subsample. Can be of length less than
-        n_samples in the case of a subsample, or equal to n_samples in the
-        case of a bootstrap subsample with repeated indices. If None, the
-        sample weight will be calculated over the full sample. Only "auto" is
-        supported for class_weight if this is provided.
-
-    Returns
-    -------
-    sample_weight_vect : ndarray, shape (n_samples,)
-        Array with sample weights as applied to the original y
-    """
-
-    y = np.atleast_1d(y)
-    if y.ndim == 1:
-        y = np.reshape(y, (-1, 1))
-    n_outputs = y.shape[1]
-
-    if isinstance(class_weight, six.string_types):
-        if class_weight != 'auto':
-            raise ValueError('The only valid preset for class_weight is '
-                             '"auto". Given "%s".' % class_weight)
-    elif (indices is not None and
-          not isinstance(class_weight, six.string_types)):
-        raise ValueError('The only valid class_weight for subsampling is '
-                         '"auto". Given "%s".' % class_weight)
-    elif n_outputs > 1:
-        if (not hasattr(class_weight, "__iter__") or
-                isinstance(class_weight, dict)):
-            raise ValueError("For multi-output, class_weight should be a "
-                             "list of dicts, or a valid string.")
-        if len(class_weight) != n_outputs:
-            raise ValueError("For multi-output, number of elements in "
-                             "class_weight should match number of outputs.")
-
-    expanded_class_weight = []
-    for k in range(n_outputs):
-
-        y_full = y[:, k]
-        classes_full = np.unique(y_full)
-        classes_missing = None
-
-        if class_weight == 'auto' or n_outputs == 1:
-            class_weight_k = class_weight
-        else:
-            class_weight_k = class_weight[k]
-
-        if indices is not None:
-            # Get class weights for the subsample, covering all classes in
-            # case some labels that were present in the original data are
-            # missing from the sample.
-            y_subsample = y[indices, k]
-            classes_subsample = np.unique(y_subsample)
-
-            weight_k = np.choose(np.searchsorted(classes_subsample,
-                                                 classes_full),
-                                 compute_class_weight(class_weight_k,
-                                                      classes_subsample,
-                                                      y_subsample),
-                                 mode='clip')
-
-            classes_missing = set(classes_full) - set(classes_subsample)
-        else:
-            weight_k = compute_class_weight(class_weight_k,
-                                            classes_full,
-                                            y_full)
-
-        weight_k = weight_k[np.searchsorted(classes_full, y_full)]
-
-        if classes_missing:
-            # Make missing classes' weight zero
-            weight_k[in1d(y_full, list(classes_missing))] = 0.
-
-        expanded_class_weight.append(weight_k)
-
-    expanded_class_weight = np.prod(expanded_class_weight,
-                                    axis=0,
-                                    dtype=np.float64)
-
-    return expanded_class_weight
-    
-def _assert_all_finite(X):
-    """Like assert_all_finite, but only for ndarray."""
-    X = np.asanyarray(X)
-    # First try an O(n) time, O(1) space solution for the common case that
-    # everything is finite; fall back to O(n) space np.isfinite to prevent
-    # false positives from overflow in sum method.
-    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
-            and not np.isfinite(X).all()):
-        raise ValueError("Input contains NaN, infinity"
-                         " or a value too large for %r." % X.dtype)
-                         
-def check_array(array, accept_sparse=None, dtype="numeric", order=None,
-                copy=False, force_all_finite=True, ensure_2d=True,
-                allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
-    """Input validation on an array, list, sparse matrix or similar.
-
-    By default, the input is converted to an at least 2d numpy array.
-    If the dtype of the array is object, attempt converting to float,
-    raising on failure.
-
-    Parameters
-    ----------
-    array : object
-        Input object to check / convert.
-
-    accept_sparse : string, list of string or None (default=None)
-        String[s] representing allowed sparse matrix formats, such as 'csc',
-        'csr', etc.  None means that sparse matrix input will raise an error.
-        If the input is sparse but not in the allowed format, it will be
-        converted to the first listed format.
-
-    dtype : string, type or None (default="numeric")
-        Data type of result. If None, the dtype of the input is preserved.
-        If "numeric", dtype is preserved unless array.dtype is object.
-
-    order : 'F', 'C' or None (default=None)
-        Whether an array will be forced to be fortran or c-style.
-
-    copy : boolean (default=False)
-        Whether a forced copy will be triggered. If copy=False, a copy might
-        be triggered by a conversion.
-
-    force_all_finite : boolean (default=True)
-        Whether to raise an error on np.inf and np.nan in X.
-
-    ensure_2d : boolean (default=True)
-        Whether to make X at least 2d.
-
-    allow_nd : boolean (default=False)
-        Whether to allow X.ndim > 2.
-
-    ensure_min_samples : int (default=1)
-        Make sure that the array has a minimum number of samples in its first
-        axis (rows for a 2D array). Setting to 0 disables this check.
-
-    ensure_min_features : int (default=1)
-        Make sure that the 2D array has some minimum number of features
-        (columns). The default value of 1 rejects empty datasets.
-        This check is only enforced when the input data has effectively 2
-        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
-        disables this check.
-
-    Returns
-    -------
-    X_converted : object
-        The converted and validated X.
-    """
-    if isinstance(accept_sparse, str):
-        accept_sparse = [accept_sparse]
-
-    # store whether originally we wanted numeric dtype
-    dtype_numeric = dtype == "numeric"
-
-    if ensure_2d:
-        array = np.atleast_2d(array)
-    if dtype_numeric:
-        if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
-            # if input is object, convert to float.
-            dtype = np.float64
-        else:
-            dtype = None
-    array = np.array(array, dtype=dtype, order=order, copy=copy)
-    # make sure we actually converted to numeric:
-    if dtype_numeric and array.dtype.kind == "O":
-        array = array.astype(np.float64)
-    if not allow_nd and array.ndim >= 3:
-        raise ValueError("Found array with dim %d. Expected <= 2" %
-                         array.ndim)
-    if force_all_finite:
-        _assert_all_finite(array)
-
-    shape_repr = _shape_repr(array.shape)
-    if ensure_min_samples > 0:
-        n_samples = _num_samples(array)
-        if n_samples < ensure_min_samples:
-            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
-                             " minimum of %d is required."
-                             % (n_samples, shape_repr, ensure_min_samples))
-
-    if ensure_min_features > 0 and array.ndim == 2:
-        n_features = array.shape[1]
-        if n_features < ensure_min_features:
-            raise ValueError("Found array with %d feature(s) (shape=%s) while"
-                             " a minimum of %d is required."
-                             % (n_features, shape_repr, ensure_min_features))
-    return array
-    
-def check_random_state(seed):
-    """Turn seed into a np.random.RandomState instance
-
-    If seed is None, return the RandomState singleton used by np.random.
-    If seed is an int, return a new RandomState instance seeded with seed.
-    If seed is already a RandomState instance, return it.
-    Otherwise raise ValueError.
-    """
-    if seed is None or seed is np.random:
-        return np.random.mtrand._rand
-    if isinstance(seed, (numbers.Integral, np.integer)):
-        return np.random.RandomState(seed)
-    if isinstance(seed, np.random.RandomState):
-        return seed
-    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
-                     ' instance' % seed)
-    
-def _shape_repr(shape):
-    """Return a platform independent representation of an array shape
-
-    Under Python 2, the `long` type introduces an 'L' suffix when using the
-    default %r format for tuples of integers (typically used to store the shape
-    of an array).
-
-    Under Windows 64 bit (and Python 2), the `long` type is used by default
-    in numpy shapes even when the integer dimensions are well below 32 bit.
-    The platform specific type causes string messages or doctests to change
-    from one platform to another which is not desirable.
-
-    Under Python 3, there is no more `long` type so the `L` suffix is never
-    introduced in string representation.
-
-    >>> _shape_repr((1, 2))
-    '(1, 2)'
-    >>> one = 2 ** 64 / 2 ** 64  # force an upcast to `long` under Python 2
-    >>> _shape_repr((one, 2 * one))
-    '(1, 2)'
-    >>> _shape_repr((1,))
-    '(1,)'
-    >>> _shape_repr(())
-    '()'
-    """
-    if len(shape) == 0:
-        return "()"
-    joined = ", ".join("%d" % e for e in shape)
-    if len(shape) == 1:
-        # special notation for singleton tuples
-        joined += ','
-    return "(%s)" % joined
-
-def _num_samples(x):
-    """Return number of samples in array-like x."""
-    if hasattr(x, 'fit'):
-        # Don't get num_samples from an ensemble's length!
-        raise TypeError('Expected sequence or array-like, got '
-                        'estimator %s' % x)
-    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
-        if hasattr(x, '__array__'):
-            x = np.asarray(x)
-        else:
-            raise TypeError("Expected sequence or array-like, got %s" %
-                            type(x))
-    if hasattr(x, 'shape'):
-        if len(x.shape) == 0:
-            raise TypeError("Singleton array %r cannot be considered"
-                            " a valid collection." % x)
-        return x.shape[0]
-    else:
-        return len(x)
-################################ metrics ########################################
-def _weighted_sum(sample_score, sample_weight, normalize=False):
-    if normalize:
-        return np.average(sample_score, weights=sample_weight)
-    elif sample_weight is not None:
-        return np.dot(sample_score, sample_weight)
-    else:
-        return sample_score.sum()
-
-def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
-    """Accuracy classification score.
-
-    In multilabel classification, this function computes subset accuracy:
-    the set of labels predicted for a sample must *exactly* match the
-    corresponding set of labels in y_true.
-
-    Parameters
-    ----------
-    y_true : 1d array-like, or label indicator array / sparse matrix
-        Ground truth (correct) labels.
-
-    y_pred : 1d array-like, or label indicator array / sparse matrix
-        Predicted labels, as returned by a classifier.
-
-    normalize : bool, optional (default=True)
-        If ``False``, return the number of correctly classified samples.
-        Otherwise, return the fraction of correctly classified samples.
-
-    sample_weight : array-like of shape = [n_samples], optional
-        Sample weights.
-
-    Returns
-    -------
-    score : float
-        If ``normalize == True``, return the fraction of correctly classified
-        samples (float), else return the number of correctly classified
-        samples (int).
-
-        The best performance is 1 with ``normalize == True`` and the number
-        of samples with ``normalize == False``.
-
-    See also
-    --------
-    jaccard_similarity_score, hamming_loss, zero_one_loss
-
-    Notes
-    -----
-    In binary and multiclass classification, this function is equal
-    to the ``jaccard_similarity_score`` function.
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from sklearn.metrics import accuracy_score
-    >>> y_pred = [0, 2, 1, 3]
-    >>> y_true = [0, 1, 2, 3]
-    >>> accuracy_score(y_true, y_pred)
-    0.5
-    >>> accuracy_score(y_true, y_pred, normalize=False)
-    2
-
-    In the multilabel case with binary label indicators:
-    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
-    0.5
-    """
-
-    # Compute accuracy for each possible representation
-    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
-    if y_type.startswith('multilabel'):
-        differing_labels = count_nonzero(y_true - y_pred, axis=1)
-        score = differing_labels == 0
-    else:
-        score = y_true == y_pred
-
-    return _weighted_sum(score, sample_weight, normalize)
-    
-def r2_score(y_true, y_pred, sample_weight=None):
-    """R^2 (coefficient of determination) regression score function.
-
-    Best possible score is 1.0, lower values are worse.
-
-    Parameters
-    ----------
-    y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
-        Ground truth (correct) target values.
-
-    y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
-        Estimated target values.
-
-    sample_weight : array-like of shape = [n_samples], optional
-        Sample weights.
-
-    Returns
-    -------
-    z : float
-        The R^2 score.
-
-    Notes
-    -----
-    This is not a symmetric function.
-
-    Unlike most other scores, R^2 score may be negative (it need not actually
-    be the square of a quantity R).
-
-    References
-    ----------
-    .. [1] `Wikipedia entry on the Coefficient of determination
-            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
-
-    Examples
-    --------
-    >>> from sklearn.metrics import r2_score
-    >>> y_true = [3, -0.5, 2, 7]
-    >>> y_pred = [2.5, 0.0, 2, 8]
-    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
-    0.948...
-    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
-    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
-    0.938...
-
-    """
-    y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
-
-    if sample_weight is not None:
-        sample_weight = column_or_1d(sample_weight)
-        weight = sample_weight[:, np.newaxis]
-    else:
-        weight = 1.
-
-    numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
-    denominator = (weight * (y_true - np.average(
-        y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)
-
-    if denominator == 0.0:
-        if numerator == 0.0:
-            return 1.0
-        else:
-            # arbitrarily set to zero to avoid -inf scores, having a constant
-            # y_true is not interesting for scoring a regression anyway
-            return 0.0
-
-    return 1 - numerator / denominator
-
-
-################################   base   #########################################
-class ClassifierMixin(object):
-    """Mixin class for all classifiers in scikit-learn."""
-
-    def score(self, X, y, sample_weight=None):
-        """Returns the mean accuracy on the given test data and labels.
-
-        In multi-label classification, this is the subset accuracy
-        which is a harsh metric since you require for each sample that
-        each label set be correctly predicted.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples.
-
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True labels for X.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
-        Returns
-        -------
-        score : float
-            Mean accuracy of self.predict(X) wrt. y.
-
-        """
-        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
-        
-class RegressorMixin(object):
-    """Mixin class for all regression estimators in scikit-learn."""
-
-    def score(self, X, y, sample_weight=None):
-        """Returns the coefficient of determination R^2 of the prediction.
-
-        The coefficient R^2 is defined as (1 - u/v), where u is the residual
-        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
-        sum of squares ((y_true - y_true.mean()) ** 2).sum().
-        Best possible score is 1.0, lower values are worse.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples.
-
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True values for X.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
-        Returns
-        -------
-        score : float
-            R^2 of self.predict(X) wrt. y.
-        """
-        return r2_score(y, self.predict(X), sample_weight=sample_weight)
-        
 # =============================================================================
 # Types and constants
 # =============================================================================
@@ -589,7 +59,7 @@ SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
 # =============================================================================


-class BaseDecisionTree:
+class BaseDecisionTree(BaseEstimator):
    """Base class for decision trees.

    Warning: This class should not be used directly.
@@ -709,7 +179,7 @@ class BaseDecisionTree:
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

-        if isinstance(self.max_features, str if PY3 else basestring):
+        if isinstance(self.max_features, string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))