Python: Added external random forest support

Philippe Tillet
2015-08-13 01:17:35 -07:00
parent cc2d3416be
commit d49a541fd4
6 changed files with 1669 additions and 833 deletions

Binary file not shown.


@@ -1,68 +0,0 @@
# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

# See _utils.pyx for details.

import numpy as np
cimport numpy as np

ctypedef np.npy_intp SIZE_t  # Type for indices and counters


# =============================================================================
# Stack data structure
# =============================================================================

# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    SIZE_t start
    SIZE_t end
    SIZE_t depth
    SIZE_t parent
    bint is_left
    double impurity
    SIZE_t n_constant_features


cdef class Stack:
    cdef SIZE_t capacity
    cdef SIZE_t top
    cdef StackRecord* stack_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil


# =============================================================================
# PriorityHeap data structure
# =============================================================================

# A record on the frontier for best-first tree growing
cdef struct PriorityHeapRecord:
    SIZE_t node_id
    SIZE_t start
    SIZE_t end
    SIZE_t pos
    SIZE_t depth
    bint is_leaf
    double impurity
    double impurity_left
    double impurity_right
    double improvement


cdef class PriorityHeap:
    cdef SIZE_t capacity
    cdef SIZE_t heap_ptr
    cdef PriorityHeapRecord* heap_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil
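For orientation: these two structures back the two tree-growing strategies (the `DepthFirstTreeBuilder` and `BestFirstTreeBuilder` imported in the tree module diff below). Depth-first growing pushes and pops `StackRecord`s in LIFO order; best-first growing always expands the frontier record with the largest impurity `improvement`. A minimal pure-Python sketch of those two loops, with hypothetical `split` and `improvement` callables that are not part of this commit:

import heapq

def grow_depth_first(root, split):
    # split(node) -> (left_child, right_child), or None for a leaf
    stack = [root]
    while stack:
        node = stack.pop()                 # LIFO: deepest pending node first
        children = split(node)
        if children is not None:
            stack.extend(children)

def grow_best_first(root, split, improvement, max_leaf_nodes):
    # heapq is a min-heap, so negate the key to always pop the max
    counter = 0                            # tie-breaker keeps tuples comparable
    heap = [(-improvement(root), counter, root)]
    n_leaves = 0
    while heap and n_leaves < max_leaf_nodes:
        _, _, node = heapq.heappop(heap)
        children = split(node)
        if children is None:
            n_leaves += 1
            continue
        for child in children:
            counter += 1
            heapq.heappush(heap, (-improvement(child), counter, child))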


@@ -1,230 +0,0 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

from libc.stdlib cimport free, malloc, realloc


# =============================================================================
# Stack data structure
# =============================================================================

cdef class Stack:
    """A LIFO data structure.

    Attributes
    ----------
    capacity : SIZE_t
        The elements the stack can hold; if more are added then ``self.stack_``
        needs to be resized.

    top : SIZE_t
        The number of elements currently on the stack.

    stack : StackRecord pointer
        The stack of records (upward in the stack corresponds to the right).
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.top = 0
        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
        if self.stack_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.stack_)

    cdef bint is_empty(self) nogil:
        return self.top <= 0

    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil:
        """Push a new element onto the stack.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = NULL

        # Resize if capacity not sufficient
        if top >= self.capacity:
            self.capacity *= 2
            stack = <StackRecord*> realloc(self.stack_,
                                           self.capacity * sizeof(StackRecord))
            if stack == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.stack_ = stack

        stack = self.stack_
        stack[top].start = start
        stack[top].end = end
        stack[top].depth = depth
        stack[top].parent = parent
        stack[top].is_left = is_left
        stack[top].impurity = impurity
        stack[top].n_constant_features = n_constant_features

        # Increment stack pointer
        self.top = top + 1
        return 0

    cdef int pop(self, StackRecord* res) nogil:
        """Remove the top element from the stack and copy to ``res``.

        Returns 0 if pop was successful (and ``res`` is set); -1
        otherwise.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = self.stack_

        if top <= 0:
            return -1

        res[0] = stack[top - 1]
        self.top = top - 1

        return 0
# =============================================================================
# PriorityHeap data structure
# =============================================================================

cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
    """Restore heap invariant parent.improvement > child.improvement from
    ``pos`` upwards. """
    if pos == 0:
        return

    cdef SIZE_t parent_pos = (pos - 1) / 2

    if heap[parent_pos].improvement < heap[pos].improvement:
        heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
        heapify_up(heap, parent_pos)


cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
                       SIZE_t heap_length) nogil:
    """Restore heap invariant parent.improvement > children.improvement from
    ``pos`` downwards. """
    cdef SIZE_t left_pos = 2 * (pos + 1) - 1
    cdef SIZE_t right_pos = 2 * (pos + 1)
    cdef SIZE_t largest = pos

    if (left_pos < heap_length and
            heap[left_pos].improvement > heap[largest].improvement):
        largest = left_pos

    if (right_pos < heap_length and
            heap[right_pos].improvement > heap[largest].improvement):
        largest = right_pos

    if largest != pos:
        heap[pos], heap[largest] = heap[largest], heap[pos]
        heapify_down(heap, largest, heap_length)
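# For intuition (not part of this commit): the same sift operations in plain
# Python over a bare list of improvement values, using 0-based indexing with
# parent = (i - 1) // 2 and children 2*i + 1, 2*i + 2. These are iterative
# where the Cython versions above recurse.
def _py_heapify_up(h, pos):
    while pos > 0:
        parent = (pos - 1) // 2
        if h[parent] >= h[pos]:
            break
        h[parent], h[pos] = h[pos], h[parent]
        pos = parent

def _py_heapify_down(h, pos, n):
    while True:
        left, right = 2 * pos + 1, 2 * pos + 2
        largest = pos
        if left < n and h[left] > h[largest]:
            largest = left
        if right < n and h[right] > h[largest]:
            largest = right
        if largest == pos:
            return
        h[pos], h[largest] = h[largest], h[pos]
        pos = largest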
cdef class PriorityHeap:
    """A priority queue implemented as a binary heap.

    The heap invariant is that the impurity improvement of the parent record
    is larger than the impurity improvement of the children.

    Attributes
    ----------
    capacity : SIZE_t
        The capacity of the heap

    heap_ptr : SIZE_t
        The water mark of the heap; the heap grows from left to right in the
        array ``heap_``. The following invariant holds ``heap_ptr < capacity``.

    heap_ : PriorityHeapRecord*
        The array of heap records. The maximum element is on the left;
        the heap grows from left to right
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.heap_ptr = 0
        self.heap_ = <PriorityHeapRecord*> malloc(capacity *
                                                  sizeof(PriorityHeapRecord))
        if self.heap_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.heap_)

    cdef bint is_empty(self) nogil:
        return self.heap_ptr <= 0

    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil:
        """Push record on the priority heap.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = NULL

        # Resize if capacity not sufficient
        if heap_ptr >= self.capacity:
            self.capacity *= 2
            heap = <PriorityHeapRecord*> realloc(self.heap_,
                                                 self.capacity *
                                                 sizeof(PriorityHeapRecord))
            if heap == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.heap_ = heap

        # Put element as last element of heap
        heap = self.heap_
        heap[heap_ptr].node_id = node_id
        heap[heap_ptr].start = start
        heap[heap_ptr].end = end
        heap[heap_ptr].pos = pos
        heap[heap_ptr].depth = depth
        heap[heap_ptr].is_leaf = is_leaf
        heap[heap_ptr].impurity = impurity
        heap[heap_ptr].impurity_left = impurity_left
        heap[heap_ptr].impurity_right = impurity_right
        heap[heap_ptr].improvement = improvement

        # Heapify up
        heapify_up(heap, heap_ptr)

        # Increase element count
        self.heap_ptr = heap_ptr + 1
        return 0

    cdef int pop(self, PriorityHeapRecord* res) nogil:
        """Remove max element from the heap. """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = self.heap_

        if heap_ptr <= 0:
            return -1

        # Take first element
        res[0] = heap[0]

        # Put last element to the front
        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]

        # Restore heap invariant
        if heap_ptr > 1:
            heapify_down(heap, 0, heap_ptr - 1)

        self.heap_ptr = heap_ptr - 1

        return 0
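Taken together, ``push`` and ``pop`` implement an extract-max queue keyed on ``improvement``, which is exactly what best-first growing needs. A quick pure-Python cross-check of that ordering using only the standard library (illustration, not part of this commit):

import heapq
import random

improvements = [random.random() for _ in range(100)]
heap = []
for imp in improvements:
    heapq.heappush(heap, -imp)       # negate: heapq is a min-heap
popped = [-heapq.heappop(heap) for _ in range(len(improvements))]
assert popped == sorted(improvements, reverse=True)  # max comes out first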

python/isaac/external/forest.py (new vendored file, 1664 additions)

File diff suppressed because it is too large.


@@ -19,12 +19,14 @@ randomized trees. Single and multi-output problems are both handled.
from __future__ import division
import sys
import numbers
from abc import abstractmethod
import numpy as np
from utils import NotFittedError, check_is_fitted, compute_sample_weight, check_array, check_random_state, ClassifierMixin, RegressorMixin, string_types
from utils import BaseEstimator
from ._tree import Criterion
from ._tree import Splitter
from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
@@ -34,539 +36,7 @@ from . import _tree

__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]


################################ six ########################################

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3


################################ utils ########################################

class NotFittedError(ValueError, AttributeError):
    """Exception class to raise if estimator is used before fitting

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """


def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    "all_or_any" of the passed attributes and raises a NotFittedError with the
    given message.

    Parameters
    ----------
    estimator : estimator instance.
        estimator instance for which the check is performed.

    attributes : attribute name(s) given as string or a list/tuple of strings
        Eg. : ["coef_", "estimator_", ...], "coef_"

    msg : string
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this method."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default all
        Specify whether all or any of the given attributes must exist.
    """
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this method.")

    if not hasattr(estimator, 'fit'):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]

    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
        raise NotFittedError(msg % {'name': type(estimator).__name__})
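# Hypothetical usage (not part of this commit): guard a method that needs a
# fitted estimator; 'tree_' is assumed to be the fitted attribute here.
#
#     clf = DecisionTreeClassifier()
#     check_is_fitted(clf, 'tree_')   # raises NotFittedError before clf.fit()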
def compute_sample_weight(class_weight, y, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "auto", or None, optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        The "auto" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data.

        For multi-output, the weights of each column of y will be multiplied.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array of original class labels per sample.

    indices : array-like, shape (n_subsample,), or None
        Array of indices to be used in a subsample. Can be of length less than
        n_samples in the case of a subsample, or equal to n_samples in the
        case of a bootstrap subsample with repeated indices. If None, the
        sample weight will be calculated over the full sample. Only "auto" is
        supported for class_weight if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray, shape (n_samples,)
        Array with sample weights as applied to the original y
    """
    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if isinstance(class_weight, string_types):
        if class_weight != 'auto':
            raise ValueError('The only valid preset for class_weight is '
                             '"auto". Given "%s".' % class_weight)
    elif (indices is not None and
          not isinstance(class_weight, string_types)):
        raise ValueError('The only valid class_weight for subsampling is '
                         '"auto". Given "%s".' % class_weight)
    elif n_outputs > 1:
        if (not hasattr(class_weight, "__iter__") or
                isinstance(class_weight, dict)):
            raise ValueError("For multi-output, class_weight should be a "
                             "list of dicts, or a valid string.")
        if len(class_weight) != n_outputs:
            raise ValueError("For multi-output, number of elements in "
                             "class_weight should match number of outputs.")

    expanded_class_weight = []
    for k in range(n_outputs):
        y_full = y[:, k]
        classes_full = np.unique(y_full)
        classes_missing = None

        if class_weight == 'auto' or n_outputs == 1:
            class_weight_k = class_weight
        else:
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y[indices, k]
            classes_subsample = np.unique(y_subsample)

            weight_k = np.choose(np.searchsorted(classes_subsample,
                                                 classes_full),
                                 compute_class_weight(class_weight_k,
                                                      classes_subsample,
                                                      y_subsample),
                                 mode='clip')

            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(class_weight_k,
                                            classes_full,
                                            y_full)

        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero
            weight_k[np.in1d(y_full, list(classes_missing))] = 0.

        expanded_class_weight.append(weight_k)

    expanded_class_weight = np.prod(expanded_class_weight,
                                    axis=0,
                                    dtype=np.float64)

    return expanded_class_weight
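# Hypothetical usage (not part of this commit): with an unbalanced vector,
# the "auto" preset up-weights the rare class, and the result aligns with y.
#
#     w = compute_sample_weight('auto', np.array([0, 0, 0, 1]))
#     assert w.shape == (4,) and w[3] > w[0]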
def _assert_all_finite(X):
    """Like assert_all_finite, but only for ndarray."""
    X = np.asanyarray(X)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method.
    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
            and not np.isfinite(X).all()):
        raise ValueError("Input contains NaN, infinity"
                         " or a value too large for %r." % X.dtype)


def check_array(array, accept_sparse=None, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is converted to an at least 2d numpy array.
    If the dtype of the array is object, attempt converting to float,
    raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, list of string or None (default=None)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. None means that sparse matrix input will raise an error.
        If the input is sparse but not in the allowed format, it will be
        converted to the first listed format.

    dtype : string, type or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    ensure_2d : boolean (default=True)
        Whether to make X at least 2d.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    """
    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # store whether originally we wanted numeric dtype
    dtype_numeric = dtype == "numeric"

    if ensure_2d:
        array = np.atleast_2d(array)
    if dtype_numeric:
        if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None
    array = np.array(array, dtype=dtype, order=order, copy=copy)
    # make sure we actually converted to numeric:
    if dtype_numeric and array.dtype.kind == "O":
        array = array.astype(np.float64)
    if not allow_nd and array.ndim >= 3:
        raise ValueError("Found array with dim %d. Expected <= 2" %
                         array.ndim)
    if force_all_finite:
        _assert_all_finite(array)

    shape_repr = _shape_repr(array.shape)
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required."
                             % (n_samples, shape_repr, ensure_min_samples))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required."
                             % (n_features, shape_repr, ensure_min_features))

    return array
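# Hypothetical usage (not part of this commit): nested lists are promoted to
# a 2-d ndarray, and degenerate inputs fail fast.
#
#     X = check_array([[1, 2], [3, 4]])   # ndarray with shape (2, 2)
#     check_array(np.empty((0, 3)))       # ValueError: 0 sample(s)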
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance

    If seed is None, return the RandomState singleton used by np.random.
    If seed is an int, return a new RandomState instance seeded with seed.
    If seed is already a RandomState instance, return it.
    Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)
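# Hypothetical usage (not part of this commit): every accepted seed form
# normalizes to a RandomState, so callers need only one code path.
#
#     rs = check_random_state(0)               # int seed
#     assert check_random_state(rs) is rs      # RandomState passes through
#     check_random_state(None)                 # global np.random state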
def _shape_repr(shape):
    """Return a platform independent representation of an array shape

    Under Python 2, the `long` type introduces an 'L' suffix when using the
    default %r format for tuples of integers (typically used to store the shape
    of an array).

    Under Windows 64 bit (and Python 2), the `long` type is used by default
    in numpy shapes even when the integer dimensions are well below 32 bit.
    The platform specific type causes string messages or doctests to change
    from one platform to another which is not desirable.

    Under Python 3, there is no more `long` type so the `L` suffix is never
    introduced in string representation.

    >>> _shape_repr((1, 2))
    '(1, 2)'
    >>> one = 2 ** 64 / 2 ** 64  # force an upcast to `long` under Python 2
    >>> _shape_repr((one, 2 * one))
    '(1, 2)'
    >>> _shape_repr((1,))
    '(1,)'
    >>> _shape_repr(())
    '()'
    """
    if len(shape) == 0:
        return "()"
    joined = ", ".join("%d" % e for e in shape)
    if len(shape) == 1:
        # special notation for singleton tuples
        joined += ','
    return "(%s)" % joined


def _num_samples(x):
    """Return number of samples in array-like x."""
    if hasattr(x, 'fit'):
        # Don't get num_samples from an ensemble's length!
        raise TypeError('Expected sequence or array-like, got '
                        'estimator %s' % x)
    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError("Expected sequence or array-like, got %s" %
                            type(x))
    if hasattr(x, 'shape'):
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        return x.shape[0]
    else:
        return len(x)
################################ metrics ########################################

def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
        return np.average(sample_score, weights=sample_weight)
    elif sample_weight is not None:
        return np.dot(sample_score, sample_weight)
    else:
        return sample_score.sum()


def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    score : float
        If ``normalize == True``, return the correctly classified samples
        (float), else it returns the number of correctly classified samples
        (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    See also
    --------
    jaccard_similarity_score, hamming_loss, zero_one_loss

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_similarity_score`` function.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import accuracy_score
    >>> y_pred = [0, 2, 1, 3]
    >>> y_true = [0, 1, 2, 3]
    >>> accuracy_score(y_true, y_pred)
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False)
    2

    In the multilabel case with binary label indicators:

    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
    0.5
    """
    # Compute accuracy for each possible representation
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if y_type.startswith('multilabel'):
        differing_labels = count_nonzero(y_true - y_pred, axis=1)
        score = differing_labels == 0
    else:
        score = y_true == y_pred

    return _weighted_sum(score, sample_weight, normalize)


def r2_score(y_true, y_pred, sample_weight=None):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0, lower values are worse.

    Parameters
    ----------
    y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Estimated target values.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    z : float
        The R^2 score.

    Notes
    -----
    This is not a symmetric function.

    Unlike most other scores, R^2 score may be negative (it need not actually
    be the square of a quantity R).

    References
    ----------
    .. [1] `Wikipedia entry on the Coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

    Examples
    --------
    >>> from sklearn.metrics import r2_score
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.948...
    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.938...
    """
    y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        weight = sample_weight[:, np.newaxis]
    else:
        weight = 1.

    numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)

    if denominator == 0.0:
        if numerator == 0.0:
            return 1.0
        else:
            # arbitrary set to zero to avoid -inf scores, having a constant
            # y_true is not interesting for scoring a regression anyway
            return 0.0

    return 1 - numerator / denominator
################################ base #########################################

class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)


class RegressorMixin(object):
    """Mixin class for all regression estimators in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        Best possible score is 1.0, lower values are worse.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        return r2_score(y, self.predict(X), sample_weight=sample_weight)
# =============================================================================
# Types and constants
# =============================================================================

@@ -589,7 +59,7 @@ SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
# =============================================================================

-class BaseDecisionTree:
+class BaseDecisionTree(BaseEstimator):
    """Base class for decision trees.

    Warning: This class should not be used directly.

@@ -709,7 +179,7 @@ class BaseDecisionTree:
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

-        if isinstance(self.max_features, str if PY3 else basestring):
+        if isinstance(self.max_features, string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
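For context, a minimal usage sketch of the vendored estimators; the import path below is assumed from this commit's file layout and may differ, so treat it as hypothetical:

import numpy as np
from isaac.external.tree import DecisionTreeClassifier  # hypothetical path

X = np.array([[0., 0.], [1., 1.], [0., 1.], [1., 0.]])
y = np.array([0, 0, 1, 1])

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)
print(clf.predict(X))   # per-sample class predictions
print(clf.score(X, y))  # mean accuracy via ClassifierMixin.score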

Binary file not shown.