Python: Added external random forest support
BIN  python/isaac/external/__init__.pyc (vendored)
Binary file not shown.
68  python/isaac/external/_utils.pxd (vendored)
@@ -1,68 +0,0 @@
# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

# See _utils.pyx for details.

import numpy as np
cimport numpy as np

ctypedef np.npy_intp SIZE_t              # Type for indices and counters


# =============================================================================
# Stack data structure
# =============================================================================

# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    SIZE_t start
    SIZE_t end
    SIZE_t depth
    SIZE_t parent
    bint is_left
    double impurity
    SIZE_t n_constant_features

cdef class Stack:
    cdef SIZE_t capacity
    cdef SIZE_t top
    cdef StackRecord* stack_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil


# =============================================================================
# PriorityHeap data structure
# =============================================================================

# A record on the frontier for best-first tree growing
cdef struct PriorityHeapRecord:
    SIZE_t node_id
    SIZE_t start
    SIZE_t end
    SIZE_t pos
    SIZE_t depth
    bint is_leaf
    double impurity
    double impurity_left
    double impurity_right
    double improvement

cdef class PriorityHeap:
    cdef SIZE_t capacity
    cdef SIZE_t heap_ptr
    cdef PriorityHeapRecord* heap_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil
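Taken together, `StackRecord`/`Stack` back depth-first tree growing, while `PriorityHeapRecord`/`PriorityHeap` back best-first growing (used when a leaf budget such as `max_leaf_nodes` is in play). A minimal pure-Python sketch of the depth-first loop these records support; `best_split` and the tuple layout are illustrative stand-ins, not part of this commit:

def grow_depth_first(n_samples, max_depth, best_split):
    # Each tuple mirrors StackRecord: (start, end, depth, parent, is_left,
    # impurity, n_constant_features). best_split(start, end) returns
    # (pos, impurity_left, impurity_right), or None for a leaf.
    stack = [(0, n_samples, 0, -1, False, float("inf"), 0)]
    nodes = []
    while stack:
        start, end, depth, parent, is_left, impurity, n_const = stack.pop()
        node_id = len(nodes)
        nodes.append((parent, is_left, depth))
        split = best_split(start, end) if depth < max_depth else None
        if split is not None:
            pos, imp_left, imp_right = split
            # Push the right child first so the left child is grown next (LIFO).
            stack.append((pos, end, depth + 1, node_id, False, imp_right, n_const))
            stack.append((start, pos, depth + 1, node_id, True, imp_left, n_const))
    return nodes

# e.g. grow_depth_first(8, 2, lambda s, e: ((s + e) // 2, 0.0, 0.0) if e - s > 1 else None)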
230  python/isaac/external/_utils.pyx (vendored)
@@ -1,230 +0,0 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

from libc.stdlib cimport free, malloc, realloc

# =============================================================================
# Stack data structure
# =============================================================================

cdef class Stack:
    """A LIFO data structure.

    Attributes
    ----------
    capacity : SIZE_t
        The number of elements the stack can hold; if more are added then
        ``self.stack_`` needs to be resized.

    top : SIZE_t
        The number of elements currently on the stack.

    stack : StackRecord pointer
        The stack of records (upward in the stack corresponds to the right).
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.top = 0
        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
        if self.stack_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.stack_)

    cdef bint is_empty(self) nogil:
        return self.top <= 0

    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil:
        """Push a new element onto the stack.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = NULL

        # Resize if capacity not sufficient
        if top >= self.capacity:
            self.capacity *= 2
            stack = <StackRecord*> realloc(self.stack_,
                                           self.capacity * sizeof(StackRecord))
            if stack == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.stack_ = stack

        stack = self.stack_
        stack[top].start = start
        stack[top].end = end
        stack[top].depth = depth
        stack[top].parent = parent
        stack[top].is_left = is_left
        stack[top].impurity = impurity
        stack[top].n_constant_features = n_constant_features

        # Increment stack pointer
        self.top = top + 1
        return 0

    cdef int pop(self, StackRecord* res) nogil:
        """Remove the top element from the stack and copy to ``res``.

        Returns 0 if pop was successful (and ``res`` is set); -1
        otherwise.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = self.stack_

        if top <= 0:
            return -1

        res[0] = stack[top - 1]
        self.top = top - 1

        return 0


# =============================================================================
# PriorityHeap data structure
# =============================================================================

cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
    """Restore heap invariant parent.improvement > child.improvement from
       ``pos`` upwards. """
    if pos == 0:
        return

    cdef SIZE_t parent_pos = (pos - 1) / 2

    if heap[parent_pos].improvement < heap[pos].improvement:
        heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
        heapify_up(heap, parent_pos)


cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
                       SIZE_t heap_length) nogil:
    """Restore heap invariant parent.improvement > children.improvement from
       ``pos`` downwards. """
    cdef SIZE_t left_pos = 2 * (pos + 1) - 1
    cdef SIZE_t right_pos = 2 * (pos + 1)
    cdef SIZE_t largest = pos

    if (left_pos < heap_length and
            heap[left_pos].improvement > heap[largest].improvement):
        largest = left_pos

    if (right_pos < heap_length and
            heap[right_pos].improvement > heap[largest].improvement):
        largest = right_pos

    if largest != pos:
        heap[pos], heap[largest] = heap[largest], heap[pos]
        heapify_down(heap, largest, heap_length)


cdef class PriorityHeap:
    """A priority queue implemented as a binary heap.

    The heap invariant is that the impurity improvement of the parent record
    is larger than the impurity improvement of the children.

    Attributes
    ----------
    capacity : SIZE_t
        The capacity of the heap

    heap_ptr : SIZE_t
        The water mark of the heap; the heap grows from left to right in the
        array ``heap_``. The following invariant holds ``heap_ptr < capacity``.

    heap_ : PriorityHeapRecord*
        The array of heap records. The maximum element is on the left;
        the heap grows from left to right
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.heap_ptr = 0
        self.heap_ = <PriorityHeapRecord*> malloc(capacity * sizeof(PriorityHeapRecord))
        if self.heap_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.heap_)

    cdef bint is_empty(self) nogil:
        return self.heap_ptr <= 0

    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil:
        """Push record on the priority heap.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = NULL

        # Resize if capacity not sufficient
        if heap_ptr >= self.capacity:
            self.capacity *= 2
            heap = <PriorityHeapRecord*> realloc(self.heap_,
                                                 self.capacity *
                                                 sizeof(PriorityHeapRecord))
            if heap == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.heap_ = heap

        # Put element as last element of heap
        heap = self.heap_
        heap[heap_ptr].node_id = node_id
        heap[heap_ptr].start = start
        heap[heap_ptr].end = end
        heap[heap_ptr].pos = pos
        heap[heap_ptr].depth = depth
        heap[heap_ptr].is_leaf = is_leaf
        heap[heap_ptr].impurity = impurity
        heap[heap_ptr].impurity_left = impurity_left
        heap[heap_ptr].impurity_right = impurity_right
        heap[heap_ptr].improvement = improvement

        # Heapify up
        heapify_up(heap, heap_ptr)

        # Increase element count
        self.heap_ptr = heap_ptr + 1
        return 0

    cdef int pop(self, PriorityHeapRecord* res) nogil:
        """Remove max element from the heap. """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = self.heap_

        if heap_ptr <= 0:
            return -1

        # Take first element
        res[0] = heap[0]

        # Put last element to the front
        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]

        # Restore heap invariant
        if heap_ptr > 1:
            heapify_down(heap, 0, heap_ptr - 1)

        self.heap_ptr = heap_ptr - 1

        return 0
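`PriorityHeap` is a max-heap keyed on `improvement`, so `pop` always yields the frontier node whose split improves impurity the most. The same ordering can be mimicked with the standard library's `heapq` (a min-heap) by negating the key; a small sketch, with node ids as illustrative payloads:

import heapq

frontier = []
for improvement, node_id in [(0.50, 0), (0.80, 1), (0.10, 2)]:
    # Negate the key: heapq pops the minimum, PriorityHeap pops the maximum.
    heapq.heappush(frontier, (-improvement, node_id))

neg_improvement, node_id = heapq.heappop(frontier)
assert node_id == 1 and -neg_improvement == 0.80  # best split expands first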
1664  python/isaac/external/forest.py (vendored, new file)
File diff suppressed because it is too large.
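`forest.py` is the vendored random forest implementation the commit message refers to; its diff is suppressed above, so its exact contents are not shown here. Assuming it keeps the upstream scikit-learn estimator interface, usage would look roughly like this (the class name and parameters are assumptions, not confirmed by this diff):

import numpy as np
from isaac.external.forest import RandomForestClassifier  # assumed export

X = np.random.RandomState(0).rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

clf = RandomForestClassifier(n_estimators=10, random_state=0)  # assumed params
clf.fit(X, y)
print(clf.score(X, y))  # mean accuracy via the vendored ClassifierMixin.score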
540  python/isaac/external/tree.py (vendored)
@@ -19,12 +19,14 @@ randomized trees. Single and multi-output problems are both handled.
 from __future__ import division

-import sys
 import numbers
 from abc import abstractmethod

 import numpy as np

+from utils import NotFittedError, check_is_fitted, compute_sample_weight, check_array, check_random_state, ClassifierMixin, RegressorMixin, string_types
+from utils import BaseEstimator

 from ._tree import Criterion
 from ._tree import Splitter
 from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
@@ -34,539 +36,7 @@ from . import _tree
 __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
            "ExtraTreeClassifier", "ExtraTreeRegressor"]

-################################ six ########################################
-PY2 = sys.version_info[0] == 2
-PY3 = sys.version_info[0] == 3
-
-################################ utils ########################################
-
-class NotFittedError(ValueError, AttributeError):
-    """Exception class to raise if estimator is used before fitting
-
-    This class inherits from both ValueError and AttributeError to help with
-    exception handling and backward compatibility.
-    """
-
-def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
-    """Perform is_fitted validation for estimator.
-
-    Checks if the estimator is fitted by verifying the presence of
-    "all_or_any" of the passed attributes and raises a NotFittedError with the
-    given message.
-
-    Parameters
-    ----------
-    estimator : estimator instance.
-        estimator instance for which the check is performed.
-
-    attributes : attribute name(s) given as string or a list/tuple of strings
-        Eg. : ["coef_", "estimator_", ...], "coef_"
-
-    msg : string
-        The default error message is, "This %(name)s instance is not fitted
-        yet. Call 'fit' with appropriate arguments before using this method."
-
-        For custom messages if "%(name)s" is present in the message string,
-        it is substituted for the estimator name.
-
-        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
-
-    all_or_any : callable, {all, any}, default all
-        Specify whether all or any of the given attributes must exist.
-    """
-    if msg is None:
-        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
-               "appropriate arguments before using this method.")
-
-    if not hasattr(estimator, 'fit'):
-        raise TypeError("%s is not an estimator instance." % (estimator))
-
-    if not isinstance(attributes, (list, tuple)):
-        attributes = [attributes]
-
-    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
-        raise NotFittedError(msg % {'name': type(estimator).__name__})
-
-def compute_sample_weight(class_weight, y, indices=None):
-    """Estimate sample weights by class for unbalanced datasets.
-
-    Parameters
-    ----------
-    class_weight : dict, list of dicts, "auto", or None, optional
-        Weights associated with classes in the form ``{class_label: weight}``.
-        If not given, all classes are supposed to have weight one. For
-        multi-output problems, a list of dicts can be provided in the same
-        order as the columns of y.
-
-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
-
-        For multi-output, the weights of each column of y will be multiplied.
-
-    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
-        Array of original class labels per sample.
-
-    indices : array-like, shape (n_subsample,), or None
-        Array of indices to be used in a subsample. Can be of length less than
-        n_samples in the case of a subsample, or equal to n_samples in the
-        case of a bootstrap subsample with repeated indices. If None, the
-        sample weight will be calculated over the full sample. Only "auto" is
-        supported for class_weight if this is provided.
-
-    Returns
-    -------
-    sample_weight_vect : ndarray, shape (n_samples,)
-        Array with sample weights as applied to the original y
-    """
-
-    y = np.atleast_1d(y)
-    if y.ndim == 1:
-        y = np.reshape(y, (-1, 1))
-    n_outputs = y.shape[1]
-
-    if isinstance(class_weight, six.string_types):
-        if class_weight != 'auto':
-            raise ValueError('The only valid preset for class_weight is '
-                             '"auto". Given "%s".' % class_weight)
-    elif (indices is not None and
-          not isinstance(class_weight, six.string_types)):
-        raise ValueError('The only valid class_weight for subsampling is '
-                         '"auto". Given "%s".' % class_weight)
-    elif n_outputs > 1:
-        if (not hasattr(class_weight, "__iter__") or
-                isinstance(class_weight, dict)):
-            raise ValueError("For multi-output, class_weight should be a "
-                             "list of dicts, or a valid string.")
-        if len(class_weight) != n_outputs:
-            raise ValueError("For multi-output, number of elements in "
-                             "class_weight should match number of outputs.")
-
-    expanded_class_weight = []
-    for k in range(n_outputs):
-
-        y_full = y[:, k]
-        classes_full = np.unique(y_full)
-        classes_missing = None
-
-        if class_weight == 'auto' or n_outputs == 1:
-            class_weight_k = class_weight
-        else:
-            class_weight_k = class_weight[k]
-
-        if indices is not None:
-            # Get class weights for the subsample, covering all classes in
-            # case some labels that were present in the original data are
-            # missing from the sample.
-            y_subsample = y[indices, k]
-            classes_subsample = np.unique(y_subsample)
-
-            weight_k = np.choose(np.searchsorted(classes_subsample,
-                                                 classes_full),
-                                 compute_class_weight(class_weight_k,
-                                                      classes_subsample,
-                                                      y_subsample),
-                                 mode='clip')
-
-            classes_missing = set(classes_full) - set(classes_subsample)
-        else:
-            weight_k = compute_class_weight(class_weight_k,
-                                            classes_full,
-                                            y_full)
-
-        weight_k = weight_k[np.searchsorted(classes_full, y_full)]
-
-        if classes_missing:
-            # Make missing classes' weight zero
-            weight_k[in1d(y_full, list(classes_missing))] = 0.
-
-        expanded_class_weight.append(weight_k)
-
-    expanded_class_weight = np.prod(expanded_class_weight,
-                                    axis=0,
-                                    dtype=np.float64)
-
-    return expanded_class_weight
-
-def _assert_all_finite(X):
-    """Like assert_all_finite, but only for ndarray."""
-    X = np.asanyarray(X)
-    # First try an O(n) time, O(1) space solution for the common case that
-    # everything is finite; fall back to O(n) space np.isfinite to prevent
-    # false positives from overflow in sum method.
-    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
-            and not np.isfinite(X).all()):
-        raise ValueError("Input contains NaN, infinity"
-                         " or a value too large for %r." % X.dtype)
-
-def check_array(array, accept_sparse=None, dtype="numeric", order=None,
-                copy=False, force_all_finite=True, ensure_2d=True,
-                allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
-    """Input validation on an array, list, sparse matrix or similar.
-
-    By default, the input is converted to an at least 2d numpy array.
-    If the dtype of the array is object, attempt converting to float,
-    raising on failure.
-
-    Parameters
-    ----------
-    array : object
-        Input object to check / convert.
-
-    accept_sparse : string, list of string or None (default=None)
-        String[s] representing allowed sparse matrix formats, such as 'csc',
-        'csr', etc. None means that sparse matrix input will raise an error.
-        If the input is sparse but not in the allowed format, it will be
-        converted to the first listed format.
-
-    dtype : string, type or None (default="numeric")
-        Data type of result. If None, the dtype of the input is preserved.
-        If "numeric", dtype is preserved unless array.dtype is object.
-
-    order : 'F', 'C' or None (default=None)
-        Whether an array will be forced to be fortran or c-style.
-
-    copy : boolean (default=False)
-        Whether a forced copy will be triggered. If copy=False, a copy might
-        be triggered by a conversion.
-
-    force_all_finite : boolean (default=True)
-        Whether to raise an error on np.inf and np.nan in X.
-
-    ensure_2d : boolean (default=True)
-        Whether to make X at least 2d.
-
-    allow_nd : boolean (default=False)
-        Whether to allow X.ndim > 2.
-
-    ensure_min_samples : int (default=1)
-        Make sure that the array has a minimum number of samples in its first
-        axis (rows for a 2D array). Setting to 0 disables this check.
-
-    ensure_min_features : int (default=1)
-        Make sure that the 2D array has some minimum number of features
-        (columns). The default value of 1 rejects empty datasets.
-        This check is only enforced when the input data has effectively 2
-        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
-        disables this check.
-
-    Returns
-    -------
-    X_converted : object
-        The converted and validated X.
-    """
-    if isinstance(accept_sparse, str):
-        accept_sparse = [accept_sparse]
-
-    # store whether originally we wanted numeric dtype
-    dtype_numeric = dtype == "numeric"
-
-    if ensure_2d:
-        array = np.atleast_2d(array)
-    if dtype_numeric:
-        if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
-            # if input is object, convert to float.
-            dtype = np.float64
-        else:
-            dtype = None
-    array = np.array(array, dtype=dtype, order=order, copy=copy)
-    # make sure we actually converted to numeric:
-    if dtype_numeric and array.dtype.kind == "O":
-        array = array.astype(np.float64)
-    if not allow_nd and array.ndim >= 3:
-        raise ValueError("Found array with dim %d. Expected <= 2" %
-                         array.ndim)
-    if force_all_finite:
-        _assert_all_finite(array)
-
-    shape_repr = _shape_repr(array.shape)
-    if ensure_min_samples > 0:
-        n_samples = _num_samples(array)
-        if n_samples < ensure_min_samples:
-            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
-                             " minimum of %d is required."
-                             % (n_samples, shape_repr, ensure_min_samples))
-
-    if ensure_min_features > 0 and array.ndim == 2:
-        n_features = array.shape[1]
-        if n_features < ensure_min_features:
-            raise ValueError("Found array with %d feature(s) (shape=%s) while"
-                             " a minimum of %d is required."
-                             % (n_features, shape_repr, ensure_min_features))
-    return array
-
-def check_random_state(seed):
-    """Turn seed into a np.random.RandomState instance
-
-    If seed is None, return the RandomState singleton used by np.random.
-    If seed is an int, return a new RandomState instance seeded with seed.
-    If seed is already a RandomState instance, return it.
-    Otherwise raise ValueError.
-    """
-    if seed is None or seed is np.random:
-        return np.random.mtrand._rand
-    if isinstance(seed, (numbers.Integral, np.integer)):
-        return np.random.RandomState(seed)
-    if isinstance(seed, np.random.RandomState):
-        return seed
-    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
-                     ' instance' % seed)
-
-def _shape_repr(shape):
-    """Return a platform independent representation of an array shape
-
-    Under Python 2, the `long` type introduces an 'L' suffix when using the
-    default %r format for tuples of integers (typically used to store the shape
-    of an array).
-
-    Under Windows 64 bit (and Python 2), the `long` type is used by default
-    in numpy shapes even when the integer dimensions are well below 32 bit.
-    The platform specific type causes string messages or doctests to change
-    from one platform to another which is not desirable.
-
-    Under Python 3, there is no more `long` type so the `L` suffix is never
-    introduced in string representation.
-
-    >>> _shape_repr((1, 2))
-    '(1, 2)'
-    >>> one = 2 ** 64 / 2 ** 64  # force an upcast to `long` under Python 2
-    >>> _shape_repr((one, 2 * one))
-    '(1, 2)'
-    >>> _shape_repr((1,))
-    '(1,)'
-    >>> _shape_repr(())
-    '()'
-    """
-    if len(shape) == 0:
-        return "()"
-    joined = ", ".join("%d" % e for e in shape)
-    if len(shape) == 1:
-        # special notation for singleton tuples
-        joined += ','
-    return "(%s)" % joined
-
-def _num_samples(x):
-    """Return number of samples in array-like x."""
-    if hasattr(x, 'fit'):
-        # Don't get num_samples from an ensemble's length!
-        raise TypeError('Expected sequence or array-like, got '
-                        'estimator %s' % x)
-    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
-        if hasattr(x, '__array__'):
-            x = np.asarray(x)
-        else:
-            raise TypeError("Expected sequence or array-like, got %s" %
-                            type(x))
-    if hasattr(x, 'shape'):
-        if len(x.shape) == 0:
-            raise TypeError("Singleton array %r cannot be considered"
-                            " a valid collection." % x)
-        return x.shape[0]
-    else:
-        return len(x)
-
-################################ metrics ########################################
-def _weighted_sum(sample_score, sample_weight, normalize=False):
-    if normalize:
-        return np.average(sample_score, weights=sample_weight)
-    elif sample_weight is not None:
-        return np.dot(sample_score, sample_weight)
-    else:
-        return sample_score.sum()
-
-def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
-    """Accuracy classification score.
-
-    In multilabel classification, this function computes subset accuracy:
-    the set of labels predicted for a sample must *exactly* match the
-    corresponding set of labels in y_true.
-
-    Parameters
-    ----------
-    y_true : 1d array-like, or label indicator array / sparse matrix
-        Ground truth (correct) labels.
-
-    y_pred : 1d array-like, or label indicator array / sparse matrix
-        Predicted labels, as returned by a classifier.
-
-    normalize : bool, optional (default=True)
-        If ``False``, return the number of correctly classified samples.
-        Otherwise, return the fraction of correctly classified samples.
-
-    sample_weight : array-like of shape = [n_samples], optional
-        Sample weights.
-
-    Returns
-    -------
-    score : float
-        If ``normalize == True``, return the fraction of correctly classified
-        samples (float), else it returns the number of correctly classified
-        samples (int).
-
-        The best performance is 1 with ``normalize == True`` and the number
-        of samples with ``normalize == False``.
-
-    See also
-    --------
-    jaccard_similarity_score, hamming_loss, zero_one_loss
-
-    Notes
-    -----
-    In binary and multiclass classification, this function is equal
-    to the ``jaccard_similarity_score`` function.
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from sklearn.metrics import accuracy_score
-    >>> y_pred = [0, 2, 1, 3]
-    >>> y_true = [0, 1, 2, 3]
-    >>> accuracy_score(y_true, y_pred)
-    0.5
-    >>> accuracy_score(y_true, y_pred, normalize=False)
-    2
-
-    In the multilabel case with binary label indicators:
-    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
-    0.5
-    """
-
-    # Compute accuracy for each possible representation
-    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
-    if y_type.startswith('multilabel'):
-        differing_labels = count_nonzero(y_true - y_pred, axis=1)
-        score = differing_labels == 0
-    else:
-        score = y_true == y_pred
-
-    return _weighted_sum(score, sample_weight, normalize)
-
-def r2_score(y_true, y_pred, sample_weight=None):
-    """R^2 (coefficient of determination) regression score function.
-
-    Best possible score is 1.0, lower values are worse.
-
-    Parameters
-    ----------
-    y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
-        Ground truth (correct) target values.
-
-    y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
-        Estimated target values.
-
-    sample_weight : array-like of shape = [n_samples], optional
-        Sample weights.
-
-    Returns
-    -------
-    z : float
-        The R^2 score.
-
-    Notes
-    -----
-    This is not a symmetric function.
-
-    Unlike most other scores, R^2 score may be negative (it need not actually
-    be the square of a quantity R).
-
-    References
-    ----------
-    .. [1] `Wikipedia entry on the Coefficient of determination
-            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
-
-    Examples
-    --------
-    >>> from sklearn.metrics import r2_score
-    >>> y_true = [3, -0.5, 2, 7]
-    >>> y_pred = [2.5, 0.0, 2, 8]
-    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
-    0.948...
-    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
-    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
-    0.938...
-
-    """
-    y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
-
-    if sample_weight is not None:
-        sample_weight = column_or_1d(sample_weight)
-        weight = sample_weight[:, np.newaxis]
-    else:
-        weight = 1.
-
-    numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
-    denominator = (weight * (y_true - np.average(
-        y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)
-
-    if denominator == 0.0:
-        if numerator == 0.0:
-            return 1.0
-        else:
-            # arbitrarily set to zero to avoid -inf scores, having a constant
-            # y_true is not interesting for scoring a regression anyway
-            return 0.0
-
-    return 1 - numerator / denominator
-
-
-################################ base #########################################
-class ClassifierMixin(object):
-    """Mixin class for all classifiers in scikit-learn."""
-
-    def score(self, X, y, sample_weight=None):
-        """Returns the mean accuracy on the given test data and labels.
-
-        In multi-label classification, this is the subset accuracy
-        which is a harsh metric since you require for each sample that
-        each label set be correctly predicted.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples.
-
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True labels for X.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
-        Returns
-        -------
-        score : float
-            Mean accuracy of self.predict(X) wrt. y.
-
-        """
-        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
-
-class RegressorMixin(object):
-    """Mixin class for all regression estimators in scikit-learn."""
-
-    def score(self, X, y, sample_weight=None):
-        """Returns the coefficient of determination R^2 of the prediction.
-
-        The coefficient R^2 is defined as (1 - u/v), where u is the residual
-        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
-        sum of squares ((y_true - y_true.mean()) ** 2).sum().
-        Best possible score is 1.0, lower values are worse.
-
-        Parameters
-        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Test samples.
-
-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
-            True values for X.
-
-        sample_weight : array-like, shape = [n_samples], optional
-            Sample weights.
-
-        Returns
-        -------
-        score : float
-            R^2 of self.predict(X) wrt. y.
-        """
-        return r2_score(y, self.predict(X), sample_weight=sample_weight)
-
-
 # =============================================================================
 # Types and constants
 # =============================================================================
@@ -589,7 +59,7 @@ SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
 # =============================================================================


-class BaseDecisionTree:
+class BaseDecisionTree(BaseEstimator):
     """Base class for decision trees.

     Warning: This class should not be used directly.
@@ -709,7 +179,7 @@ class BaseDecisionTree:
         max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                           else self.max_leaf_nodes)

-        if isinstance(self.max_features, str if PY3 else basestring):
+        if isinstance(self.max_features, string_types):
             if self.max_features == "auto":
                 if is_classification:
                     max_features = max(1, int(np.sqrt(self.n_features_)))
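The `string_types` swap above is what the new `from utils import ...` line in the first hunk exists for: it replaces the inline `PY3` check so the `"auto"` branch works identically on Python 2 and 3. For classification that branch reduces to `max(1, int(sqrt(n_features)))`; a standalone sketch of just that rule (the non-classification branch falls outside the visible context and is omitted here):

import numpy as np

def resolve_auto_max_features(n_features, is_classification):
    # Mirrors the hunk above for max_features="auto" in the classification
    # case; other cases are not shown in this diff.
    if is_classification:
        return max(1, int(np.sqrt(n_features)))
    raise NotImplementedError("non-classification branch not shown in diff")

assert resolve_auto_max_features(16, True) == 4
assert resolve_auto_max_features(1, True) == 1   # never fewer than 1 feature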
BIN  python/isaac/external/tree.pyc (vendored)
Binary file not shown.