Python: Added external random forest support
BIN  python/isaac/external/__init__.pyc (vendored)
Binary file not shown.
68  python/isaac/external/_utils.pxd (vendored)
@@ -1,68 +0,0 @@
# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

# See _utils.pyx for details.

import numpy as np
cimport numpy as np

ctypedef np.npy_intp SIZE_t          # Type for indices and counters


# =============================================================================
# Stack data structure
# =============================================================================

# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    SIZE_t start
    SIZE_t end
    SIZE_t depth
    SIZE_t parent
    bint is_left
    double impurity
    SIZE_t n_constant_features

cdef class Stack:
    cdef SIZE_t capacity
    cdef SIZE_t top
    cdef StackRecord* stack_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil


# =============================================================================
# PriorityHeap data structure
# =============================================================================

# A record on the frontier for best-first tree growing
cdef struct PriorityHeapRecord:
    SIZE_t node_id
    SIZE_t start
    SIZE_t end
    SIZE_t pos
    SIZE_t depth
    bint is_leaf
    double impurity
    double impurity_left
    double impurity_right
    double improvement

cdef class PriorityHeap:
    cdef SIZE_t capacity
    cdef SIZE_t heap_ptr
    cdef PriorityHeapRecord* heap_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil
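These declarations back the tree builders in _tree.pyx: the Stack serves depth-first growing, the PriorityHeap best-first growing. As a minimal pure-Python sketch of the depth-first pattern (the split callback and tuple layout here are illustrative, not part of this commit):

def build_depth_first(split, root):
    # Each record mirrors StackRecord: (start, end, depth, parent, is_left).
    stack = [root]
    while stack:                        # is_empty()
        record = stack.pop()            # pop()
        left, right = split(record)     # hypothetical: returns child records or None
        if right is not None:
            stack.append(right)         # push right first so the left child
        if left is not None:
            stack.append(left)          # is grown next (LIFO order)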
230  python/isaac/external/_utils.pyx (vendored)
@@ -1,230 +0,0 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause

from libc.stdlib cimport free, malloc, realloc

# =============================================================================
# Stack data structure
# =============================================================================

cdef class Stack:
    """A LIFO data structure.

    Attributes
    ----------
    capacity : SIZE_t
        The number of elements the stack can hold; if more are added,
        ``self.stack_`` needs to be resized.

    top : SIZE_t
        The number of elements currently on the stack.

    stack_ : StackRecord pointer
        The stack of records (upward in the stack corresponds to the right).
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.top = 0
        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
        if self.stack_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.stack_)

    cdef bint is_empty(self) nogil:
        return self.top <= 0

    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil:
        """Push a new element onto the stack.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = NULL

        # Resize if capacity is not sufficient
        if top >= self.capacity:
            self.capacity *= 2
            stack = <StackRecord*> realloc(self.stack_,
                                           self.capacity * sizeof(StackRecord))
            if stack == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.stack_ = stack

        stack = self.stack_
        stack[top].start = start
        stack[top].end = end
        stack[top].depth = depth
        stack[top].parent = parent
        stack[top].is_left = is_left
        stack[top].impurity = impurity
        stack[top].n_constant_features = n_constant_features

        # Increment stack pointer
        self.top = top + 1
        return 0

    cdef int pop(self, StackRecord* res) nogil:
        """Remove the top element from the stack and copy it to ``res``.

        Returns 0 if the pop was successful (and ``res`` is set); -1
        otherwise.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = self.stack_

        if top <= 0:
            return -1

        res[0] = stack[top - 1]
        self.top = top - 1

        return 0


# =============================================================================
# PriorityHeap data structure
# =============================================================================

cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
    """Restore the heap invariant parent.improvement > child.improvement
    from ``pos`` upwards."""
    if pos == 0:
        return

    cdef SIZE_t parent_pos = (pos - 1) / 2

    if heap[parent_pos].improvement < heap[pos].improvement:
        heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
        heapify_up(heap, parent_pos)


cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
                       SIZE_t heap_length) nogil:
    """Restore the heap invariant parent.improvement > children.improvement
    from ``pos`` downwards."""
    cdef SIZE_t left_pos = 2 * (pos + 1) - 1
    cdef SIZE_t right_pos = 2 * (pos + 1)
    cdef SIZE_t largest = pos

    if (left_pos < heap_length and
            heap[left_pos].improvement > heap[largest].improvement):
        largest = left_pos

    if (right_pos < heap_length and
            heap[right_pos].improvement > heap[largest].improvement):
        largest = right_pos

    if largest != pos:
        heap[pos], heap[largest] = heap[largest], heap[pos]
        heapify_down(heap, largest, heap_length)


cdef class PriorityHeap:
    """A priority queue implemented as a binary heap.

    The heap invariant is that the impurity improvement of the parent record
    is larger than the impurity improvement of the children.

    Attributes
    ----------
    capacity : SIZE_t
        The capacity of the heap.

    heap_ptr : SIZE_t
        The water mark of the heap; the heap grows from left to right in the
        array ``heap_``. The invariant ``heap_ptr < capacity`` holds.

    heap_ : PriorityHeapRecord*
        The array of heap records. The maximum element is on the left;
        the heap grows from left to right.
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.heap_ptr = 0
        self.heap_ = <PriorityHeapRecord*> malloc(capacity * sizeof(PriorityHeapRecord))
        if self.heap_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.heap_)

    cdef bint is_empty(self) nogil:
        return self.heap_ptr <= 0

    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil:
        """Push a record onto the priority heap.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = NULL

        # Resize if capacity is not sufficient
        if heap_ptr >= self.capacity:
            self.capacity *= 2
            heap = <PriorityHeapRecord*> realloc(self.heap_,
                                                 self.capacity *
                                                 sizeof(PriorityHeapRecord))
            if heap == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.heap_ = heap

        # Put element as last element of heap
        heap = self.heap_
        heap[heap_ptr].node_id = node_id
        heap[heap_ptr].start = start
        heap[heap_ptr].end = end
        heap[heap_ptr].pos = pos
        heap[heap_ptr].depth = depth
        heap[heap_ptr].is_leaf = is_leaf
        heap[heap_ptr].impurity = impurity
        heap[heap_ptr].impurity_left = impurity_left
        heap[heap_ptr].impurity_right = impurity_right
        heap[heap_ptr].improvement = improvement

        # Heapify up
        heapify_up(heap, heap_ptr)

        # Increase element count
        self.heap_ptr = heap_ptr + 1
        return 0

    cdef int pop(self, PriorityHeapRecord* res) nogil:
        """Remove the max element from the heap."""
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = self.heap_

        if heap_ptr <= 0:
            return -1

        # Take the first element
        res[0] = heap[0]

        # Put the last element at the front
        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]

        # Restore the heap invariant
        if heap_ptr > 1:
            heapify_down(heap, 0, heap_ptr - 1)

        self.heap_ptr = heap_ptr - 1

        return 0
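``pop`` always returns the record with the largest ``improvement``, which is what lets a best-first builder expand the most profitable split next (and honour ``max_leaf_nodes``). A rough standard-library analogue, not how the Cython class is implemented; ``heapq`` is a min-heap, so the key is negated:

import heapq, itertools

frontier = []                         # plays the role of heap_
tiebreak = itertools.count()          # avoids comparing records on equal keys

def push(record):                     # record carries an 'improvement' field
    heapq.heappush(frontier, (-record['improvement'], next(tiebreak), record))

def pop_best():
    if not frontier:                  # is_empty(); the Cython pop() returns -1 here
        return None
    return heapq.heappop(frontier)[2]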
1664  python/isaac/external/forest.py (vendored, new file)
File diff suppressed because it is too large.
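forest.py carries the ensemble layer the commit title refers to. Its diff is suppressed, so the class name below is assumed to mirror the upstream scikit-learn module this code was vendored from; it is not confirmed by this commit:

import numpy as np
from isaac.external.forest import RandomForestClassifier  # name assumed from upstream

X = np.random.RandomState(0).rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
print(clf.score(X, y))    # mean accuracy via ClassifierMixin.score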
540  python/isaac/external/tree.py (vendored)
@@ -19,12 +19,14 @@ randomized trees. Single and multi-output problems are both handled.
from __future__ import division

import sys
import numbers
from abc import abstractmethod

import numpy as np

from utils import NotFittedError, check_is_fitted, compute_sample_weight, check_array, check_random_state, ClassifierMixin, RegressorMixin, string_types
from utils import BaseEstimator

from ._tree import Criterion
from ._tree import Splitter
from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
@@ -34,539 +36,7 @@ from . import _tree

__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]

################################ six ########################################
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

################################ utils ########################################

class NotFittedError(ValueError, AttributeError):
    """Exception class to raise if the estimator is used before fitting.

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """

def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
    """Perform is-fitted validation for an estimator.

    Checks if the estimator is fitted by verifying the presence of
    "all_or_any" of the passed attributes and raises a NotFittedError with
    the given message.

    Parameters
    ----------
    estimator : estimator instance.
        Estimator instance for which the check is performed.

    attributes : attribute name(s) given as string or a list/tuple of strings
        Eg. : ["coef_", "estimator_", ...], "coef_"

    msg : string
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this method."

        For custom messages, if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default all
        Specify whether all or any of the given attributes must exist.
    """
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this method.")

    if not hasattr(estimator, 'fit'):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]

    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
        raise NotFittedError(msg % {'name': type(estimator).__name__})
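The guard is meant to sit at the top of every post-fit method; a minimal sketch with a hypothetical estimator:

class TinyEstimator(object):
    def fit(self, X, y):
        self.coef_ = 1.0                 # fitted attribute, trailing underscore
        return self

    def predict(self, X):
        check_is_fitted(self, 'coef_')   # raises NotFittedError before fit()
        return [x * self.coef_ for x in X]

TinyEstimator().predict([1, 2])                 # -> NotFittedError
TinyEstimator().fit([1], [1]).predict([1, 2])   # -> [1.0, 2.0]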
def compute_sample_weight(class_weight, y, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "auto", or None, optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        The "auto" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input
        data.

        For multi-output, the weights of each column of y will be multiplied.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array of original class labels per sample.

    indices : array-like, shape (n_subsample,), or None
        Array of indices to be used in a subsample. Can be of length less
        than n_samples in the case of a subsample, or equal to n_samples in
        the case of a bootstrap subsample with repeated indices. If None, the
        sample weight will be calculated over the full sample. Only "auto" is
        supported for class_weight if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray, shape (n_samples,)
        Array with sample weights as applied to the original y.
    """
    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if isinstance(class_weight, six.string_types):
        if class_weight != 'auto':
            raise ValueError('The only valid preset for class_weight is '
                             '"auto". Given "%s".' % class_weight)
    elif (indices is not None and
          not isinstance(class_weight, six.string_types)):
        raise ValueError('The only valid class_weight for subsampling is '
                         '"auto". Given "%s".' % class_weight)
    elif n_outputs > 1:
        if (not hasattr(class_weight, "__iter__") or
                isinstance(class_weight, dict)):
            raise ValueError("For multi-output, class_weight should be a "
                             "list of dicts, or a valid string.")
        if len(class_weight) != n_outputs:
            raise ValueError("For multi-output, number of elements in "
                             "class_weight should match number of outputs.")

    expanded_class_weight = []
    for k in range(n_outputs):

        y_full = y[:, k]
        classes_full = np.unique(y_full)
        classes_missing = None

        if class_weight == 'auto' or n_outputs == 1:
            class_weight_k = class_weight
        else:
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y[indices, k]
            classes_subsample = np.unique(y_subsample)

            weight_k = np.choose(np.searchsorted(classes_subsample,
                                                 classes_full),
                                 compute_class_weight(class_weight_k,
                                                      classes_subsample,
                                                      y_subsample),
                                 mode='clip')

            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(class_weight_k,
                                            classes_full,
                                            y_full)

        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero
            weight_k[in1d(y_full, list(classes_missing))] = 0.

        expanded_class_weight.append(weight_k)

    expanded_class_weight = np.prod(expanded_class_weight,
                                    axis=0,
                                    dtype=np.float64)

    return expanded_class_weight
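As a worked example of the 'auto' mode (assuming the usual n_samples / (n_classes * class_count) weighting implemented by compute_class_weight, which is not shown in this diff): for y = [0, 0, 0, 1], class 0 gets 4 / (2 * 3) ~ 0.67 and class 1 gets 4 / (2 * 1) = 2.0, so

w = compute_sample_weight('auto', [0, 0, 0, 1])
# w ~ [0.67, 0.67, 0.67, 2.0] -- minority samples count three times as much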
def _assert_all_finite(X):
    """Like assert_all_finite, but only for ndarray."""
    X = np.asanyarray(X)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in the sum method.
    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
            and not np.isfinite(X).all()):
        raise ValueError("Input contains NaN, infinity"
                         " or a value too large for %r." % X.dtype)


def check_array(array, accept_sparse=None, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is converted to an at least 2-d numpy array.
    If the dtype of the array is object, attempt converting to float,
    raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, list of string or None (default=None)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. None means that sparse matrix input will raise an error.
        If the input is sparse but not in the allowed format, it will be
        converted to the first listed format.

    dtype : string, type or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be Fortran or C-style.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    ensure_2d : boolean (default=True)
        Whether to make X at least 2d.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to
        0 disables this check.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    """
    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # store whether we originally wanted a numeric dtype
    dtype_numeric = dtype == "numeric"

    if ensure_2d:
        array = np.atleast_2d(array)
    if dtype_numeric:
        if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None
    array = np.array(array, dtype=dtype, order=order, copy=copy)
    # make sure we actually converted to numeric:
    if dtype_numeric and array.dtype.kind == "O":
        array = array.astype(np.float64)
    if not allow_nd and array.ndim >= 3:
        raise ValueError("Found array with dim %d. Expected <= 2" %
                         array.ndim)
    if force_all_finite:
        _assert_all_finite(array)

    shape_repr = _shape_repr(array.shape)
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required."
                             % (n_samples, shape_repr, ensure_min_samples))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required."
                             % (n_features, shape_repr, ensure_min_features))
    return array
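Illustrative behaviour of the main checks:

X = check_array([[1, 2], [3, 4]])        # list of lists -> 2-d ndarray, dtype preserved
try:
    check_array(np.zeros((2, 2, 2)))     # ndim 3 rejected by default
except ValueError as err:
    print(err)                           # Found array with dim 3. Expected <= 2
check_array(np.zeros((2, 2, 2)), allow_nd=True)   # accepted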
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    If seed is None, return the RandomState singleton used by np.random.
    If seed is an int, return a new RandomState instance seeded with seed.
    If seed is already a RandomState instance, return it.
    Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)
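This is what lets every estimator accept ``random_state`` as None, an int, or a shared RandomState uniformly:

rng = check_random_state(0)              # int -> fresh, reproducible RandomState
assert check_random_state(rng) is rng    # RandomState passed through untouched
check_random_state(None)                 # global singleton behind np.random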
def _shape_repr(shape):
    """Return a platform-independent representation of an array shape.

    Under Python 2, the `long` type introduces an 'L' suffix when using the
    default %r format for tuples of integers (typically used to store the
    shape of an array).

    Under Windows 64 bit (and Python 2), the `long` type is used by default
    in numpy shapes even when the integer dimensions are well below 32 bit.
    The platform-specific type causes string messages or doctests to change
    from one platform to another, which is not desirable.

    Under Python 3, there is no more `long` type, so the `L` suffix is never
    introduced in the string representation.

    >>> _shape_repr((1, 2))
    '(1, 2)'
    >>> one = 2 ** 64 / 2 ** 64  # force an upcast to `long` under Python 2
    >>> _shape_repr((one, 2 * one))
    '(1, 2)'
    >>> _shape_repr((1,))
    '(1,)'
    >>> _shape_repr(())
    '()'
    """
    if len(shape) == 0:
        return "()"
    joined = ", ".join("%d" % e for e in shape)
    if len(shape) == 1:
        # special notation for singleton tuples
        joined += ','
    return "(%s)" % joined


def _num_samples(x):
    """Return the number of samples in array-like x."""
    if hasattr(x, 'fit'):
        # Don't get num_samples from an ensemble's length!
        raise TypeError('Expected sequence or array-like, got '
                        'estimator %s' % x)
    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError("Expected sequence or array-like, got %s" %
                            type(x))
    if hasattr(x, 'shape'):
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        return x.shape[0]
    else:
        return len(x)
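Taking ``shape[0]`` first keeps a data matrix and its label vector consistent:

assert _num_samples(np.zeros((150, 4))) == 150   # rows, not total elements
assert _num_samples([0, 1, 2]) == 3              # plain sequences use len()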
################################ metrics ########################################
def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
        return np.average(sample_score, weights=sample_weight)
    elif sample_weight is not None:
        return np.dot(sample_score, sample_weight)
    else:
        return sample_score.sum()


def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    score : float
        If ``normalize == True``, return the fraction of correctly classified
        samples (float), else return the number of correctly classified
        samples (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    See also
    --------
    jaccard_similarity_score, hamming_loss, zero_one_loss

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_similarity_score`` function.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import accuracy_score
    >>> y_pred = [0, 2, 1, 3]
    >>> y_true = [0, 1, 2, 3]
    >>> accuracy_score(y_true, y_pred)
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False)
    2

    In the multilabel case with binary label indicators:

    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
    0.5
    """
    # Compute accuracy for each possible representation
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if y_type.startswith('multilabel'):
        differing_labels = count_nonzero(y_true - y_pred, axis=1)
        score = differing_labels == 0
    else:
        score = y_true == y_pred

    return _weighted_sum(score, sample_weight, normalize)


def r2_score(y_true, y_pred, sample_weight=None):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0; lower values are worse.

    Parameters
    ----------
    y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Estimated target values.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    z : float
        The R^2 score.

    Notes
    -----
    This is not a symmetric function.

    Unlike most other scores, the R^2 score may be negative (it need not
    actually be the square of a quantity R).

    References
    ----------
    .. [1] `Wikipedia entry on the Coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

    Examples
    --------
    >>> from sklearn.metrics import r2_score
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.948...
    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.938...
    """
    y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        weight = sample_weight[:, np.newaxis]
    else:
        weight = 1.

    numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)

    if denominator == 0.0:
        if numerator == 0.0:
            return 1.0
        else:
            # arbitrarily set to zero to avoid -inf scores; a constant
            # y_true is not interesting for scoring a regression anyway
            return 0.0

    return 1 - numerator / denominator


################################ base #########################################
class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy,
        which is a harsh metric since it requires that each label set
        be correctly predicted for each sample.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)


class RegressorMixin(object):
    """Mixin class for all regression estimators in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0; lower values are worse.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) w.r.t. y.
        """
        return r2_score(y, self.predict(X), sample_weight=sample_weight)
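The mixins assume nothing beyond a ``predict`` method, so any estimator inheriting one gets ``score`` for free. A sketch with a hypothetical constant-mean regressor, whose in-sample R^2 is 0.0 by construction:

class MeanRegressor(RegressorMixin):
    def fit(self, X, y):
        self.mean_ = np.mean(y)
        return self

    def predict(self, X):
        return np.full(len(X), self.mean_)

X, y = np.zeros((4, 1)), [1.0, 2.0, 3.0, 4.0]
print(MeanRegressor().fit(X, y).score(X, y))  # 0.0: the mean explains none of the variance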
# =============================================================================
# Types and constants
# =============================================================================

@@ -589,7 +59,7 @@ SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
# =============================================================================


-class BaseDecisionTree:
+class BaseDecisionTree(BaseEstimator):
    """Base class for decision trees.

    Warning: This class should not be used directly.

@@ -709,7 +179,7 @@ class BaseDecisionTree:
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

-        if isinstance(self.max_features, str if PY3 else basestring):
+        if isinstance(self.max_features, string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
BIN  python/isaac/external/tree.pyc (vendored)
Binary file not shown.