Python: added standalone decision tree from sklearn

2015-08-12 21:59:59 -07:00
parent 71224a1507
commit cc2d3416be
9 changed files with 42861 additions and 0 deletions
--- a/python/isaac/external/init.py
+++ b/python/isaac/external/init.py
@@ -0,0 +1,12 @@
 """
 This package is a standalone copy of the :mod:`sklearn.tree` module, which
 includes decision tree-based models for classification and regression.
 """
 from .tree import DecisionTreeClassifier
 from .tree import DecisionTreeRegressor
 from .tree import ExtraTreeClassifier
 from .tree import ExtraTreeRegressor
 __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]
--- a/python/isaac/external/init.pyc
+++ b/python/isaac/external/init.pyc
--- a/python/isaac/external/_tree.c
+++ b/python/isaac/external/_tree.c
--- a/python/isaac/external/_tree.pxd
+++ b/python/isaac/external/_tree.pxd
@@ -0,0 +1,274 @@
 # Authors: Gilles Louppe <g.louppe@gmail.com>
 #          Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #          Brian Holt <bdholt1@gmail.com>
 #          Joel Nothman <joel.nothman@gmail.com>
 #          Arnaud Joly <arnaud.v.joly@gmail.com>
 #
 # Licence: BSD 3 clause
 # See _tree.pyx for details.
 import numpy as np
 cimport numpy as np
 # Fixed-width numeric aliases used by the struct and class declarations below.
 ctypedef np.npy_float32 DTYPE_t          # Type of X
 ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t              # Type for indices and counters
 ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
 # =============================================================================
 # Stack data structure
 # =============================================================================
 # A record on the stack for depth-first tree growing
 cdef struct StackRecord:
    # The fields correspond one-to-one to the arguments of Stack.push.
    # NOTE(review): the per-field meanings below are inferred from the names
    # and from the Splitter comments in this file -- confirm in _tree.pyx.
    SIZE_t start                 # presumably the node covers samples[start:end]
    SIZE_t end
    SIZE_t depth                 # presumably depth of the node in the tree
    SIZE_t parent                # presumably id of the parent node
    bint is_left                 # presumably whether the node is a left child
    double impurity              # presumably impurity of the node
    SIZE_t n_constant_features   # presumably the count handed down by the parent
 cdef class Stack:
    # Declaration of a stack of StackRecord entries.
    # NOTE(review): the same class is also declared in _utils.pxd and
    # implemented in _utils.pyx -- confirm which declaration _tree.pyx relies on.
    cdef SIZE_t capacity         # presumably the number of records stack_ can hold -- confirm
    cdef SIZE_t top              # presumably the number of records currently stored -- confirm
    cdef StackRecord* stack_     # C array holding the records
    cdef bint is_empty(self) nogil
    # push and pop are nogil, return a plain int and declare no except clause,
    # so callers have to inspect the returned value themselves.
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil
 # =============================================================================
 # PriorityHeap data structure
 # =============================================================================
 # A record on the frontier for best-first tree growing
 cdef struct PriorityHeapRecord:
    # The fields correspond one-to-one to the arguments of PriorityHeap.push.
    # NOTE(review): the per-field meanings below are inferred from the names
    # and from the Criterion/SplitRecord comments in this file -- confirm in
    # _tree.pyx.
    SIZE_t node_id               # presumably id of the node in the Tree
    SIZE_t start                 # presumably the node covers samples[start:end]
    SIZE_t end
    SIZE_t pos                   # presumably the split position within [start, end]
    SIZE_t depth                 # presumably depth of the node in the tree
    bint is_leaf
    double impurity              # presumably impurity of the node itself
    double impurity_left         # presumably impurity of the left child
    double impurity_right        # presumably impurity of the right child
    double improvement           # presumably the impurity improvement of the split
 cdef class PriorityHeap:
    # Declaration of a heap of PriorityHeapRecord entries.
    # NOTE(review): the same class is also declared in _utils.pxd and
    # implemented in _utils.pyx -- confirm which declaration _tree.pyx relies on.
    cdef SIZE_t capacity             # presumably the number of records heap_ can hold -- confirm
    cdef SIZE_t heap_ptr             # presumably the number of records currently stored -- confirm
    cdef PriorityHeapRecord* heap_   # C array holding the records
    cdef bint is_empty(self) nogil
    # push and pop are nogil, return a plain int and declare no except clause,
    # so callers have to inspect the returned value themselves.
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil
 # =============================================================================
 # Criterion
 # =============================================================================
 cdef class Criterion:
    # The criterion computes the impurity of a node and the reduction of
    # impurity of a split on that node. It also computes the output statistics
    # such as the mean in regression and class probabilities in classification.
    # Internal structures
    cdef DOUBLE_t* y                     # Values of y
    cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
    cdef DOUBLE_t* sample_weight         # Sample weights
    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
    cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
    cdef SIZE_t end                      # End of the node range: the node holds samples[start:end]
    cdef SIZE_t n_outputs                # Number of outputs
    cdef SIZE_t n_node_samples           # Number of samples in the node (end-start)
    cdef double weighted_n_samples       # Weighted number of samples (in total)
    cdef double weighted_n_node_samples  # Weighted number of samples in the node
    cdef double weighted_n_left          # Weighted number of samples in the left node
    cdef double weighted_n_right         # Weighted number of samples in the right node
    # The criterion object is maintained such that left and right collected
    # statistics correspond to samples[start:pos] and samples[pos:end].
    # Methods
    # Only the signatures are declared here; see _tree.pyx for the bodies.
    # NOTE(review): the one-line descriptions below are inferred from the
    # method names and the class comment above -- confirm against _tree.pyx.
    # Presumably binds the criterion to the node samples[start:end].
    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil
    # Presumably moves pos back to its initial position.
    cdef void reset(self) nogil
    # Presumably moves the split position to new_pos and refreshes the
    # left/right statistics.
    cdef void update(self, SIZE_t new_pos) nogil
    cdef double node_impurity(self) nogil
    # The two child impurities are returned through the output pointers.
    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil
    # The node value is written into the caller-provided buffer dest.
    cdef void node_value(self, double* dest) nogil
    cdef double impurity_improvement(self, double impurity) nogil
 # =============================================================================
 # Splitter
 # =============================================================================
 cdef struct SplitRecord:
    # Data to track sample split
    # A pointer to this struct is the `split` argument of Splitter.node_split.
    SIZE_t feature         # Which feature to split on.
    SIZE_t pos             # Split samples array at the given position,
                           # i.e. count of samples below threshold for feature.
                           # pos is >= end if the node is a leaf.
    double threshold       # Threshold to split at.
    double improvement     # Impurity improvement given parent node.
    double impurity_left   # Impurity of the left split.
    double impurity_right  # Impurity of the right split.
 cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.
    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf
    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state
    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t* features                # Feature indices in X
    cdef SIZE_t* constant_features       # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t* feature_values         # temp. array holding feature values
    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node
    # The next three fields have the same names and types as the first three
    # arguments of Criterion.init.
    cdef DOUBLE_t* y                     # presumably values of y, as in Criterion -- confirm
    cdef SIZE_t y_stride                 # presumably stride in y, as in Criterion -- confirm
    cdef DOUBLE_t* sample_weight         # presumably sample weights, as in Criterion -- confirm
    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.
    # The 1-d  `features` array of size n_features contains the features
    # indices and allows fast sampling without replacement of features.
    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes.  The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons.
    # This allows optimization with depth-based tree building.
    # Methods
    # `except *` lets a Python exception raised inside init propagate to the
    # caller even though the return type is void.
    cdef void init(self, object X, np.ndarray y,
                   DOUBLE_t* sample_weight) except *
    # weighted_n_node_samples is an output pointer.
    cdef void node_reset(self, SIZE_t start, SIZE_t end,
                         double* weighted_n_node_samples) nogil
    # split and n_constant_features are passed by pointer.
    # NOTE(review): presumably both are written by the call -- confirm.
    cdef void node_split(self,
                         double impurity,   # Impurity of the node
                         SplitRecord* split,
                         SIZE_t* n_constant_features) nogil
    # These two have the same signatures as the Criterion methods of the same
    # name. NOTE(review): presumably they forward to self.criterion -- confirm.
    cdef void node_value(self, double* dest) nogil
    cdef double node_impurity(self) nogil
 # =============================================================================
 # Tree
 # =============================================================================
 cdef struct Node:
    # Base storage structure for the nodes in a Tree object
    # (Tree.nodes is declared as a pointer to an array of these records).
    SIZE_t left_child                    # id of the left child of the node
    SIZE_t right_child                   # id of the right child of the node
    SIZE_t feature                       # Feature used for splitting the node
    DOUBLE_t threshold                   # Threshold value at the node
    DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
    SIZE_t n_node_samples                # Number of samples at the node
    DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node
 cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.
    # Input/Output layout
    cdef public SIZE_t n_features        # Number of features in X
    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
    cdef public SIZE_t n_outputs         # Number of outputs in y
    cdef public SIZE_t max_n_classes     # max(n_classes)
    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t node_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
    cdef SIZE_t value_stride             # = n_outputs * max_n_classes
    # Methods
    # Only the signatures are declared here; see _tree.pyx for the bodies.
    # Note that the last parameter is named weighted_n_samples although the
    # matching Node field is weighted_n_node_samples.
    # NOTE(review): presumably returns the id of the new node -- confirm.
    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, double threshold, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_samples) nogil
    # _resize may raise (`except *`); _resize_c is the nogil variant and
    # returns an int instead. `=*` marks an optional argument whose default
    # value is given in the implementation.
    cdef void _resize(self, SIZE_t capacity) except *
    cdef int _resize_c(self, SIZE_t capacity=*) nogil
    cdef np.ndarray _get_value_ndarray(self)
    cdef np.ndarray _get_node_ndarray(self)
    # cpdef methods are callable from both Python and Cython code.
    cpdef np.ndarray predict(self, object X)
    cpdef np.ndarray apply(self, object X)
    # NOTE(review): presumably the dense / CSR-sparse back ends of apply --
    # confirm in _tree.pyx.
    cdef np.ndarray _apply_dense(self, object X)
    cdef np.ndarray _apply_sparse_csr(self, object X)
    cpdef compute_feature_importances(self, normalize=*)
 # =============================================================================
 # Tree builder
 # =============================================================================
 cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.
    cdef Splitter splitter          # Splitting algorithm
    cdef SIZE_t min_samples_split   # Minimum number of samples in an internal node
    cdef SIZE_t min_samples_leaf    # Minimum number of samples in a leaf
    cdef double min_weight_leaf     # Minimum weight in a leaf
    cdef SIZE_t max_depth           # Maximal tree depth
    # build is cpdef, so it is callable from both Python and Cython code.
    # `=*` marks sample_weight as optional; its default value is given in the
    # implementation.
    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=*)
    # NOTE(review): presumably validates/converts the inputs before building --
    # confirm in _tree.pyx.
    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
--- a/python/isaac/external/_tree.pyx
+++ b/python/isaac/external/_tree.pyx
--- a/python/isaac/external/_utils.pxd
+++ b/python/isaac/external/_utils.pxd
@@ -0,0 +1,68 @@
 # Authors: Gilles Louppe <g.louppe@gmail.com>
 #          Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #          Arnaud Joly <arnaud.v.joly@gmail.com>
 #
 # Licence: BSD 3 clause
 # See _utils.pyx for details.
 import numpy as np
 cimport numpy as np
 # NOTE(review): _tree.pxd declares the same alias; keep the two in sync.
 ctypedef np.npy_intp SIZE_t              # Type for indices and counters
 # =============================================================================
 # Stack data structure
 # =============================================================================
 # A record on the stack for depth-first tree growing
 cdef struct StackRecord:
    # Stack.push copies its arguments one-to-one into these fields.
    # NOTE(review): the per-field meanings below are inferred from the names
    # only -- confirm against the code that pushes the records.
    SIZE_t start                 # presumably the node covers samples[start:end]
    SIZE_t end
    SIZE_t depth                 # presumably depth of the node in the tree
    SIZE_t parent                # presumably id of the parent node
    bint is_left                 # presumably whether the node is a left child
    double impurity              # presumably impurity of the node
    SIZE_t n_constant_features   # presumably the number of constant features known so far
 cdef class Stack:
    # LIFO container of StackRecord entries; implemented in _utils.pyx.
    cdef SIZE_t capacity         # Number of records stack_ can hold before it is reallocated
    cdef SIZE_t top              # Number of records currently on the stack
    cdef StackRecord* stack_     # malloc'ed array of records, freed in __dealloc__
    cdef bint is_empty(self) nogil
    # Returns 0 if successful; -1 on out of memory error.
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    # Copies the top record into res[0]; returns 0 on success, -1 if empty.
    cdef int pop(self, StackRecord* res) nogil
 # =============================================================================
 # PriorityHeap data structure
 # =============================================================================
 # A record on the frontier for best-first tree growing
 cdef struct PriorityHeapRecord:
    # PriorityHeap.push copies its arguments one-to-one into these fields.
    # NOTE(review): apart from `improvement`, the per-field meanings below are
    # inferred from the names only -- confirm against the code that pushes
    # the records.
    SIZE_t node_id               # presumably id of the node in the tree
    SIZE_t start                 # presumably the node covers samples[start:end]
    SIZE_t end
    SIZE_t pos                   # presumably the split position within [start, end]
    SIZE_t depth                 # presumably depth of the node in the tree
    bint is_leaf
    double impurity              # presumably impurity of the node itself
    double impurity_left         # presumably impurity of the left child
    double impurity_right        # presumably impurity of the right child
    double improvement           # Heap key: heapify_up/heapify_down order records by it
 cdef class PriorityHeap:
    # Binary max-heap of PriorityHeapRecord entries keyed on `improvement`;
    # implemented in _utils.pyx.
    cdef SIZE_t capacity             # Number of records heap_ can hold before it is reallocated
    cdef SIZE_t heap_ptr             # Number of records currently in the heap
    cdef PriorityHeapRecord* heap_   # malloc'ed array of records, freed in __dealloc__
    cdef bint is_empty(self) nogil
    # Returns 0 if successful; -1 on out of memory error.
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    # Copies the record with the largest improvement into res[0]; returns 0 on
    # success, -1 if the heap is empty.
    cdef int pop(self, PriorityHeapRecord* res) nogil
--- a/python/isaac/external/_utils.pyx
+++ b/python/isaac/external/_utils.pyx
@@ -0,0 +1,230 @@
 # cython: cdivision=True
 # cython: boundscheck=False
 # cython: wraparound=False
 # Authors: Gilles Louppe <g.louppe@gmail.com>
 #          Peter Prettenhofer <peter.prettenhofer@gmail.com>
 #          Arnaud Joly <arnaud.v.joly@gmail.com>
 #
 # Licence: BSD 3 clause
 from libc.stdlib cimport free, malloc, realloc
 # =============================================================================
 # Stack data structure
 # =============================================================================
 cdef class Stack:
    """A LIFO data structure.
    Attributes
    ----------
    capacity : SIZE_t
        The number of elements ``self.stack_`` can hold; if more are added
        then ``self.stack_`` is reallocated.
    top : SIZE_t
        The number of elements currently on the stack.
    stack_ : StackRecord pointer
        The stack of records (upward in the stack corresponds to the right).
    """
    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.top = 0
        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
        if self.stack_ == NULL:
            raise MemoryError()
    def __dealloc__(self):
        free(self.stack_)
    cdef bint is_empty(self) nogil:
        """Return whether the stack currently holds no record."""
        return self.top <= 0
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil:
        """Push a new element onto the stack.
        Returns 0 if successful; -1 on out of memory error. On failure the
        stack is left unchanged (same buffer, same capacity, same content).
        """
        cdef SIZE_t top = self.top
        cdef SIZE_t new_capacity
        cdef StackRecord* stack = NULL
        # Resize if capacity not sufficient
        if top >= self.capacity:
            # Double the capacity, but always make room for slot ``top``:
            # doubling alone would never grow a stack created with capacity 0.
            new_capacity = 2 * self.capacity
            if new_capacity <= top:
                new_capacity = top + 1
            stack = <StackRecord*> realloc(self.stack_,
                                           new_capacity * sizeof(StackRecord))
            if stack == NULL:
                # no free; __dealloc__ handles that. ``self.capacity`` is
                # deliberately not updated: the old buffer is still in use.
                return -1
            self.stack_ = stack
            self.capacity = new_capacity
        stack = self.stack_
        stack[top].start = start
        stack[top].end = end
        stack[top].depth = depth
        stack[top].parent = parent
        stack[top].is_left = is_left
        stack[top].impurity = impurity
        stack[top].n_constant_features = n_constant_features
        # Increment stack pointer
        self.top = top + 1
        return 0
    cdef int pop(self, StackRecord* res) nogil:
        """Remove the top element from the stack and copy to ``res``.
        Returns 0 if pop was successful (and ``res`` is set); -1
        otherwise.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = self.stack_
        if top <= 0:
            return -1
        res[0] = stack[top - 1]
        self.top = top - 1
        return 0
 # =============================================================================
 # PriorityHeap data structure
 # =============================================================================
 cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
    """Sift the record at ``pos`` towards the root, swapping it with its
       parent for as long as the parent has a smaller improvement. """
    cdef SIZE_t child = pos
    cdef SIZE_t parent
    cdef PriorityHeapRecord tmp
    while child != 0:
        parent = (child - 1) / 2
        if not (heap[parent].improvement < heap[child].improvement):
            # Parent is already at least as good: invariant restored.
            break
        tmp = heap[parent]
        heap[parent] = heap[child]
        heap[child] = tmp
        child = parent
 cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
                        SIZE_t heap_length) nogil:
    """Sift the record at ``pos`` towards the leaves, swapping it with its
       larger child for as long as a child has a larger improvement. Only the
       first ``heap_length`` records of ``heap`` are considered. """
    cdef SIZE_t current = pos
    cdef SIZE_t left
    cdef SIZE_t right
    cdef SIZE_t best
    cdef PriorityHeapRecord tmp
    while True:
        left = 2 * (current + 1) - 1
        right = 2 * (current + 1)
        best = current
        # Pick the largest of the current record and its (existing) children.
        if (left < heap_length and
                heap[left].improvement > heap[best].improvement):
            best = left
        if (right < heap_length and
                heap[right].improvement > heap[best].improvement):
            best = right
        if best == current:
            # Neither child is larger: invariant restored.
            break
        tmp = heap[current]
        heap[current] = heap[best]
        heap[best] = tmp
        current = best
 cdef class PriorityHeap:
    """A priority queue implemented as a binary heap.
    The heap invariant is that the impurity improvement of the parent record
    is never smaller than the impurity improvement of its children.
    Attributes
    ----------
    capacity : SIZE_t
        The capacity of the heap, i.e. the number of records ``heap_`` can
        hold before it is reallocated.
    heap_ptr : SIZE_t
        The water mark of the heap (the number of records stored); the heap
        grows from left to right in the array ``heap_``. The following
        invariant holds ``heap_ptr <= capacity``.
    heap_ : PriorityHeapRecord*
        The array of heap records. The maximum element is on the left;
        the heap grows from left to right
    """
    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.heap_ptr = 0
        self.heap_ = <PriorityHeapRecord*> malloc(capacity * sizeof(PriorityHeapRecord))
        if self.heap_ == NULL:
            raise MemoryError()
    def __dealloc__(self):
        free(self.heap_)
    cdef bint is_empty(self) nogil:
        """Return whether the heap currently holds no record."""
        return self.heap_ptr <= 0
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil:
        """Push record on the priority heap.
        Returns 0 if successful; -1 on out of memory error. On failure the
        heap is left unchanged (same buffer, same capacity, same content).
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef SIZE_t new_capacity
        cdef PriorityHeapRecord* heap = NULL
        # Resize if capacity not sufficient
        if heap_ptr >= self.capacity:
            # Double the capacity, but always make room for slot ``heap_ptr``:
            # doubling alone would never grow a heap created with capacity 0.
            new_capacity = 2 * self.capacity
            if new_capacity <= heap_ptr:
                new_capacity = heap_ptr + 1
            heap = <PriorityHeapRecord*> realloc(self.heap_,
                                                 new_capacity *
                                                 sizeof(PriorityHeapRecord))
            if heap == NULL:
                # no free; __dealloc__ handles that. ``self.capacity`` is
                # deliberately not updated: the old buffer is still in use.
                return -1
            self.heap_ = heap
            self.capacity = new_capacity
        # Put element as last element of heap
        heap = self.heap_
        heap[heap_ptr].node_id = node_id
        heap[heap_ptr].start = start
        heap[heap_ptr].end = end
        heap[heap_ptr].pos = pos
        heap[heap_ptr].depth = depth
        heap[heap_ptr].is_leaf = is_leaf
        heap[heap_ptr].impurity = impurity
        heap[heap_ptr].impurity_left = impurity_left
        heap[heap_ptr].impurity_right = impurity_right
        heap[heap_ptr].improvement = improvement
        # Heapify up
        heapify_up(heap, heap_ptr)
        # Increase element count
        self.heap_ptr = heap_ptr + 1
        return 0
    cdef int pop(self, PriorityHeapRecord* res) nogil:
        """Remove max element from the heap and copy it to ``res``.
        Returns 0 if pop was successful (and ``res`` is set); -1 if the heap
        is empty.
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = self.heap_
        if heap_ptr <= 0:
            return -1
        # Take first element
        res[0] = heap[0]
        # Put last element to the front
        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]
        # Restore heap invariant
        if heap_ptr > 1:
            heapify_down(heap, 0, heap_ptr - 1)
        self.heap_ptr = heap_ptr - 1
        return 0
--- a/python/isaac/external/tree.py
+++ b/python/isaac/external/tree.py
--- a/python/isaac/external/tree.pyc
+++ b/python/isaac/external/tree.pyc