Python: added standalone decision tree from sklearn

2015-08-12 21:59:59 -07:00
parent 71224a1507
commit cc2d3416be
9 changed files with 42861 additions and 0 deletions
--- a/python/isaac/external/__init__.py
+++ b/python/isaac/external/__init__.py
@@ -0,0 +1,12 @@
+"""
+Standalone copy of scikit-learn's :mod:`sklearn.tree` package, which includes
+decision tree-based models for classification and regression.
+
+The estimator classes are re-exported from the sibling ``tree`` module.
+"""
+
+from .tree import DecisionTreeClassifier
+from .tree import DecisionTreeRegressor
+from .tree import ExtraTreeClassifier
+from .tree import ExtraTreeRegressor
+
+# Public names re-exported by this package.
+__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
+           "ExtraTreeClassifier", "ExtraTreeRegressor"]
--- a/python/isaac/external/__init__.pyc
+++ b/python/isaac/external/__init__.pyc
--- a/python/isaac/external/_tree.c
+++ b/python/isaac/external/_tree.c
--- a/python/isaac/external/_tree.pxd
+++ b/python/isaac/external/_tree.pxd
@@ -0,0 +1,274 @@
+# Authors: Gilles Louppe <g.louppe@gmail.com>
+#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#          Brian Holt <bdholt1@gmail.com>
+#          Joel Nothman <joel.nothman@gmail.com>
+#          Arnaud Joly <arnaud.v.joly@gmail.com>
+#
+# Licence: BSD 3 clause
+
+# See _tree.pyx for details.
+
+import numpy as np
+cimport numpy as np
+
+# Fixed-width NumPy C type aliases for the declarations below.
+ctypedef np.npy_float32 DTYPE_t          # Type of X
+ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
+ctypedef np.npy_intp SIZE_t              # Type for indices and counters
+ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
+ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
+
+
+# =============================================================================
+# Stack data structure
+# =============================================================================
+
+# A record on the stack for depth-first tree growing
+cdef struct StackRecord:
+    # NOTE(review): field meanings are inferred from how the same names are
+    # used by Splitter and Tree in this file -- confirm against _tree.pyx.
+    SIZE_t start                 # presumably first index of the node's range samples[start:end]
+    SIZE_t end                   # presumably end index (exclusive) of that range
+    SIZE_t depth                 # presumably depth of the node in the tree
+    SIZE_t parent                # presumably id of the parent node
+    bint is_left                 # presumably whether the node is its parent's left child
+    double impurity              # presumably impurity of the node
+    SIZE_t n_constant_features   # presumably the count handed from parent to child (see Splitter)
+
+cdef class Stack:
+    # Stack of StackRecord entries; only the C-level attributes and method
+    # signatures are declared here.
+    # NOTE(review): _utils.pxd declares an identical Stack and _utils.pyx
+    # implements it; the notes below are taken from that implementation --
+    # confirm that _tree.pyx agrees.
+    cdef SIZE_t capacity         # Number of records stack_ can hold before a resize
+    cdef SIZE_t top              # Number of records currently on the stack
+    cdef StackRecord* stack_     # Buffer holding the records
+
+    # All three methods are declared ``nogil``.
+    cdef bint is_empty(self) nogil
+    # push: in _utils.pyx, returns 0 on success and -1 on out of memory.
+    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
+                  bint is_left, double impurity,
+                  SIZE_t n_constant_features) nogil
+    # pop: in _utils.pyx, copies the top record to ``res`` and returns 0,
+    # or returns -1 when the stack is empty.
+    cdef int pop(self, StackRecord* res) nogil
+
+
+# =============================================================================
+# PriorityHeap data structure
+# =============================================================================
+
+# A record on the frontier for best-first tree growing
+cdef struct PriorityHeapRecord:
+    # NOTE(review): field meanings are inferred from the names and from the
+    # SplitRecord / Splitter comments in this file -- confirm against
+    # _tree.pyx.
+    SIZE_t node_id               # presumably id of the node in the Tree
+    SIZE_t start                 # presumably first index of the node's range samples[start:end]
+    SIZE_t end                   # presumably end index (exclusive) of that range
+    SIZE_t pos                   # presumably the split position inside [start, end)
+    SIZE_t depth                 # presumably depth of the node in the tree
+    bint is_leaf                 # presumably whether the node is a leaf
+    double impurity              # presumably impurity of the node
+    double impurity_left         # presumably impurity of the left split
+    double impurity_right        # presumably impurity of the right split
+    double improvement           # presumably the impurity improvement of the split
+
+cdef class PriorityHeap:
+    # Heap of PriorityHeapRecord entries; only the C-level attributes and
+    # method signatures are declared here.
+    # NOTE(review): _utils.pxd declares an identical PriorityHeap and
+    # _utils.pyx implements it; the notes below are taken from that
+    # implementation -- confirm that _tree.pyx agrees.
+    cdef SIZE_t capacity                 # Number of records heap_ can hold before a resize
+    cdef SIZE_t heap_ptr                 # Number of records currently in the heap
+    cdef PriorityHeapRecord* heap_       # Buffer holding the records
+
+    # All three methods are declared ``nogil``.
+    cdef bint is_empty(self) nogil
+    # push: in _utils.pyx, returns 0 on success and -1 on out of memory.
+    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
+                  SIZE_t depth, bint is_leaf, double improvement,
+                  double impurity, double impurity_left,
+                  double impurity_right) nogil
+    # pop: in _utils.pyx, copies the record with the largest ``improvement``
+    # to ``res`` and returns 0, or returns -1 when the heap is empty.
+    cdef int pop(self, PriorityHeapRecord* res) nogil
+
+
+# =============================================================================
+# Criterion
+# =============================================================================
+
+cdef class Criterion:
+    # The criterion computes the impurity of a node and the reduction of
+    # impurity of a split on that node. It also computes the output statistics
+    # such as the mean in regression and class probabilities in classification.
+    #
+    # Only the C-level attributes and method signatures are declared here;
+    # the header of this file points to _tree.pyx for the details.
+
+    # Internal structures
+    cdef DOUBLE_t* y                     # Values of y
+    cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
+    cdef DOUBLE_t* sample_weight         # Sample weights
+
+    cdef SIZE_t* samples                 # Sample indices in X, y
+    cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
+    cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
+    cdef SIZE_t end                      # End (exclusive) of the node's range in samples
+
+    cdef SIZE_t n_outputs                # Number of outputs
+    cdef SIZE_t n_node_samples           # Number of samples in the node (end-start)
+    cdef double weighted_n_samples       # Weighted number of samples (in total)
+    cdef double weighted_n_node_samples  # Weighted number of samples in the node
+    cdef double weighted_n_left          # Weighted number of samples in the left node
+    cdef double weighted_n_right         # Weighted number of samples in the right node
+
+    # The criterion object is maintained such that left and right collected
+    # statistics correspond to samples[start:pos] and samples[pos:end].
+
+    # Methods
+    # All methods are declared ``nogil``.
+    # NOTE(review): the per-method notes below are inferred from names and
+    # signatures only -- confirm against _tree.pyx.
+
+    # init: receives y, sample_weight and the node range samples[start:end].
+    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
+                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
+                   SIZE_t end) nogil
+    # reset / update: presumably move the split position ``pos`` (update is
+    # given the new position).
+    cdef void reset(self) nogil
+    cdef void update(self, SIZE_t new_pos) nogil
+    # node_impurity: presumably the impurity of samples[start:end].
+    cdef double node_impurity(self) nogil
+    # children_impurity: results are returned through the two output pointers.
+    cdef void children_impurity(self, double* impurity_left,
+                                double* impurity_right) nogil
+    # node_value: the result is written to ``dest``.
+    cdef void node_value(self, double* dest) nogil
+    # impurity_improvement: takes the node impurity and presumably returns
+    # the improvement achieved by the current split.
+    cdef double impurity_improvement(self, double impurity) nogil
+
+
+# =============================================================================
+# Splitter
+# =============================================================================
+
+cdef struct SplitRecord:
+    # Data to track sample split
+    # Splitter.node_split receives a pointer to one of these records.
+    SIZE_t feature         # Which feature to split on.
+    SIZE_t pos             # Split samples array at the given position,
+                           # i.e. count of samples below threshold for feature.
+                           # pos is >= end if the node is a leaf.
+    double threshold       # Threshold to split at.
+    double improvement     # Impurity improvement given parent node.
+    double impurity_left   # Impurity of the left split.
+    double impurity_right  # Impurity of the right split.
+
+
+cdef class Splitter:
+    # The splitter searches in the input space for a feature and a threshold
+    # to split the samples samples[start:end].
+    #
+    # The impurity computations are delegated to a criterion object.
+
+    # Internal structures
+    cdef public Criterion criterion      # Impurity criterion
+    cdef public SIZE_t max_features      # Number of features to test
+    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
+    cdef public double min_weight_leaf   # Minimum weight in a leaf
+
+    cdef object random_state             # Random state
+    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state
+
+    cdef SIZE_t* samples                 # Sample indices in X, y
+    cdef SIZE_t n_samples                # X.shape[0]
+    cdef double weighted_n_samples       # Weighted number of samples
+    cdef SIZE_t* features                # Feature indices in X
+    cdef SIZE_t* constant_features       # Constant features indices
+    cdef SIZE_t n_features               # X.shape[1]
+    cdef DTYPE_t* feature_values         # temp. array holding feature values
+
+    cdef SIZE_t start                    # Start position for the current node
+    cdef SIZE_t end                      # End position for the current node
+
+    # Same names and types as the corresponding Criterion attributes.
+    cdef DOUBLE_t* y                     # Values of y
+    cdef SIZE_t y_stride                 # Stride in y
+    cdef DOUBLE_t* sample_weight         # Sample weights
+
+    # The samples vector `samples` is maintained by the Splitter object such
+    # that the samples contained in a node are contiguous. With this setting,
+    # `node_split` reorganizes the node samples `samples[start:end]` in two
+    # subsets `samples[start:pos]` and `samples[pos:end]`.
+
+    # The 1-d  `features` array of size n_features contains the features
+    # indices and allows fast sampling without replacement of features.
+
+    # The 1-d `constant_features` array of size n_features holds in
+    # `constant_features[:n_constant_features]` the feature ids with
+    # constant values for all the samples that reached a specific node.
+    # The value `n_constant_features` is given by the parent node to its
+    # child nodes.  The content of the range `[n_constant_features:]` is left
+    # undefined, but preallocated for performance reasons.
+    # This allows optimization with depth-based tree building.
+
+    # Methods
+    # NOTE(review): the per-method notes are inferred from names and
+    # signatures only -- confirm against _tree.pyx.
+
+    # init: ``except *`` lets a Python exception raised inside this void
+    # function propagate to the caller.
+    cdef void init(self, object X, np.ndarray y,
+                   DOUBLE_t* sample_weight) except *
+
+    # node_reset: selects the node samples[start:end]; the weighted sample
+    # count is returned through the output pointer.
+    cdef void node_reset(self, SIZE_t start, SIZE_t end,
+                         double* weighted_n_node_samples) nogil
+
+    # node_split: the chosen split is returned through ``split``;
+    # ``n_constant_features`` is passed by pointer as well.
+    cdef void node_split(self,
+                         double impurity,   # Impurity of the node
+                         SplitRecord* split,
+                         SIZE_t* n_constant_features) nogil
+
+    # node_value: the result is written to ``dest``.
+    cdef void node_value(self, double* dest) nogil
+
+    # node_impurity: presumably the impurity of the current node.
+    cdef double node_impurity(self) nogil
+
+
+# =============================================================================
+# Tree
+# =============================================================================
+
+cdef struct Node:
+    # Base storage structure for the nodes in a Tree object
+    # (Tree keeps them in its ``Node* nodes`` array).
+
+    SIZE_t left_child                    # id of the left child of the node
+    SIZE_t right_child                   # id of the right child of the node
+    SIZE_t feature                       # Feature used for splitting the node
+    DOUBLE_t threshold                   # Threshold value at the node
+    DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
+    SIZE_t n_node_samples                # Number of samples at the node
+    DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node
+
+
+cdef class Tree:
+    # The Tree object is a binary tree structure constructed by the
+    # TreeBuilder. The tree structure is used for predictions and
+    # feature importances.
+
+    # Input/Output layout
+    cdef public SIZE_t n_features        # Number of features in X
+    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
+    cdef public SIZE_t n_outputs         # Number of outputs in y
+    cdef public SIZE_t max_n_classes     # max(n_classes)
+
+    # Inner structures: values are stored separately from node structure,
+    # since size is determined at runtime.
+    cdef public SIZE_t max_depth         # Max depth of the tree
+    cdef public SIZE_t node_count        # Counter for node IDs
+    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
+    cdef Node* nodes                     # Array of nodes
+    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
+    cdef SIZE_t value_stride             # = n_outputs * max_n_classes
+
+    # Methods
+    # NOTE(review): the per-method notes are inferred from names and
+    # signatures only; the implementations are in _tree.pyx -- confirm there.
+    # ``=*`` marks an optional argument whose default value is given in the
+    # implementation.
+
+    # _add_node: presumably appends a node below ``parent`` and returns its id.
+    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
+                          SIZE_t feature, double threshold, double impurity,
+                          SIZE_t n_node_samples,
+                          double weighted_n_samples) nogil
+    # _resize reports failure via a Python exception (``except *``);
+    # _resize_c is the ``nogil`` variant and returns an int status instead.
+    cdef void _resize(self, SIZE_t capacity) except *
+    cdef int _resize_c(self, SIZE_t capacity=*) nogil
+
+    # Presumably expose ``value`` and ``nodes`` as NumPy arrays.
+    cdef np.ndarray _get_value_ndarray(self)
+    cdef np.ndarray _get_node_ndarray(self)
+
+    # cpdef methods are callable from both Python and Cython code.
+    cpdef np.ndarray predict(self, object X)
+    cpdef np.ndarray apply(self, object X)
+    # Presumably the dense / sparse-CSR code paths behind ``apply``.
+    cdef np.ndarray _apply_dense(self, object X)
+    cdef np.ndarray _apply_sparse_csr(self, object X)
+
+    cpdef compute_feature_importances(self, normalize=*)
+
+
+# =============================================================================
+# Tree builder
+# =============================================================================
+
+cdef class TreeBuilder:
+    # The TreeBuilder recursively builds a Tree object from training samples,
+    # using a Splitter object for splitting internal nodes and assigning
+    # values to leaves.
+    #
+    # This class controls the various stopping criteria and the node splitting
+    # evaluation order, e.g. depth-first or best-first.
+
+    cdef Splitter splitter          # Splitting algorithm
+
+    cdef SIZE_t min_samples_split   # Minimum number of samples in an internal node
+    cdef SIZE_t min_samples_leaf    # Minimum number of samples in a leaf
+    cdef double min_weight_leaf     # Minimum weight in a leaf
+    cdef SIZE_t max_depth           # Maximal tree depth
+
+    # build: callable from Python and Cython (cpdef); ``=*`` marks
+    # ``sample_weight`` as optional, with its default given in the
+    # implementation.
+    cpdef build(self, Tree tree, object X, np.ndarray y,
+                np.ndarray sample_weight=*)
+    # _check_input: presumably validates/converts the build() inputs --
+    # TODO confirm against _tree.pyx.
+    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
--- a/python/isaac/external/_tree.pyx
+++ b/python/isaac/external/_tree.pyx
--- a/python/isaac/external/_utils.pxd
+++ b/python/isaac/external/_utils.pxd
@@ -0,0 +1,68 @@
+# Authors: Gilles Louppe <g.louppe@gmail.com>
+#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#          Arnaud Joly <arnaud.v.joly@gmail.com>
+#
+# Licence: BSD 3 clause
+
+# See _utils.pyx for details.
+
+import numpy as np
+cimport numpy as np
+
+# npy_intp is NumPy's signed, pointer-sized integer type.
+ctypedef np.npy_intp SIZE_t              # Type for indices and counters
+
+
+# =============================================================================
+# Stack data structure
+# =============================================================================
+
+# A record on the stack for depth-first tree growing
+cdef struct StackRecord:
+    # Stack.push in _utils.pyx fills one record from its arguments, field by
+    # field.
+    # NOTE(review): the field meanings below are inferred from the names
+    # alone -- confirm against the tree-building code that uses the stack.
+    SIZE_t start                 # presumably first index of the node's sample range
+    SIZE_t end                   # presumably end index of that range
+    SIZE_t depth                 # presumably depth of the node in the tree
+    SIZE_t parent                # presumably id of the parent node
+    bint is_left                 # presumably whether the node is its parent's left child
+    double impurity              # presumably impurity of the node
+    SIZE_t n_constant_features   # presumably number of features known to be constant
+
+cdef class Stack:
+    # LIFO stack of StackRecord entries; implemented in _utils.pyx.
+    cdef SIZE_t capacity         # Number of records stack_ can hold before a resize
+    cdef SIZE_t top              # Number of records currently on the stack
+    cdef StackRecord* stack_     # Buffer holding the records
+
+    # All three methods are declared ``nogil``.
+    # is_empty: true when ``top <= 0``.
+    cdef bint is_empty(self) nogil
+    # push: returns 0 on success and -1 on out of memory.
+    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
+                  bint is_left, double impurity,
+                  SIZE_t n_constant_features) nogil
+    # pop: copies the top record to ``res`` and returns 0, or returns -1
+    # when the stack is empty.
+    cdef int pop(self, StackRecord* res) nogil
+
+
+# =============================================================================
+# PriorityHeap data structure
+# =============================================================================
+
+# A record on the frontier for best-first tree growing
+cdef struct PriorityHeapRecord:
+    # PriorityHeap.push in _utils.pyx fills one record from its arguments;
+    # the heap is ordered on ``improvement`` only.
+    # NOTE(review): the meanings of the other fields are inferred from the
+    # names alone -- confirm against the tree-building code that uses the heap.
+    SIZE_t node_id               # presumably id of the node in the tree
+    SIZE_t start                 # presumably first index of the node's sample range
+    SIZE_t end                   # presumably end index of that range
+    SIZE_t pos                   # presumably the split position inside that range
+    SIZE_t depth                 # presumably depth of the node in the tree
+    bint is_leaf                 # presumably whether the node is a leaf
+    double impurity              # presumably impurity of the node
+    double impurity_left         # presumably impurity of the left split
+    double impurity_right        # presumably impurity of the right split
+    double improvement           # Heap key: larger values are popped first
+
+cdef class PriorityHeap:
+    # Binary max-heap of PriorityHeapRecord entries keyed on ``improvement``;
+    # implemented in _utils.pyx.
+    cdef SIZE_t capacity                 # Number of records heap_ can hold before a resize
+    cdef SIZE_t heap_ptr                 # Number of records currently in the heap
+    cdef PriorityHeapRecord* heap_       # Buffer holding the records; maximum is at index 0
+
+    # All three methods are declared ``nogil``.
+    # is_empty: true when ``heap_ptr <= 0``.
+    cdef bint is_empty(self) nogil
+    # push: returns 0 on success and -1 on out of memory.
+    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
+                  SIZE_t depth, bint is_leaf, double improvement,
+                  double impurity, double impurity_left,
+                  double impurity_right) nogil
+    # pop: copies the record with the largest ``improvement`` to ``res`` and
+    # returns 0, or returns -1 when the heap is empty.
+    cdef int pop(self, PriorityHeapRecord* res) nogil
--- a/python/isaac/external/_utils.pyx
+++ b/python/isaac/external/_utils.pyx
@@ -0,0 +1,230 @@
+# cython: cdivision=True
+# cython: boundscheck=False
+# cython: wraparound=False
+
+# Authors: Gilles Louppe <g.louppe@gmail.com>
+#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#          Arnaud Joly <arnaud.v.joly@gmail.com>
+#
+# Licence: BSD 3 clause
+
+from libc.stdlib cimport free, malloc, realloc
+
+# =============================================================================
+# Stack data structure
+# =============================================================================
+
+cdef class Stack:
+    """A LIFO data structure.
+
+    Attributes
+    ----------
+    capacity : SIZE_t
+        The number of elements the stack can hold; if more are added then
+        ``self.stack_`` needs to be resized.
+
+    top : SIZE_t
+        The number of elements currently on the stack.
+
+    stack_ : StackRecord pointer
+        The stack of records (upward in the stack corresponds to the right).
+    """
+
+    def __cinit__(self, SIZE_t capacity):
+        self.capacity = capacity
+        self.top = 0
+        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
+        if self.stack_ == NULL:
+            raise MemoryError()
+
+    def __dealloc__(self):
+        free(self.stack_)
+
+    cdef bint is_empty(self) nogil:
+        """Return true when there is no record on the stack."""
+        return self.top <= 0
+
+    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
+                  bint is_left, double impurity,
+                  SIZE_t n_constant_features) nogil:
+        """Push a new element onto the stack.
+
+        The buffer is grown (doubled) when it is full.
+
+        Returns 0 if successful; -1 on out of memory error. On failure the
+        stack is left unchanged and remains usable.
+        """
+        cdef SIZE_t top = self.top
+        cdef SIZE_t new_capacity
+        cdef StackRecord* stack = NULL
+
+        # Resize if capacity not sufficient
+        if top >= self.capacity:
+            # Doubling zero stays zero, so an empty buffer grows to one slot.
+            if self.capacity > 0:
+                new_capacity = 2 * self.capacity
+            else:
+                new_capacity = 1
+            stack = <StackRecord*> realloc(self.stack_,
+                                           new_capacity * sizeof(StackRecord))
+            if stack == NULL:
+                # no free; __dealloc__ handles that. ``capacity`` is only
+                # updated after a successful realloc so that it keeps
+                # describing the buffer that is actually allocated.
+                return -1
+            self.stack_ = stack
+            self.capacity = new_capacity
+
+        stack = self.stack_
+        stack[top].start = start
+        stack[top].end = end
+        stack[top].depth = depth
+        stack[top].parent = parent
+        stack[top].is_left = is_left
+        stack[top].impurity = impurity
+        stack[top].n_constant_features = n_constant_features
+
+        # Increment stack pointer
+        self.top = top + 1
+        return 0
+
+    cdef int pop(self, StackRecord* res) nogil:
+        """Remove the top element from the stack and copy to ``res``.
+
+        Returns 0 if pop was successful (and ``res`` is set); -1
+        otherwise.
+        """
+        cdef SIZE_t top = self.top
+        cdef StackRecord* stack = self.stack_
+
+        if top <= 0:
+            return -1
+
+        res[0] = stack[top - 1]
+        self.top = top - 1
+
+        return 0
+
+
+# =============================================================================
+# PriorityHeap data structure
+# =============================================================================
+
+cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
+    """Sift the record at ``pos`` towards the root.
+
+    The record is swapped with its parent for as long as the parent's
+    ``improvement`` is smaller, restoring the heap invariant on the path
+    from ``pos`` up to the root.
+    """
+    cdef SIZE_t child_pos = pos
+    cdef SIZE_t parent_pos
+
+    while child_pos != 0:
+        parent_pos = (child_pos - 1) / 2
+
+        # Stop as soon as the parent is not smaller than the child
+        if not (heap[parent_pos].improvement < heap[child_pos].improvement):
+            break
+
+        heap[parent_pos], heap[child_pos] = heap[child_pos], heap[parent_pos]
+        child_pos = parent_pos
+
+
+cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
+                       SIZE_t heap_length) nogil:
+    """Sift the record at ``pos`` towards the leaves.
+
+    The record is swapped with its larger child for as long as a child
+    inside ``heap[:heap_length]`` has a larger ``improvement``.
+    """
+    cdef SIZE_t current = pos
+    cdef SIZE_t left_pos
+    cdef SIZE_t right_pos
+    cdef SIZE_t largest
+
+    while True:
+        right_pos = 2 * (current + 1)
+        left_pos = 2 * (current + 1) - 1
+        largest = current
+
+        if left_pos < heap_length:
+            if heap[left_pos].improvement > heap[largest].improvement:
+                largest = left_pos
+
+        if right_pos < heap_length:
+            if heap[right_pos].improvement > heap[largest].improvement:
+                largest = right_pos
+
+        # Neither child is larger: the invariant holds from here downwards
+        if largest == current:
+            break
+
+        heap[current], heap[largest] = heap[largest], heap[current]
+        current = largest
+
+
+cdef class PriorityHeap:
+    """A priority queue implemented as a binary heap.
+
+    The heap invariant is that the impurity improvement of the parent record
+    is at least as large as the impurity improvement of its children.
+
+    Attributes
+    ----------
+    capacity : SIZE_t
+        The capacity of the heap
+
+    heap_ptr : SIZE_t
+        The water mark of the heap; the heap grows from left to right in the
+        array ``heap_``. The following invariant holds
+        ``heap_ptr <= capacity``.
+
+    heap_ : PriorityHeapRecord*
+        The array of heap records. The maximum element is on the left;
+        the heap grows from left to right
+    """
+
+    def __cinit__(self, SIZE_t capacity):
+        self.capacity = capacity
+        self.heap_ptr = 0
+        self.heap_ = <PriorityHeapRecord*> malloc(capacity * sizeof(PriorityHeapRecord))
+        if self.heap_ == NULL:
+            raise MemoryError()
+
+    def __dealloc__(self):
+        free(self.heap_)
+
+    cdef bint is_empty(self) nogil:
+        """Return true when there is no record in the heap."""
+        return self.heap_ptr <= 0
+
+    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
+                  SIZE_t depth, bint is_leaf, double improvement,
+                  double impurity, double impurity_left,
+                  double impurity_right) nogil:
+        """Push record on the priority heap.
+
+        The buffer is grown (doubled) when it is full.
+
+        Returns 0 if successful; -1 on out of memory error. On failure the
+        heap is left unchanged and remains usable.
+        """
+        cdef SIZE_t heap_ptr = self.heap_ptr
+        cdef SIZE_t new_capacity
+        cdef PriorityHeapRecord* heap = NULL
+
+        # Resize if capacity not sufficient
+        if heap_ptr >= self.capacity:
+            # Doubling zero stays zero, so an empty buffer grows to one slot.
+            if self.capacity > 0:
+                new_capacity = 2 * self.capacity
+            else:
+                new_capacity = 1
+            heap = <PriorityHeapRecord*> realloc(self.heap_,
+                                                 new_capacity *
+                                                 sizeof(PriorityHeapRecord))
+            if heap == NULL:
+                # no free; __dealloc__ handles that. ``capacity`` is only
+                # updated after a successful realloc so that it keeps
+                # describing the buffer that is actually allocated.
+                return -1
+            self.heap_ = heap
+            self.capacity = new_capacity
+
+        # Put element as last element of heap
+        heap = self.heap_
+        heap[heap_ptr].node_id = node_id
+        heap[heap_ptr].start = start
+        heap[heap_ptr].end = end
+        heap[heap_ptr].pos = pos
+        heap[heap_ptr].depth = depth
+        heap[heap_ptr].is_leaf = is_leaf
+        heap[heap_ptr].impurity = impurity
+        heap[heap_ptr].impurity_left = impurity_left
+        heap[heap_ptr].impurity_right = impurity_right
+        heap[heap_ptr].improvement = improvement
+
+        # Heapify up
+        heapify_up(heap, heap_ptr)
+
+        # Increase element count
+        self.heap_ptr = heap_ptr + 1
+        return 0
+
+    cdef int pop(self, PriorityHeapRecord* res) nogil:
+        """Remove max element from the heap and copy it to ``res``.
+
+        Returns 0 if pop was successful (and ``res`` is set); -1 if the
+        heap is empty.
+        """
+        cdef SIZE_t heap_ptr = self.heap_ptr
+        cdef PriorityHeapRecord* heap = self.heap_
+
+        if heap_ptr <= 0:
+            return -1
+
+        # Take first element
+        res[0] = heap[0]
+
+        # Put last element to the front
+        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]
+
+        # Restore heap invariant
+        if heap_ptr > 1:
+            heapify_down(heap, 0, heap_ptr - 1)
+
+        self.heap_ptr = heap_ptr - 1
+
+        return 0
--- a/python/isaac/external/tree.py
+++ b/python/isaac/external/tree.py
--- a/python/isaac/external/tree.pyc
+++ b/python/isaac/external/tree.pyc