# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Brian Holt <bdholt1@gmail.com>
#          Noel Dawe <noel@dawe.me>
#          Satrajit Gosh <satrajit.ghosh@gmail.com>
#          Lars Buitinck <L.J.Buitinck@uva.nl>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#          Joel Nothman <joel.nothman@gmail.com>
#          Fares Hedayati <fares.hedayati@gmail.com>
#
# Licence: BSD 3 clause

from libc.stdlib cimport malloc, calloc, free, realloc, qsort

from libc.string cimport memcpy, memset
from libc.math cimport log as ln
from cpython cimport Py_INCREF, PyObject

import numpy as np
cimport numpy as np
np.import_array()
from _tree cimport StackRecord

cdef extern from "numpy/arrayobject.h":
    object PyArray_NewFromDescr(object subtype, np.dtype descr,
                                int nd, np.npy_intp* dims,
                                np.npy_intp* strides,
                                void* data, int flags, object obj)

# =============================================================================
# Types and constants
# =============================================================================

from numpy import float32 as DTYPE
from numpy import float64 as DOUBLE

cdef double INFINITY = np.inf
TREE_LEAF = -1
TREE_UNDEFINED = -2
cdef SIZE_t _TREE_LEAF = TREE_LEAF
cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED
cdef SIZE_t INITIAL_STACK_SIZE = 10

cdef DTYPE_t MIN_IMPURITY_SPLIT = 1e-7

# Mitigate precision differences between 32 bit and 64 bit
cdef DTYPE_t FEATURE_THRESHOLD = 1e-7

# Constant to switch between the algorithms used to extract non-zero
# values in SparseSplitter
cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1

# Some handy constants (BestFirstTreeBuilder)
cdef int IS_FIRST = 1
cdef int IS_NOT_FIRST = 0
cdef int IS_LEFT = 1
cdef int IS_NOT_LEFT = 0

cdef enum:
    # Max value for our rand_r replacement (near the bottom).
    # We don't use RAND_MAX because it's different across platforms and
    # particularly tiny on Windows/MSVC.
    RAND_R_MAX = 0x7FFFFFFF

# Repeat struct definition for numpy
NODE_DTYPE = np.dtype({
    'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity',
              'n_node_samples', 'weighted_n_node_samples'],
    'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp,
                np.float64],
    'offsets': [
        <Py_ssize_t> &(<Node*> NULL).left_child,
        <Py_ssize_t> &(<Node*> NULL).right_child,
        <Py_ssize_t> &(<Node*> NULL).feature,
        <Py_ssize_t> &(<Node*> NULL).threshold,
        <Py_ssize_t> &(<Node*> NULL).impurity,
        <Py_ssize_t> &(<Node*> NULL).n_node_samples,
        <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples
    ]
})

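# Note on the 'offsets' entries above: taking the address of a field on a
# <Node*> NULL pointer and casting it to Py_ssize_t is the classic C idiom
# for offsetof(Node, field). It guarantees that NODE_DTYPE mirrors the
# in-memory layout of the Node struct exactly, so the tree's node array can
# later be exposed to numpy without copying.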
# =============================================================================
# Stack data structure
# =============================================================================

cdef class Stack:
    """A LIFO data structure.

    Attributes
    ----------
    capacity : SIZE_t
        The number of elements the stack can hold; if more are added then
        ``self.stack_`` needs to be resized.

    top : SIZE_t
        The number of elements currently on the stack.

    stack : StackRecord pointer
        The stack of records (upward in the stack corresponds to the right).
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.top = 0
        self.stack_ = <StackRecord*> malloc(capacity * sizeof(StackRecord))
        if self.stack_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.stack_)

    cdef bint is_empty(self) nogil:
        return self.top <= 0

    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil:
        """Push a new element onto the stack.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = NULL

        # Resize if capacity not sufficient
        if top >= self.capacity:
            self.capacity *= 2
            stack = <StackRecord*> realloc(self.stack_,
                                           self.capacity * sizeof(StackRecord))
            if stack == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.stack_ = stack

        stack = self.stack_
        stack[top].start = start
        stack[top].end = end
        stack[top].depth = depth
        stack[top].parent = parent
        stack[top].is_left = is_left
        stack[top].impurity = impurity
        stack[top].n_constant_features = n_constant_features

        # Increment stack pointer
        self.top = top + 1
        return 0

    cdef int pop(self, StackRecord* res) nogil:
        """Remove the top element from the stack and copy it to ``res``.

        Returns 0 if the pop was successful (and ``res`` is set); -1
        otherwise.
        """
        cdef SIZE_t top = self.top
        cdef StackRecord* stack = self.stack_

        if top <= 0:
            return -1

        res[0] = stack[top - 1]
        self.top = top - 1

        return 0

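# Illustration (a sketch of the usage pattern, not part of this class): the
# depth-first tree builder replaces recursion with this stack, roughly:
#
#     stack = Stack(INITIAL_STACK_SIZE)
#     stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0)
#     while not stack.is_empty():
#         stack.pop(&stack_record)
#         # ... split samples[start:end], then push both children ...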
# =============================================================================
# PriorityHeap data structure
# =============================================================================

cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil:
    """Restore heap invariant parent.improvement > child.improvement from
    ``pos`` upwards. """
    if pos == 0:
        return

    cdef SIZE_t parent_pos = (pos - 1) / 2

    if heap[parent_pos].improvement < heap[pos].improvement:
        heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos]
        heapify_up(heap, parent_pos)


cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos,
                       SIZE_t heap_length) nogil:
    """Restore heap invariant parent.improvement > children.improvement from
    ``pos`` downwards. """
    cdef SIZE_t left_pos = 2 * (pos + 1) - 1
    cdef SIZE_t right_pos = 2 * (pos + 1)
    cdef SIZE_t largest = pos

    if (left_pos < heap_length and
            heap[left_pos].improvement > heap[largest].improvement):
        largest = left_pos

    if (right_pos < heap_length and
            heap[right_pos].improvement > heap[largest].improvement):
        largest = right_pos

    if largest != pos:
        heap[pos], heap[largest] = heap[largest], heap[pos]
        heapify_down(heap, largest, heap_length)

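# Illustration: the records form an implicit binary tree in a flat array,
# with the children of index i at 2*i + 1 and 2*i + 2 (as computed above).
# For example, improvements [9, 7, 8, 3] satisfy the max-heap invariant:
# 9 at index 0 is the parent of 7 and 8, and 7 at index 1 is the parent
# of 3.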
cdef class PriorityHeap:
    """A priority queue implemented as a binary heap.

    The heap invariant is that the impurity improvement of the parent record
    is larger than the impurity improvement of the children.

    Attributes
    ----------
    capacity : SIZE_t
        The capacity of the heap

    heap_ptr : SIZE_t
        The water mark of the heap; the heap grows from left to right in the
        array ``heap_``. The following invariant holds ``heap_ptr <= capacity``.

    heap_ : PriorityHeapRecord*
        The array of heap records. The maximum element is on the left;
        the heap grows from left to right
    """

    def __cinit__(self, SIZE_t capacity):
        self.capacity = capacity
        self.heap_ptr = 0
        self.heap_ = <PriorityHeapRecord*> malloc(capacity * sizeof(PriorityHeapRecord))
        if self.heap_ == NULL:
            raise MemoryError()

    def __dealloc__(self):
        free(self.heap_)

    cdef bint is_empty(self) nogil:
        return self.heap_ptr <= 0

    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil:
        """Push record on the priority heap.

        Returns 0 if successful; -1 on out of memory error.
        """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = NULL

        # Resize if capacity not sufficient
        if heap_ptr >= self.capacity:
            self.capacity *= 2
            heap = <PriorityHeapRecord*> realloc(self.heap_,
                                                 self.capacity *
                                                 sizeof(PriorityHeapRecord))
            if heap == NULL:
                # no free; __dealloc__ handles that
                return -1
            self.heap_ = heap

        # Put element as last element of heap
        heap = self.heap_
        heap[heap_ptr].node_id = node_id
        heap[heap_ptr].start = start
        heap[heap_ptr].end = end
        heap[heap_ptr].pos = pos
        heap[heap_ptr].depth = depth
        heap[heap_ptr].is_leaf = is_leaf
        heap[heap_ptr].impurity = impurity
        heap[heap_ptr].impurity_left = impurity_left
        heap[heap_ptr].impurity_right = impurity_right
        heap[heap_ptr].improvement = improvement

        # Heapify up
        heapify_up(heap, heap_ptr)

        # Increase element count
        self.heap_ptr = heap_ptr + 1
        return 0

    cdef int pop(self, PriorityHeapRecord* res) nogil:
        """Remove max element from the heap. """
        cdef SIZE_t heap_ptr = self.heap_ptr
        cdef PriorityHeapRecord* heap = self.heap_

        if heap_ptr <= 0:
            return -1

        # Take first element
        res[0] = heap[0]

        # Put last element to the front
        heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0]

        # Restore heap invariant
        if heap_ptr > 1:
            heapify_down(heap, 0, heap_ptr - 1)

        self.heap_ptr = heap_ptr - 1

        return 0

# =============================================================================
# Criterion
# =============================================================================

cdef class Criterion:
    """Interface for impurity criteria."""

    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil:
        """Initialize the criterion at node samples[start:end] and
        children samples[start:start] and samples[start:end]."""
        pass

    cdef void reset(self) nogil:
        """Reset the criterion at pos=start."""
        pass

    cdef void update(self, SIZE_t new_pos) nogil:
        """Update the collected statistics by moving samples[pos:new_pos] from
        the right child to the left child."""
        pass

    cdef double node_impurity(self) nogil:
        """Evaluate the impurity of the current node, i.e. the impurity of
        samples[start:end]."""
        pass

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        """Evaluate the impurity in children nodes, i.e. the impurity of
        samples[start:pos] + the impurity of samples[pos:end]."""
        pass

    cdef void node_value(self, double* dest) nogil:
        """Compute the node value of samples[start:end] into dest."""
        pass

    cdef double impurity_improvement(self, double impurity) nogil:
        """Weighted impurity improvement, i.e.

           N_t / N * (impurity - N_t_L / N_t * left impurity
                               - N_t_R / N_t * right impurity),

        where N is the total number of samples, N_t is the number of samples
        in the current node, N_t_L is the number of samples in the left
        child and N_t_R is the number of samples in the right child."""
        cdef double impurity_left
        cdef double impurity_right

        self.children_impurity(&impurity_left, &impurity_right)

        return ((self.weighted_n_node_samples / self.weighted_n_samples) *
                (impurity - self.weighted_n_right / self.weighted_n_node_samples * impurity_right
                          - self.weighted_n_left / self.weighted_n_node_samples * impurity_left))

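# Worked example of impurity_improvement, assuming unit sample weights:
# N = 100 samples total, N_t = 100 in the node with impurity 0.5, split
# into N_t_L = 60 (impurity 0.3) and N_t_R = 40 (impurity 0.25):
#
#     improvement = (100 / 100) * (0.5 - (60 / 100) * 0.3
#                                      - (40 / 100) * 0.25)
#                 = 0.5 - 0.18 - 0.1 = 0.22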
cdef class ClassificationCriterion(Criterion):
    """Abstract criterion for classification."""
    cdef SIZE_t* n_classes
    cdef SIZE_t label_count_stride
    cdef double* label_count_left
    cdef double* label_count_right
    cdef double* label_count_total

    def __cinit__(self, SIZE_t n_outputs,
                  np.ndarray[SIZE_t, ndim=1] n_classes):
        # Default values
        self.y = NULL
        self.y_stride = 0
        self.sample_weight = NULL

        self.samples = NULL
        self.start = 0
        self.pos = 0
        self.end = 0

        self.n_outputs = n_outputs
        self.n_node_samples = 0
        self.weighted_n_node_samples = 0.0
        self.weighted_n_left = 0.0
        self.weighted_n_right = 0.0

        self.label_count_left = NULL
        self.label_count_right = NULL
        self.label_count_total = NULL

        # Count labels for each output
        self.n_classes = NULL
        safe_realloc(&self.n_classes, n_outputs)

        cdef SIZE_t k = 0
        cdef SIZE_t label_count_stride = 0

        for k in range(n_outputs):
            self.n_classes[k] = n_classes[k]

            if n_classes[k] > label_count_stride:
                label_count_stride = n_classes[k]

        self.label_count_stride = label_count_stride

        # Allocate counters
        cdef SIZE_t n_elements = n_outputs * label_count_stride
        self.label_count_left = <double*> calloc(n_elements, sizeof(double))
        self.label_count_right = <double*> calloc(n_elements, sizeof(double))
        self.label_count_total = <double*> calloc(n_elements, sizeof(double))

        # Check for allocation errors
        if (self.label_count_left == NULL or
                self.label_count_right == NULL or
                self.label_count_total == NULL):
            raise MemoryError()

    def __dealloc__(self):
        """Destructor."""
        free(self.n_classes)
        free(self.label_count_left)
        free(self.label_count_right)
        free(self.label_count_total)

    def __reduce__(self):
        return (ClassificationCriterion,
                (self.n_outputs,
                 sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)),
                self.__getstate__())

    def __getstate__(self):
        return {}

    def __setstate__(self, d):
        pass

    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
                   DOUBLE_t* sample_weight, double weighted_n_samples,
                   SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
        """Initialize the criterion at node samples[start:end] and
        children samples[start:start] and samples[start:end]."""
        # Initialize fields
        self.y = y
        self.y_stride = y_stride
        self.sample_weight = sample_weight
        self.samples = samples
        self.start = start
        self.end = end
        self.n_node_samples = end - start
        self.weighted_n_samples = weighted_n_samples
        cdef double weighted_n_node_samples = 0.0

        # Initialize label_count_total and weighted_n_node_samples
        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total

        cdef SIZE_t i = 0
        cdef SIZE_t p = 0
        cdef SIZE_t k = 0
        cdef SIZE_t c = 0
        cdef DOUBLE_t w = 1.0
        cdef SIZE_t offset = 0

        for k in range(n_outputs):
            memset(label_count_total + offset, 0,
                   n_classes[k] * sizeof(double))
            offset += label_count_stride

        for p in range(start, end):
            i = samples[p]

            if sample_weight != NULL:
                w = sample_weight[i]

            for k in range(n_outputs):
                c = <SIZE_t> y[i * y_stride + k]
                label_count_total[k * label_count_stride + c] += w

            weighted_n_node_samples += w

        self.weighted_n_node_samples = weighted_n_node_samples

        # Reset to pos=start
        self.reset()

    cdef void reset(self) nogil:
        """Reset the criterion at pos=start."""
        self.pos = self.start

        self.weighted_n_left = 0.0
        self.weighted_n_right = self.weighted_n_node_samples

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total
        cdef double* label_count_left = self.label_count_left
        cdef double* label_count_right = self.label_count_right

        cdef SIZE_t k = 0

        for k in range(n_outputs):
            memset(label_count_left, 0, n_classes[k] * sizeof(double))
            memcpy(label_count_right, label_count_total,
                   n_classes[k] * sizeof(double))

            label_count_total += label_count_stride
            label_count_left += label_count_stride
            label_count_right += label_count_stride

    cdef void update(self, SIZE_t new_pos) nogil:
        """Update the collected statistics by moving samples[pos:new_pos] from
        the right child to the left child."""
        cdef DOUBLE_t* y = self.y
        cdef SIZE_t y_stride = self.y_stride
        cdef DOUBLE_t* sample_weight = self.sample_weight

        cdef SIZE_t* samples = self.samples
        cdef SIZE_t pos = self.pos

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total
        cdef double* label_count_left = self.label_count_left
        cdef double* label_count_right = self.label_count_right

        cdef SIZE_t i
        cdef SIZE_t p
        cdef SIZE_t k
        cdef SIZE_t label_index
        cdef DOUBLE_t w = 1.0
        cdef DOUBLE_t diff_w = 0.0

        # Note: We assume start <= pos < new_pos <= end

        for p in range(pos, new_pos):
            i = samples[p]

            if sample_weight != NULL:
                w = sample_weight[i]

            for k in range(n_outputs):
                label_index = (k * label_count_stride +
                               <SIZE_t> y[i * y_stride + k])
                label_count_left[label_index] += w
                label_count_right[label_index] -= w

            diff_w += w

        self.weighted_n_left += diff_w
        self.weighted_n_right -= diff_w

        self.pos = new_pos

    cdef double node_impurity(self) nogil:
        pass

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        pass

    cdef void node_value(self, double* dest) nogil:
        """Compute the node value of samples[start:end] into dest."""
        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total
        cdef SIZE_t k

        for k in range(n_outputs):
            memcpy(dest, label_count_total, n_classes[k] * sizeof(double))
            dest += label_count_stride
            label_count_total += label_count_stride

cdef class Entropy(ClassificationCriterion):
    """Cross Entropy impurity criteria.

    Let the target be a classification outcome taking values in 0, 1, ..., K-1.
    If node m represents a region Rm with Nm observations, then let

        pmk = 1 / Nm \sum_{x_i in Rm} I(yi = k)

    be the proportion of class k observations in node m.

    The cross-entropy is then defined as

        cross-entropy = - \sum_{k=0}^{K-1} pmk log(pmk)
    """
    cdef double node_impurity(self) nogil:
        """Evaluate the impurity of the current node, i.e. the impurity of
        samples[start:end]."""
        cdef double weighted_n_node_samples = self.weighted_n_node_samples

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total

        cdef double entropy = 0.0
        cdef double total = 0.0
        cdef double tmp
        cdef SIZE_t k
        cdef SIZE_t c

        for k in range(n_outputs):
            entropy = 0.0

            for c in range(n_classes[k]):
                tmp = label_count_total[c]
                if tmp > 0.0:
                    tmp /= weighted_n_node_samples
                    entropy -= tmp * log(tmp)

            total += entropy
            label_count_total += label_count_stride

        return total / n_outputs

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        """Evaluate the impurity in children nodes, i.e. the impurity of the
        left child (samples[start:pos]) and the impurity of the right child
        (samples[pos:end])."""
        cdef double weighted_n_node_samples = self.weighted_n_node_samples
        cdef double weighted_n_left = self.weighted_n_left
        cdef double weighted_n_right = self.weighted_n_right

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_left = self.label_count_left
        cdef double* label_count_right = self.label_count_right

        cdef double entropy_left = 0.0
        cdef double entropy_right = 0.0
        cdef double total_left = 0.0
        cdef double total_right = 0.0
        cdef double tmp
        cdef SIZE_t k
        cdef SIZE_t c

        for k in range(n_outputs):
            entropy_left = 0.0
            entropy_right = 0.0

            for c in range(n_classes[k]):
                tmp = label_count_left[c]
                if tmp > 0.0:
                    tmp /= weighted_n_left
                    entropy_left -= tmp * log(tmp)

                tmp = label_count_right[c]
                if tmp > 0.0:
                    tmp /= weighted_n_right
                    entropy_right -= tmp * log(tmp)

            total_left += entropy_left
            total_right += entropy_right
            label_count_left += label_count_stride
            label_count_right += label_count_stride

        impurity_left[0] = total_left / n_outputs
        impurity_right[0] = total_right / n_outputs

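# Worked example of the entropy criterion: a node holding 8 samples of
# class 0 and 8 of class 1 has pm0 = pm1 = 0.5, so
#
#     cross-entropy = -(0.5 * log(0.5) + 0.5 * log(0.5)) = log(2)
#
# (1.0 if log is taken base 2, ~0.693 if natural), while a pure node with
# all 16 samples in one class scores 0, the minimum.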
cdef class Gini(ClassificationCriterion):
    """Gini Index impurity criteria.

    Let the target be a classification outcome taking values in 0, 1, ..., K-1.
    If node m represents a region Rm with Nm observations, then let

        pmk = 1 / Nm \sum_{x_i in Rm} I(yi = k)

    be the proportion of class k observations in node m.

    The Gini Index is then defined as:

        index = \sum_{k=0}^{K-1} pmk (1 - pmk)
              = 1 - \sum_{k=0}^{K-1} pmk ** 2
    """
    cdef double node_impurity(self) nogil:
        """Evaluate the impurity of the current node, i.e. the impurity of
        samples[start:end]."""
        cdef double weighted_n_node_samples = self.weighted_n_node_samples

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_total = self.label_count_total

        cdef double gini = 0.0
        cdef double total = 0.0
        cdef double tmp
        cdef SIZE_t k
        cdef SIZE_t c

        for k in range(n_outputs):
            gini = 0.0

            for c in range(n_classes[k]):
                tmp = label_count_total[c]
                gini += tmp * tmp

            gini = 1.0 - gini / (weighted_n_node_samples *
                                 weighted_n_node_samples)

            total += gini
            label_count_total += label_count_stride

        return total / n_outputs

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        """Evaluate the impurity in children nodes, i.e. the impurity of the
        left child (samples[start:pos]) and the impurity of the right child
        (samples[pos:end])."""
        cdef double weighted_n_node_samples = self.weighted_n_node_samples
        cdef double weighted_n_left = self.weighted_n_left
        cdef double weighted_n_right = self.weighted_n_right

        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t* n_classes = self.n_classes
        cdef SIZE_t label_count_stride = self.label_count_stride
        cdef double* label_count_left = self.label_count_left
        cdef double* label_count_right = self.label_count_right

        cdef double gini_left = 0.0
        cdef double gini_right = 0.0
        cdef double total = 0.0
        cdef double total_left = 0.0
        cdef double total_right = 0.0
        cdef double tmp
        cdef SIZE_t k
        cdef SIZE_t c

        for k in range(n_outputs):
            gini_left = 0.0
            gini_right = 0.0

            for c in range(n_classes[k]):
                tmp = label_count_left[c]
                gini_left += tmp * tmp
                tmp = label_count_right[c]
                gini_right += tmp * tmp

            gini_left = 1.0 - gini_left / (weighted_n_left *
                                           weighted_n_left)
            gini_right = 1.0 - gini_right / (weighted_n_right *
                                             weighted_n_right)

            total_left += gini_left
            total_right += gini_right
            label_count_left += label_count_stride
            label_count_right += label_count_stride

        impurity_left[0] = total_left / n_outputs
        impurity_right[0] = total_right / n_outputs

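# Worked example of the Gini criterion: with class counts (12, 4) in a node
# of 16 samples, pm0 = 0.75 and pm1 = 0.25, so
#
#     index = 1 - (0.75 ** 2 + 0.25 ** 2) = 1 - 0.625 = 0.375
#
# node_impurity computes the same quantity from the weighted counts
# directly: 1 - (12**2 + 4**2) / 16**2 = 1 - 160 / 256 = 0.375.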
cdef class RegressionCriterion(Criterion):
    """Abstract criterion for regression.

    Computes variance of the target values left and right of the split point.
    Computation is linear in `n_samples` by using ::

        var = \sum_i^n (y_i - y_bar) ** 2
            = (\sum_i^n y_i ** 2) - n_samples y_bar ** 2
    """
    cdef double* mean_left
    cdef double* mean_right
    cdef double* mean_total
    cdef double* sq_sum_left
    cdef double* sq_sum_right
    cdef double* sq_sum_total
    cdef double* var_left
    cdef double* var_right
    cdef double* sum_left
    cdef double* sum_right
    cdef double* sum_total

    def __cinit__(self, SIZE_t n_outputs):
        # Default values
        self.y = NULL
        self.y_stride = 0
        self.sample_weight = NULL

        self.samples = NULL
        self.start = 0
        self.pos = 0
        self.end = 0

        self.n_outputs = n_outputs
        self.n_node_samples = 0
        self.weighted_n_node_samples = 0.0
        self.weighted_n_left = 0.0
        self.weighted_n_right = 0.0

        # Allocate accumulators. Make sure they are NULL, not uninitialized,
        # before an exception can be raised (which triggers __dealloc__).
        self.mean_left = NULL
        self.mean_right = NULL
        self.mean_total = NULL
        self.sq_sum_left = NULL
        self.sq_sum_right = NULL
        self.sq_sum_total = NULL
        self.var_left = NULL
        self.var_right = NULL
        self.sum_left = NULL
        self.sum_right = NULL
        self.sum_total = NULL

        self.mean_left = <double*> calloc(n_outputs, sizeof(double))
        self.mean_right = <double*> calloc(n_outputs, sizeof(double))
        self.mean_total = <double*> calloc(n_outputs, sizeof(double))
        self.sq_sum_left = <double*> calloc(n_outputs, sizeof(double))
        self.sq_sum_right = <double*> calloc(n_outputs, sizeof(double))
        self.sq_sum_total = <double*> calloc(n_outputs, sizeof(double))
        self.var_left = <double*> calloc(n_outputs, sizeof(double))
        self.var_right = <double*> calloc(n_outputs, sizeof(double))
        self.sum_left = <double*> calloc(n_outputs, sizeof(double))
        self.sum_right = <double*> calloc(n_outputs, sizeof(double))
        self.sum_total = <double*> calloc(n_outputs, sizeof(double))

        if (self.mean_left == NULL or
                self.mean_right == NULL or
                self.mean_total == NULL or
                self.sq_sum_left == NULL or
                self.sq_sum_right == NULL or
                self.sq_sum_total == NULL or
                self.var_left == NULL or
                self.var_right == NULL or
                self.sum_left == NULL or
                self.sum_right == NULL or
                self.sum_total == NULL):
            raise MemoryError()

    def __dealloc__(self):
        """Destructor."""
        free(self.mean_left)
        free(self.mean_right)
        free(self.mean_total)
        free(self.sq_sum_left)
        free(self.sq_sum_right)
        free(self.sq_sum_total)
        free(self.var_left)
        free(self.var_right)
        free(self.sum_left)
        free(self.sum_right)
        free(self.sum_total)

    def __reduce__(self):
        return (RegressionCriterion, (self.n_outputs,), self.__getstate__())

    def __getstate__(self):
        return {}

    def __setstate__(self, d):
        pass

    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil:
        """Initialize the criterion at node samples[start:end] and
        children samples[start:start] and samples[start:end]."""
        # Initialize fields
        self.y = y
        self.y_stride = y_stride
        self.sample_weight = sample_weight
        self.samples = samples
        self.start = start
        self.end = end
        self.n_node_samples = end - start
        self.weighted_n_samples = weighted_n_samples
        cdef double weighted_n_node_samples = 0.

        # Initialize accumulators
        cdef SIZE_t n_outputs = self.n_outputs
        cdef double* mean_left = self.mean_left
        cdef double* mean_right = self.mean_right
        cdef double* mean_total = self.mean_total
        cdef double* sq_sum_left = self.sq_sum_left
        cdef double* sq_sum_right = self.sq_sum_right
        cdef double* sq_sum_total = self.sq_sum_total
        cdef double* var_left = self.var_left
        cdef double* var_right = self.var_right
        cdef double* sum_left = self.sum_left
        cdef double* sum_right = self.sum_right
        cdef double* sum_total = self.sum_total

        cdef SIZE_t i = 0
        cdef SIZE_t p = 0
        cdef SIZE_t k = 0
        cdef DOUBLE_t y_ik = 0.0
        cdef DOUBLE_t w_y_ik = 0.0
        cdef DOUBLE_t w = 1.0

        cdef SIZE_t n_bytes = n_outputs * sizeof(double)
        memset(mean_left, 0, n_bytes)
        memset(mean_right, 0, n_bytes)
        memset(mean_total, 0, n_bytes)
        memset(sq_sum_left, 0, n_bytes)
        memset(sq_sum_right, 0, n_bytes)
        memset(sq_sum_total, 0, n_bytes)
        memset(var_left, 0, n_bytes)
        memset(var_right, 0, n_bytes)
        memset(sum_left, 0, n_bytes)
        memset(sum_right, 0, n_bytes)
        memset(sum_total, 0, n_bytes)

        for p in range(start, end):
            i = samples[p]

            if sample_weight != NULL:
                w = sample_weight[i]

            for k in range(n_outputs):
                y_ik = y[i * y_stride + k]
                w_y_ik = w * y_ik
                sum_total[k] += w_y_ik
                sq_sum_total[k] += w_y_ik * y_ik

            weighted_n_node_samples += w

        self.weighted_n_node_samples = weighted_n_node_samples

        for k in range(n_outputs):
            mean_total[k] = sum_total[k] / weighted_n_node_samples

        # Reset to pos=start
        self.reset()

    cdef void reset(self) nogil:
        """Reset the criterion at pos=start."""
        self.pos = self.start

        self.weighted_n_left = 0.0
        self.weighted_n_right = self.weighted_n_node_samples

        cdef SIZE_t n_outputs = self.n_outputs
        cdef double* mean_left = self.mean_left
        cdef double* mean_right = self.mean_right
        cdef double* mean_total = self.mean_total
        cdef double* sq_sum_left = self.sq_sum_left
        cdef double* sq_sum_right = self.sq_sum_right
        cdef double* sq_sum_total = self.sq_sum_total
        cdef double* var_left = self.var_left
        cdef double* var_right = self.var_right
        cdef double weighted_n_node_samples = self.weighted_n_node_samples
        cdef double* sum_left = self.sum_left
        cdef double* sum_right = self.sum_right
        cdef double* sum_total = self.sum_total

        cdef SIZE_t k = 0

        for k in range(n_outputs):
            mean_right[k] = mean_total[k]
            mean_left[k] = 0.0
            sq_sum_right[k] = sq_sum_total[k]
            sq_sum_left[k] = 0.0
            var_right[k] = (sq_sum_right[k] / weighted_n_node_samples -
                            mean_right[k] * mean_right[k])
            var_left[k] = 0.0
            sum_right[k] = sum_total[k]
            sum_left[k] = 0.0

    cdef void update(self, SIZE_t new_pos) nogil:
        """Update the collected statistics by moving samples[pos:new_pos] from
        the right child to the left child."""
        cdef DOUBLE_t* y = self.y
        cdef SIZE_t y_stride = self.y_stride
        cdef DOUBLE_t* sample_weight = self.sample_weight

        cdef SIZE_t* samples = self.samples
        cdef SIZE_t pos = self.pos

        cdef SIZE_t n_outputs = self.n_outputs
        cdef double* mean_left = self.mean_left
        cdef double* mean_right = self.mean_right
        cdef double* sq_sum_left = self.sq_sum_left
        cdef double* sq_sum_right = self.sq_sum_right
        cdef double* var_left = self.var_left
        cdef double* var_right = self.var_right
        cdef double* sum_left = self.sum_left
        cdef double* sum_right = self.sum_right

        cdef double weighted_n_left = self.weighted_n_left
        cdef double weighted_n_right = self.weighted_n_right

        cdef SIZE_t i
        cdef SIZE_t p
        cdef SIZE_t k
        cdef DOUBLE_t w = 1.0
        cdef DOUBLE_t diff_w = 0.0
        cdef DOUBLE_t y_ik, w_y_ik

        # Note: We assume start <= pos < new_pos <= end
        for p in range(pos, new_pos):
            i = samples[p]

            if sample_weight != NULL:
                w = sample_weight[i]

            for k in range(n_outputs):
                y_ik = y[i * y_stride + k]
                w_y_ik = w * y_ik

                sum_left[k] += w_y_ik
                sum_right[k] -= w_y_ik

                sq_sum_left[k] += w_y_ik * y_ik
                sq_sum_right[k] -= w_y_ik * y_ik

            diff_w += w

        weighted_n_left += diff_w
        weighted_n_right -= diff_w

        for k in range(n_outputs):
            mean_left[k] = sum_left[k] / weighted_n_left
            mean_right[k] = sum_right[k] / weighted_n_right
            var_left[k] = (sq_sum_left[k] / weighted_n_left -
                           mean_left[k] * mean_left[k])
            var_right[k] = (sq_sum_right[k] / weighted_n_right -
                            mean_right[k] * mean_right[k])

        self.weighted_n_left = weighted_n_left
        self.weighted_n_right = weighted_n_right

        self.pos = new_pos

    cdef double node_impurity(self) nogil:
        pass

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        pass

    cdef void node_value(self, double* dest) nogil:
        """Compute the node value of samples[start:end] into dest."""
        memcpy(dest, self.mean_total, self.n_outputs * sizeof(double))

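# The accumulators above exploit the shortcut from the class docstring, in
# its sample-weighted form: with W the sum of weights in a child,
#
#     var = (\sum_i w_i * y_i ** 2) / W - ((\sum_i w_i * y_i) / W) ** 2
#
# so moving a sample between children only touches sum_*, sq_sum_* and the
# weight totals, and each variance is recomputed from them in O(n_outputs).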
cdef class MSE(RegressionCriterion):
    """Mean squared error impurity criterion.

        MSE = var_left + var_right
    """
    cdef double node_impurity(self) nogil:
        """Evaluate the impurity of the current node, i.e. the impurity of
        samples[start:end]."""
        cdef SIZE_t n_outputs = self.n_outputs
        cdef double* sq_sum_total = self.sq_sum_total
        cdef double* mean_total = self.mean_total
        cdef double weighted_n_node_samples = self.weighted_n_node_samples
        cdef double total = 0.0
        cdef SIZE_t k

        for k in range(n_outputs):
            total += (sq_sum_total[k] / weighted_n_node_samples -
                      mean_total[k] * mean_total[k])

        return total / n_outputs

    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil:
        """Evaluate the impurity in children nodes, i.e. the impurity of the
        left child (samples[start:pos]) and the impurity of the right child
        (samples[pos:end])."""
        cdef SIZE_t n_outputs = self.n_outputs
        cdef double* var_left = self.var_left
        cdef double* var_right = self.var_right
        cdef double total_left = 0.0
        cdef double total_right = 0.0
        cdef SIZE_t k

        for k in range(n_outputs):
            total_left += var_left[k]
            total_right += var_right[k]

        impurity_left[0] = total_left / n_outputs
        impurity_right[0] = total_right / n_outputs

cdef class FriedmanMSE(MSE):
    """Mean squared error impurity criterion with improvement score by Friedman

    Uses the formula (35) in Friedman's original Gradient Boosting paper:

        diff = mean_left - mean_right
        improvement = n_left * n_right * diff^2 / (n_left + n_right)
    """

    cdef double impurity_improvement(self, double impurity) nogil:
        cdef SIZE_t n_outputs = self.n_outputs
        cdef SIZE_t k
        cdef double* sum_left = self.sum_left
        cdef double* sum_right = self.sum_right
        cdef double total_sum_left = 0.0
        cdef double total_sum_right = 0.0
        cdef double weighted_n_left = self.weighted_n_left
        cdef double weighted_n_right = self.weighted_n_right
        cdef double diff = 0.0

        for k in range(n_outputs):
            total_sum_left += sum_left[k]
            total_sum_right += sum_right[k]

        total_sum_left = total_sum_left / n_outputs
        total_sum_right = total_sum_right / n_outputs
        diff = ((total_sum_left / weighted_n_left) -
                (total_sum_right / weighted_n_right))

        return (weighted_n_left * weighted_n_right * diff * diff /
                (weighted_n_left + weighted_n_right))

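# Note that total_sum_left / weighted_n_left is exactly the left child mean
# averaged over outputs, so no cached means are needed. For example, with
# weighted_n_left = 30, weighted_n_right = 10 and child means 2.0 and 4.0,
# diff = -2.0 and the improvement is 30 * 10 * 4.0 / 40 = 30.0.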
# =============================================================================
# Splitter
# =============================================================================

cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
    self.impurity_left = INFINITY
    self.impurity_right = INFINITY
    self.pos = start_pos
    self.feature = 0
    self.threshold = 0.
    self.improvement = -INFINITY

cdef class Splitter:
    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  object random_state):
        self.criterion = criterion

        self.samples = NULL
        self.n_samples = 0
        self.features = NULL
        self.n_features = 0
        self.feature_values = NULL

        self.y = NULL
        self.y_stride = 0
        self.sample_weight = NULL

        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.random_state = random_state

    def __dealloc__(self):
        """Destructor."""
        free(self.samples)
        free(self.features)
        free(self.constant_features)
        free(self.feature_values)

    def __getstate__(self):
        return {}

    def __setstate__(self, d):
        pass

    cdef void init(self,
                   object X,
                   np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                   DOUBLE_t* sample_weight) except *:
        """Initialize the splitter."""
        # Reset random state
        self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
        # Initialize samples and features structures
        cdef SIZE_t n_samples = X.shape[0]
        cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)

        cdef SIZE_t i, j
        cdef double weighted_n_samples = 0.0
        j = 0

        for i in range(n_samples):
            # Only work with samples whose weight is non-zero
            if sample_weight == NULL or sample_weight[i] != 0.0:
                samples[j] = i
                j += 1

            if sample_weight != NULL:
                weighted_n_samples += sample_weight[i]
            else:
                weighted_n_samples += 1.0

        self.n_samples = j
        self.weighted_n_samples = weighted_n_samples

        cdef SIZE_t n_features = X.shape[1]
        cdef SIZE_t* features = safe_realloc(&self.features, n_features)

        for i in range(n_features):
            features[i] = i

        self.n_features = n_features

        safe_realloc(&self.feature_values, n_samples)
        safe_realloc(&self.constant_features, n_features)

        # Initialize y, sample_weight
        self.y = <DOUBLE_t*> y.data
        self.y_stride = <SIZE_t> y.strides[0] / <SIZE_t> y.itemsize
        self.sample_weight = sample_weight

    cdef void node_reset(self, SIZE_t start, SIZE_t end,
                         double* weighted_n_node_samples) nogil:
        """Reset splitter on node samples[start:end]."""
        self.start = start
        self.end = end

        self.criterion.init(self.y,
                            self.y_stride,
                            self.sample_weight,
                            self.weighted_n_samples,
                            self.samples,
                            start,
                            end)

        weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find a split on node samples[start:end]."""
        pass

    cdef void node_value(self, double* dest) nogil:
        """Copy the value of node samples[start:end] into dest."""
        self.criterion.node_value(dest)

    cdef double node_impurity(self) nogil:
        """Return the impurity of node samples[start:end]."""
        return self.criterion.node_impurity()

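# Typical call sequence for a Splitter, as driven by the tree builders
# (a sketch, not an API contract): init() is called once per fit with the
# full X and y, node_reset(start, end, &weighted) once per node to point
# the criterion at samples[start:end], then node_split(impurity, &split,
# &n_const) searches that range, and node_value(dest) stores the node's
# prediction.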
cdef class BaseDenseSplitter(Splitter):
    cdef DTYPE_t* X
    cdef SIZE_t X_sample_stride
    cdef SIZE_t X_fx_stride

    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  object random_state):
        # Parent __cinit__ is automatically called

        self.X = NULL
        self.X_sample_stride = 0
        self.X_fx_stride = 0

    cdef void init(self,
                   object X,
                   np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                   DOUBLE_t* sample_weight) except *:
        """Initialize the splitter."""

        # Call parent init
        Splitter.init(self, X, y, sample_weight)

        # Initialize X
        cdef np.ndarray X_ndarray = X

        self.X = <DTYPE_t*> X_ndarray.data
        self.X_sample_stride = <SIZE_t> X.strides[0] / <SIZE_t> X.itemsize
        self.X_fx_stride = <SIZE_t> X.strides[1] / <SIZE_t> X.itemsize

cdef class BestSplitter(BaseDenseSplitter):
    """Splitter for finding the best split."""
    def __reduce__(self):
        return (BestSplitter, (self.criterion,
                               self.max_features,
                               self.min_samples_leaf,
                               self.min_weight_leaf,
                               self.random_state), self.__getstate__())

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find the best split on node samples[start:end]."""
        # Find the best split
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t start = self.start
        cdef SIZE_t end = self.end

        cdef SIZE_t* features = self.features
        cdef SIZE_t* constant_features = self.constant_features
        cdef SIZE_t n_features = self.n_features

        cdef DTYPE_t* X = self.X
        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t X_sample_stride = self.X_sample_stride
        cdef SIZE_t X_fx_stride = self.X_fx_stride
        cdef SIZE_t max_features = self.max_features
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef UINT32_t* random_state = &self.rand_r_state

        cdef SplitRecord best, current

        cdef SIZE_t f_i = n_features
        cdef SIZE_t f_j, p, tmp
        cdef SIZE_t n_visited_features = 0
        # Number of features discovered to be constant during the split search
        cdef SIZE_t n_found_constants = 0
        # Number of features known to be constant and drawn without replacement
        cdef SIZE_t n_drawn_constants = 0
        cdef SIZE_t n_known_constants = n_constant_features[0]
        # n_total_constants = n_known_constants + n_found_constants
        cdef SIZE_t n_total_constants = n_known_constants
        cdef DTYPE_t current_feature_value
        cdef SIZE_t partition_end

        _init_split(&best, end)

        # Sample up to max_features without replacement using a
        # Fisher-Yates-based algorithm (using the local variables `f_i` and
        # `f_j` to compute a permutation of the `features` array).
        #
        # Skip the CPU intensive evaluation of the impurity criterion for
        # features that were already detected as constant (hence not suitable
        # for good splitting) by ancestor nodes and save the information on
        # newly discovered constant features to spare computation on descendant
        # nodes.
        while (f_i > n_total_constants and  # Stop early if remaining features
                                            # are constant
                (n_visited_features < max_features or
                 # At least one drawn feature must be non-constant
                 n_visited_features <= n_found_constants + n_drawn_constants)):

            n_visited_features += 1

            # Loop invariant: elements of features in
            # - [:n_drawn_constant[ holds drawn and known constant features;
            # - [n_drawn_constant:n_known_constant[ holds known constant
            #   features that haven't been drawn yet;
            # - [n_known_constant:n_total_constant[ holds newly found constant
            #   features;
            # - [n_total_constant:f_i[ holds features that haven't been drawn
            #   yet and aren't constant a priori.
            # - [f_i:n_features[ holds features that have been drawn
            #   and aren't constant.

            # Draw a feature at random
            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                           random_state)

            if f_j < n_known_constants:
                # f_j in the interval [n_drawn_constants, n_known_constants[
                tmp = features[f_j]
                features[f_j] = features[n_drawn_constants]
                features[n_drawn_constants] = tmp

                n_drawn_constants += 1

            else:
                # f_j in the interval [n_known_constants, f_i - n_found_constants[
                f_j += n_found_constants
                # f_j in the interval [n_total_constants, f_i[

                current.feature = features[f_j]

                # Sort samples along that feature; first copy the feature
                # values for the active samples into Xf, s.t.
                # Xf[i] == X[samples[i], j], so the sort uses the cache more
                # effectively.
                for p in range(start, end):
                    Xf[p] = X[X_sample_stride * samples[p] +
                              X_fx_stride * current.feature]

                sort(Xf + start, samples + start, end - start)

                if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
                    features[f_j] = features[n_total_constants]
                    features[n_total_constants] = current.feature

                    n_found_constants += 1
                    n_total_constants += 1

                else:
                    f_i -= 1
                    features[f_i], features[f_j] = features[f_j], features[f_i]

                    # Evaluate all splits
                    self.criterion.reset()
                    p = start

                    while p < end:
                        while (p + 1 < end and
                               Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD):
                            p += 1

                        # (p + 1 >= end) or (X[samples[p + 1], current.feature] >
                        #                    X[samples[p], current.feature])
                        p += 1
                        # (p >= end) or (X[samples[p], current.feature] >
                        #                X[samples[p - 1], current.feature])

                        if p < end:
                            current.pos = p

                            # Reject if min_samples_leaf is not guaranteed
                            if (((current.pos - start) < min_samples_leaf) or
                                    ((end - current.pos) < min_samples_leaf)):
                                continue

                            self.criterion.update(current.pos)

                            # Reject if min_weight_leaf is not satisfied
                            if ((self.criterion.weighted_n_left < min_weight_leaf) or
                                    (self.criterion.weighted_n_right < min_weight_leaf)):
                                continue

                            current.improvement = self.criterion.impurity_improvement(impurity)

                            if current.improvement > best.improvement:
                                self.criterion.children_impurity(&current.impurity_left,
                                                                 &current.impurity_right)
                                current.threshold = (Xf[p - 1] + Xf[p]) / 2.0

                                if current.threshold == Xf[p]:
                                    current.threshold = Xf[p - 1]

                                best = current  # copy

        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
        if best.pos < end:
            partition_end = end
            p = start

            while p < partition_end:
                if X[X_sample_stride * samples[p] +
                     X_fx_stride * best.feature] <= best.threshold:
                    p += 1

                else:
                    partition_end -= 1

                    tmp = samples[partition_end]
                    samples[partition_end] = samples[p]
                    samples[p] = tmp

        # Respect invariant for constant features: the original order of the
        # elements in features[:n_known_constants] must be preserved for
        # sibling and child nodes
        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)

        # Copy newly found constant features
        memcpy(constant_features + n_known_constants,
               features + n_known_constants,
               sizeof(SIZE_t) * n_found_constants)

        # Return values
        split[0] = best
        n_constant_features[0] = n_total_constants

# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    cdef int maxd = 2 * <int>log(n)
    introsort(Xf, samples, n, maxd)

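# The depth budget maxd = 2 * log(n) is the standard introsort bound (with
# log taken base 2, which is how this file's log helper is usually
# defined): a quicksort recursion that exceeds it has hit pathological
# pivots, so introsort() below falls back to heapsort and the worst case
# stays O(n log n). For n = 1024 that gives maxd = 20.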
cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples, SIZE_t i, SIZE_t j) nogil:
    # Helper for sort
    Xf[i], Xf[j] = Xf[j], Xf[i]
    samples[i], samples[j] = samples[j], samples[i]

cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        if b < c:
            return b
        elif a < c:
            return c
        else:
            return a
    elif b < c:
        if a < c:
            return a
        else:
            return c
    else:
        return b

# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, SIZE_t n, int maxd) nogil:
    cdef DTYPE_t pivot
    cdef SIZE_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r

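# The three-way (Dutch national flag) partition above maintains
# Xf[:l] < pivot, Xf[l:i] == pivot and Xf[r:] > pivot while i advances
# towards r. For example, partitioning [3, 1, 3, 5, 3] around pivot 3
# yields [1 | 3, 3, 3 | 5]: only the [:l] prefix and the tail from r on
# still need sorting, which is what makes repeated values cheap.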
cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
                           SIZE_t start, SIZE_t end) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    cdef SIZE_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            break
        else:
            swap(Xf, samples, root, maxind)
            root = maxind

cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    cdef SIZE_t start, end

    # heapify
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1

cdef class RandomSplitter(BaseDenseSplitter):
    """Splitter for finding the best random split."""
    def __reduce__(self):
        return (RandomSplitter, (self.criterion,
                                 self.max_features,
                                 self.min_samples_leaf,
                                 self.min_weight_leaf,
                                 self.random_state), self.__getstate__())

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find the best random split on node samples[start:end]."""
        # Draw random splits and pick the best
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t start = self.start
        cdef SIZE_t end = self.end

        cdef SIZE_t* features = self.features
        cdef SIZE_t* constant_features = self.constant_features
        cdef SIZE_t n_features = self.n_features

        cdef DTYPE_t* X = self.X
        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t X_sample_stride = self.X_sample_stride
        cdef SIZE_t X_fx_stride = self.X_fx_stride
        cdef SIZE_t max_features = self.max_features
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef UINT32_t* random_state = &self.rand_r_state

        cdef SplitRecord best, current

        cdef SIZE_t f_i = n_features
        cdef SIZE_t f_j, p, tmp
        # Number of features discovered to be constant during the split search
        cdef SIZE_t n_found_constants = 0
        # Number of features known to be constant and drawn without replacement
        cdef SIZE_t n_drawn_constants = 0
        cdef SIZE_t n_known_constants = n_constant_features[0]
        # n_total_constants = n_known_constants + n_found_constants
        cdef SIZE_t n_total_constants = n_known_constants
        cdef SIZE_t n_visited_features = 0
        cdef DTYPE_t min_feature_value
        cdef DTYPE_t max_feature_value
        cdef DTYPE_t current_feature_value
        cdef SIZE_t partition_end

        _init_split(&best, end)

        # Sample up to max_features without replacement using a
        # Fisher-Yates-based algorithm (using the local variables `f_i` and
        # `f_j` to compute a permutation of the `features` array).
        #
        # Skip the CPU intensive evaluation of the impurity criterion for
        # features that were already detected as constant (hence not suitable
        # for good splitting) by ancestor nodes and save the information on
        # newly discovered constant features to spare computation on descendant
        # nodes.
        while (f_i > n_total_constants and  # Stop early if remaining features
                                            # are constant
                (n_visited_features < max_features or
                 # At least one drawn feature must be non-constant
                 n_visited_features <= n_found_constants + n_drawn_constants)):
            n_visited_features += 1

            # Loop invariant: elements of features in
            # - [:n_drawn_constant[ holds drawn and known constant features;
            # - [n_drawn_constant:n_known_constant[ holds known constant
            #   features that haven't been drawn yet;
            # - [n_known_constant:n_total_constant[ holds newly found constant
            #   features;
            # - [n_total_constant:f_i[ holds features that haven't been drawn
            #   yet and aren't constant a priori.
            # - [f_i:n_features[ holds features that have been drawn
            #   and aren't constant.

            # Draw a feature at random
            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                           random_state)

            if f_j < n_known_constants:
                # f_j in the interval [n_drawn_constants, n_known_constants[
                tmp = features[f_j]
                features[f_j] = features[n_drawn_constants]
                features[n_drawn_constants] = tmp

                n_drawn_constants += 1

            else:
                # f_j in the interval [n_known_constants, f_i - n_found_constants[
                f_j += n_found_constants
                # f_j in the interval [n_total_constants, f_i[

                current.feature = features[f_j]

                # Find min, max
                min_feature_value = X[X_sample_stride * samples[start] +
                                      X_fx_stride * current.feature]
                max_feature_value = min_feature_value
                Xf[start] = min_feature_value

                for p in range(start + 1, end):
                    current_feature_value = X[X_sample_stride * samples[p] +
                                              X_fx_stride * current.feature]
                    Xf[p] = current_feature_value

                    if current_feature_value < min_feature_value:
                        min_feature_value = current_feature_value
                    elif current_feature_value > max_feature_value:
                        max_feature_value = current_feature_value

                if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
                    features[f_j] = features[n_total_constants]
                    features[n_total_constants] = current.feature

                    n_found_constants += 1
                    n_total_constants += 1

                else:
                    f_i -= 1
                    features[f_i], features[f_j] = features[f_j], features[f_i]

                    # Draw a random threshold
                    current.threshold = rand_uniform(min_feature_value,
                                                     max_feature_value,
                                                     random_state)

                    if current.threshold == max_feature_value:
                        current.threshold = min_feature_value

                    # Partition
                    partition_end = end
                    p = start
                    while p < partition_end:
                        current_feature_value = Xf[p]
                        if current_feature_value <= current.threshold:
                            p += 1
                        else:
                            partition_end -= 1

                            Xf[p] = Xf[partition_end]
                            Xf[partition_end] = current_feature_value

                            tmp = samples[partition_end]
                            samples[partition_end] = samples[p]
                            samples[p] = tmp

                    current.pos = partition_end

                    # Reject if min_samples_leaf is not guaranteed
                    if (((current.pos - start) < min_samples_leaf) or
                            ((end - current.pos) < min_samples_leaf)):
                        continue

                    # Evaluate split
                    self.criterion.reset()
                    self.criterion.update(current.pos)

                    # Reject if min_weight_leaf is not satisfied
                    if ((self.criterion.weighted_n_left < min_weight_leaf) or
                            (self.criterion.weighted_n_right < min_weight_leaf)):
                        continue

                    current.improvement = self.criterion.impurity_improvement(impurity)

                    if current.improvement > best.improvement:
                        self.criterion.children_impurity(&current.impurity_left,
                                                         &current.impurity_right)
                        best = current  # copy

        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
        if best.pos < end and current.feature != best.feature:
            partition_end = end
            p = start

            while p < partition_end:
                if X[X_sample_stride * samples[p] +
                     X_fx_stride * best.feature] <= best.threshold:
                    p += 1

                else:
                    partition_end -= 1

                    tmp = samples[partition_end]
                    samples[partition_end] = samples[p]
                    samples[p] = tmp

        # Respect invariant for constant features: the original order of the
        # elements in features[:n_known_constants] must be preserved for
        # sibling and child nodes
        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)

        # Copy newly found constant features
        memcpy(constant_features + n_known_constants,
               features + n_known_constants,
               sizeof(SIZE_t) * n_found_constants)

        # Return values
        split[0] = best
        n_constant_features[0] = n_total_constants

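# RandomSplitter draws one uniform threshold per sampled feature instead of
# scanning every cut point, the extremely-randomized-trees strategy
# (Geurts et al., 2006): much cheaper per node, with the extra variance
# averaged out by the ensemble.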
cdef class PresortBestSplitter(BaseDenseSplitter):
    """Splitter for finding the best split, using presorting."""

    cdef DTYPE_t* X_old
    cdef np.ndarray X_argsorted
    cdef INT32_t* X_argsorted_ptr
    cdef SIZE_t X_argsorted_stride

    cdef SIZE_t n_total_samples
    cdef unsigned char* sample_mask

    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf,
                  double min_weight_leaf,
                  object random_state):
        # Initialize pointers
        self.X_old = NULL
        self.X_argsorted_ptr = NULL
        self.X_argsorted_stride = 0
        self.sample_mask = NULL

    def __dealloc__(self):
        """Destructor."""
        free(self.sample_mask)

    def __reduce__(self):
        return (PresortBestSplitter, (self.criterion,
                                      self.max_features,
                                      self.min_samples_leaf,
                                      self.min_weight_leaf,
                                      self.random_state), self.__getstate__())

    cdef void init(self, object X,
                   np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                   DOUBLE_t* sample_weight) except *:

        cdef void* sample_mask = NULL

        # Call parent initializer
        BaseDenseSplitter.init(self, X, y, sample_weight)

        cdef np.ndarray X_ndarray = X

        # Pre-sort X
        if self.X_old != self.X:
            self.X_old = self.X
            self.X_argsorted = np.asfortranarray(np.argsort(X_ndarray, axis=0),
                                                 dtype=np.int32)

            self.X_argsorted_ptr = <INT32_t*> self.X_argsorted.data
            self.X_argsorted_stride = (<SIZE_t> self.X_argsorted.strides[1] /
                                       <SIZE_t> self.X_argsorted.itemsize)

        self.n_total_samples = X.shape[0]
        sample_mask = safe_realloc(&self.sample_mask, self.n_total_samples)
        memset(sample_mask, 0, self.n_total_samples)

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find the best split on node samples[start:end]."""
        # Find the best split
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t start = self.start
        cdef SIZE_t end = self.end

        cdef SIZE_t* features = self.features
        cdef SIZE_t* constant_features = self.constant_features
        cdef SIZE_t n_features = self.n_features

        cdef DTYPE_t* X = self.X
        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t X_sample_stride = self.X_sample_stride
        cdef SIZE_t X_fx_stride = self.X_fx_stride
        cdef INT32_t* X_argsorted = self.X_argsorted_ptr
        cdef SIZE_t X_argsorted_stride = self.X_argsorted_stride
        cdef SIZE_t n_total_samples = self.n_total_samples
        cdef unsigned char* sample_mask = self.sample_mask

        cdef SIZE_t max_features = self.max_features
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef UINT32_t* random_state = &self.rand_r_state

        cdef SplitRecord best, current

        cdef SIZE_t f_i = n_features
        cdef SIZE_t f_j, p, tmp
        # Number of features discovered to be constant during the split search
        cdef SIZE_t n_found_constants = 0
        # Number of features known to be constant and drawn without replacement
        cdef SIZE_t n_drawn_constants = 0
        cdef SIZE_t n_known_constants = n_constant_features[0]
        # n_total_constants = n_known_constants + n_found_constants
        cdef SIZE_t n_total_constants = n_known_constants
        cdef SIZE_t n_visited_features = 0
        cdef SIZE_t partition_end
        cdef SIZE_t i, j

        _init_split(&best, end)

        # Set sample mask
        for p in range(start, end):
            sample_mask[samples[p]] = 1

        # Sample up to max_features without replacement using a
        # Fisher-Yates-based algorithm (using the local variables `f_i` and
        # `f_j` to compute a permutation of the `features` array).
        #
        # Skip the CPU intensive evaluation of the impurity criterion for
        # features that were already detected as constant (hence not suitable
        # for good splitting) by ancestor nodes and save the information on
        # newly discovered constant features to spare computation on descendant
        # nodes.
        while (f_i > n_total_constants and  # Stop early if remaining features
                                            # are constant
                (n_visited_features < max_features or
                 # At least one drawn feature must be non constant
                 n_visited_features <= n_found_constants + n_drawn_constants)):
            n_visited_features += 1

            # Loop invariant: elements of features in
            # - [:n_drawn_constant[ holds drawn and known constant features;
            # - [n_drawn_constant:n_known_constant[ holds known constant
            #   features that haven't been drawn yet;
            # - [n_known_constant:n_total_constant[ holds newly found constant
            #   features;
            # - [n_total_constant:f_i[ holds features that haven't been drawn
            #   yet and aren't constant a priori.
            # - [f_i:n_features[ holds features that have been drawn
            #   and aren't constant.

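            # Illustration: with n_features = 6, two known constants (one of
            # them already drawn), one newly found constant and one feature
            # already drawn as non-constant, the layout is
            #   [ drawn const | known const | found const | not drawn | drawn ]
            # with boundaries n_drawn_constants = 1, n_known_constants = 2,
            # n_total_constants = 3 and f_i = 5.
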
            # Draw a feature at random
            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                           random_state)

            if f_j < n_known_constants:
                # f_j is in [n_drawn_constants, n_known_constants[
                tmp = features[f_j]
                features[f_j] = features[n_drawn_constants]
                features[n_drawn_constants] = tmp

                n_drawn_constants += 1

            else:
                # f_j in the interval [n_known_constants, f_i - n_found_constants[
                f_j += n_found_constants
                # f_j in the interval [n_total_constants, f_i[

                current.feature = features[f_j]

                # Extract ordering from X_argsorted
                p = start

                for i in range(n_total_samples):
                    j = X_argsorted[X_argsorted_stride * current.feature + i]
                    if sample_mask[j] == 1:
                        samples[p] = j
                        Xf[p] = X[X_sample_stride * j +
                                  X_fx_stride * current.feature]
                        p += 1

                if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
                    features[f_j] = features[n_total_constants]
                    features[n_total_constants] = current.feature

                    n_found_constants += 1
                    n_total_constants += 1

                else:
                    f_i -= 1
                    features[f_i], features[f_j] = features[f_j], features[f_i]

                    # Evaluate all splits
                    self.criterion.reset()
                    p = start

                    while p < end:
                        while (p + 1 < end and
                               Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD):
                            p += 1

                        # (p + 1 >= end) or (X[samples[p + 1], current.feature] >
                        #                    X[samples[p], current.feature])
                        p += 1
                        # (p >= end) or (X[samples[p], current.feature] >
                        #                X[samples[p - 1], current.feature])

                        if p < end:
                            current.pos = p

                            # Reject if min_samples_leaf is not guaranteed
                            if (((current.pos - start) < min_samples_leaf) or
                                    ((end - current.pos) < min_samples_leaf)):
                                continue

                            self.criterion.update(current.pos)

                            # Reject if min_weight_leaf is not satisfied
                            if ((self.criterion.weighted_n_left < min_weight_leaf) or
                                    (self.criterion.weighted_n_right < min_weight_leaf)):
                                continue

                            current.improvement = self.criterion.impurity_improvement(impurity)

                            if current.improvement > best.improvement:
                                self.criterion.children_impurity(&current.impurity_left,
                                                                 &current.impurity_right)

                                current.threshold = (Xf[p - 1] + Xf[p]) / 2.0
                                if current.threshold == Xf[p]:
                                    current.threshold = Xf[p - 1]

                                best = current  # copy

        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
        if best.pos < end:
            partition_end = end
            p = start

            while p < partition_end:
                if X[X_sample_stride * samples[p] +
                     X_fx_stride * best.feature] <= best.threshold:
                    p += 1

                else:
                    partition_end -= 1

                    tmp = samples[partition_end]
                    samples[partition_end] = samples[p]
                    samples[p] = tmp

        # Reset sample mask
        for p in range(start, end):
            sample_mask[samples[p]] = 0

        # Respect invariant for constant features: the original order of
        # elements in features[:n_known_constants] must be preserved for
        # sibling and child nodes
        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)

        # Copy newly found constant features
        memcpy(constant_features + n_known_constants,
               features + n_known_constants,
               sizeof(SIZE_t) * n_found_constants)

        # Return values
        split[0] = best
        n_constant_features[0] = n_total_constants


cdef class BaseSparseSplitter(Splitter):
    # The sparse splitter works only with csc sparse matrix format
    cdef DTYPE_t* X_data
    cdef INT32_t* X_indices
    cdef INT32_t* X_indptr

    cdef SIZE_t n_total_samples

    cdef SIZE_t* index_to_samples
    cdef SIZE_t* sorted_samples

    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  object random_state):
        # Parent __cinit__ is automatically called

        self.X_data = NULL
        self.X_indices = NULL
        self.X_indptr = NULL

        self.n_total_samples = 0

        self.index_to_samples = NULL
        self.sorted_samples = NULL

    def __dealloc__(self):
        """Deallocate memory."""
        free(self.index_to_samples)
        free(self.sorted_samples)

    cdef void init(self,
                   object X,
                   np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                   DOUBLE_t* sample_weight) except *:
        """Initialize the splitter."""

        # Call parent init
        Splitter.init(self, X, y, sample_weight)

        cdef SIZE_t* samples = self.samples
        cdef SIZE_t n_samples = self.n_samples

        # Initialize X
        cdef np.ndarray[dtype=DTYPE_t, ndim=1] data = X.data
        cdef np.ndarray[dtype=INT32_t, ndim=1] indices = X.indices
        cdef np.ndarray[dtype=INT32_t, ndim=1] indptr = X.indptr
        cdef SIZE_t n_total_samples = X.shape[0]

        self.X_data = <DTYPE_t*> data.data
        self.X_indices = <INT32_t*> indices.data
        self.X_indptr = <INT32_t*> indptr.data
        self.n_total_samples = n_total_samples

        # Initialize auxiliary arrays used to perform the split
        # (safe_realloc takes an element count, not a byte count)
        safe_realloc(&self.index_to_samples, n_total_samples)
        safe_realloc(&self.sorted_samples, n_samples)

        cdef SIZE_t* index_to_samples = self.index_to_samples
        cdef SIZE_t p
        for p in range(n_total_samples):
            index_to_samples[p] = -1

        for p in range(n_samples):
            index_to_samples[samples[p]] = p

    cdef inline SIZE_t _partition(self, double threshold,
                                  SIZE_t end_negative, SIZE_t start_positive,
                                  SIZE_t zero_pos) nogil:
        """Partition samples[start:end] based on threshold."""
        cdef double value
        cdef SIZE_t partition_end
        cdef SIZE_t p

        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t* index_to_samples = self.index_to_samples

        if threshold < 0.:
            p = self.start
            partition_end = end_negative
        elif threshold > 0.:
            p = start_positive
            partition_end = self.end
        else:
            # Data are already split
            return zero_pos

        while p < partition_end:
            value = Xf[p]

            if value <= threshold:
                p += 1

            else:
                partition_end -= 1

                Xf[p] = Xf[partition_end]
                Xf[partition_end] = value
                sparse_swap(index_to_samples, samples, p, partition_end)

        return partition_end

    cdef inline void extract_nnz(self, SIZE_t feature,
                                 SIZE_t* end_negative, SIZE_t* start_positive,
                                 bint* is_samples_sorted) nogil:
        """Extract and partition values for a given feature.

        The extracted values are partitioned between negative values
        Xf[start:end_negative[0]] and positive values Xf[start_positive[0]:end].
        The samples and index_to_samples arrays are modified according to this
        partition.

        The extraction corresponds to the intersection between the arrays
        X_indices[indptr_start:indptr_end] and samples[start:end].
        This is done efficiently using either an index_to_samples based
        approach or a binary search based approach.

        Parameters
        ----------
        feature : SIZE_t
            Index of the feature whose non-zero values should be extracted.

        end_negative, start_positive : SIZE_t*, SIZE_t*
            Return the extracted non-zero values in self.samples[start:end],
            where negative values are in
            self.feature_values[start:end_negative[0]] and positive values
            are in self.feature_values[start_positive[0]:end].

        is_samples_sorted : bint*
            If is_samples_sorted, then self.sorted_samples[start:end] will be
            the sorted version of self.samples[start:end].
        """
        cdef SIZE_t indptr_start = self.X_indptr[feature]
        cdef SIZE_t indptr_end = self.X_indptr[feature + 1]
        cdef SIZE_t n_indices = <SIZE_t>(indptr_end - indptr_start)
        cdef SIZE_t n_samples = self.end - self.start

        # Use binary search if n_samples * log(n_indices) <
        # EXTRACT_NNZ_SWITCH * n_indices and the index_to_samples approach
        # otherwise: O(n_samples * log(n_indices)) is the running time of
        # binary search and O(n_indices) is the running time of the
        # index_to_samples approach.
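        # Worked example of this switch: with n_samples = 10 node samples and
        # n_indices = 10000 stored non-zeros, binary search costs about
        # 10 * log2(10000) ~ 133 operations versus 0.1 * 10000 = 1000, so it
        # wins; with n_samples = 1000 the left-hand side grows to ~13290 and
        # the linear index_to_samples scan is used instead.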
        if ((1 - is_samples_sorted[0]) * n_samples * log(n_samples) +
                n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices):
            extract_nnz_binary_search(self.X_indices, self.X_data,
                                      indptr_start, indptr_end,
                                      self.samples, self.start, self.end,
                                      self.index_to_samples,
                                      self.feature_values,
                                      end_negative, start_positive,
                                      self.sorted_samples, is_samples_sorted)

        # Using an index_to_samples technique to extract non-zero values:
        # index_to_samples is a mapping from X_indices to samples
        else:
            extract_nnz_index_to_samples(self.X_indices, self.X_data,
                                         indptr_start, indptr_end,
                                         self.samples, self.start, self.end,
                                         self.index_to_samples,
                                         self.feature_values,
                                         end_negative, start_positive)


cdef int compare_SIZE_t(const void* a, const void* b) nogil:
    """Comparison function for sort."""
    return <int>((<SIZE_t*>a)[0] - (<SIZE_t*>b)[0])


cdef inline void binary_search(INT32_t* sorted_array,
                               INT32_t start, INT32_t end,
                               SIZE_t value, SIZE_t* index,
                               INT32_t* new_start) nogil:
    """Return the index of value in the sorted array.

    If not found, return -1. new_start is the last pivot + 1.
    """
    cdef INT32_t pivot
    index[0] = -1
    while start < end:
        pivot = start + (end - start) / 2

        if sorted_array[pivot] == value:
            index[0] = pivot
            start = pivot + 1
            break

        if sorted_array[pivot] < value:
            start = pivot + 1
        else:
            end = pivot
    new_start[0] = start


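# Note that binary_search above reports a hit through `index` and always
# advances `new_start` past the last pivot examined, so a caller that probes
# an increasing sequence of values (as extract_nnz_binary_search does) can
# resume each lookup from `new_start` instead of rescanning the whole index
# range.

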
cdef inline void extract_nnz_index_to_samples(INT32_t* X_indices,
                                              DTYPE_t* X_data,
                                              INT32_t indptr_start,
                                              INT32_t indptr_end,
                                              SIZE_t* samples,
                                              SIZE_t start,
                                              SIZE_t end,
                                              SIZE_t* index_to_samples,
                                              DTYPE_t* Xf,
                                              SIZE_t* end_negative,
                                              SIZE_t* start_positive) nogil:
    """Extract and partition values for a feature using index_to_samples.

    Complexity is O(indptr_end - indptr_start).
    """
    cdef INT32_t k
    cdef SIZE_t index
    cdef SIZE_t end_negative_ = start
    cdef SIZE_t start_positive_ = end

    for k in range(indptr_start, indptr_end):
        if start <= index_to_samples[X_indices[k]] < end:
            if X_data[k] > 0:
                start_positive_ -= 1
                Xf[start_positive_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, start_positive_)

            elif X_data[k] < 0:
                Xf[end_negative_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, end_negative_)
                end_negative_ += 1

    # Returned values
    end_negative[0] = end_negative_
    start_positive[0] = start_positive_


cdef inline void extract_nnz_binary_search(INT32_t* X_indices,
                                           DTYPE_t* X_data,
                                           INT32_t indptr_start,
                                           INT32_t indptr_end,
                                           SIZE_t* samples,
                                           SIZE_t start,
                                           SIZE_t end,
                                           SIZE_t* index_to_samples,
                                           DTYPE_t* Xf,
                                           SIZE_t* end_negative,
                                           SIZE_t* start_positive,
                                           SIZE_t* sorted_samples,
                                           bint* is_samples_sorted) nogil:
    """Extract and partition values for a given feature using binary search.

    If n_samples = end - start and n_indices = indptr_end - indptr_start,
    the complexity is

        O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) +
          n_samples * log(n_indices)).
    """
    cdef SIZE_t n_samples

    if not is_samples_sorted[0]:
        n_samples = end - start
        memcpy(sorted_samples + start, samples + start,
               n_samples * sizeof(SIZE_t))
        qsort(sorted_samples + start, n_samples, sizeof(SIZE_t),
              compare_SIZE_t)
        is_samples_sorted[0] = 1

    while (indptr_start < indptr_end and
           sorted_samples[start] > X_indices[indptr_start]):
        indptr_start += 1

    while (indptr_start < indptr_end and
           sorted_samples[end - 1] < X_indices[indptr_end - 1]):
        indptr_end -= 1

    cdef SIZE_t p = start
    cdef SIZE_t index
    cdef SIZE_t k
    cdef SIZE_t end_negative_ = start
    cdef SIZE_t start_positive_ = end

    while (p < end and indptr_start < indptr_end):
        # Find index of sorted_samples[p] in X_indices
        binary_search(X_indices, indptr_start, indptr_end,
                      sorted_samples[p], &k, &indptr_start)

        if k != -1:
            # If k != -1, we have found a non-zero value

            if X_data[k] > 0:
                start_positive_ -= 1
                Xf[start_positive_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, start_positive_)

            elif X_data[k] < 0:
                Xf[end_negative_] = X_data[k]
                index = index_to_samples[X_indices[k]]
                sparse_swap(index_to_samples, samples, index, end_negative_)
                end_negative_ += 1
        p += 1

    # Returned values
    end_negative[0] = end_negative_
    start_positive[0] = start_positive_


cdef inline void sparse_swap(SIZE_t* index_to_samples, SIZE_t* samples,
                             SIZE_t pos_1, SIZE_t pos_2) nogil:
    """Swap sample pos_1 and pos_2 preserving sparse invariant."""
    samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1]
    index_to_samples[samples[pos_1]] = pos_1
    index_to_samples[samples[pos_2]] = pos_2


cdef class BestSparseSplitter(BaseSparseSplitter):
    """Splitter for finding the best split, using the sparse data."""

    def __reduce__(self):
        return (BestSparseSplitter, (self.criterion,
                                     self.max_features,
                                     self.min_samples_leaf,
                                     self.min_weight_leaf,
                                     self.random_state), self.__getstate__())

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find the best split on node samples[start:end], using sparse
        features.
        """
        # Find the best split
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t start = self.start
        cdef SIZE_t end = self.end

        cdef INT32_t* X_indices = self.X_indices
        cdef INT32_t* X_indptr = self.X_indptr
        cdef DTYPE_t* X_data = self.X_data

        cdef SIZE_t* features = self.features
        cdef SIZE_t* constant_features = self.constant_features
        cdef SIZE_t n_features = self.n_features

        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t* sorted_samples = self.sorted_samples
        cdef SIZE_t* index_to_samples = self.index_to_samples
        cdef SIZE_t max_features = self.max_features
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef UINT32_t* random_state = &self.rand_r_state

        cdef SplitRecord best, current
        _init_split(&best, end)

        cdef SIZE_t f_i = n_features
        cdef SIZE_t f_j, p, tmp
        cdef SIZE_t n_visited_features = 0
        # Number of features discovered to be constant during the split search
        cdef SIZE_t n_found_constants = 0
        # Number of features known to be constant and drawn without replacement
        cdef SIZE_t n_drawn_constants = 0
        cdef SIZE_t n_known_constants = n_constant_features[0]
        # n_total_constants = n_known_constants + n_found_constants
        cdef SIZE_t n_total_constants = n_known_constants
        cdef DTYPE_t current_feature_value

        cdef SIZE_t p_next
        cdef SIZE_t p_prev
        cdef bint is_samples_sorted = 0  # indicates whether sorted_samples
                                         # is initialized

        # We assume implicitly that end_positive = end and
        # start_negative = start
        cdef SIZE_t start_positive
        cdef SIZE_t end_negative

        # Sample up to max_features without replacement using a
        # Fisher-Yates-based algorithm (using the local variables `f_i` and
        # `f_j` to compute a permutation of the `features` array).
        #
        # Skip the CPU intensive evaluation of the impurity criterion for
        # features that were already detected as constant (hence not suitable
        # for good splitting) by ancestor nodes and save the information on
        # newly discovered constant features to spare computation on descendant
        # nodes.
        while (f_i > n_total_constants and  # Stop early if remaining features
                                            # are constant
                (n_visited_features < max_features or
                 # At least one drawn feature must be non constant
                 n_visited_features <= n_found_constants + n_drawn_constants)):

            n_visited_features += 1

            # Loop invariant: elements of features in
            # - [:n_drawn_constant[ holds drawn and known constant features;
            # - [n_drawn_constant:n_known_constant[ holds known constant
            #   features that haven't been drawn yet;
            # - [n_known_constant:n_total_constant[ holds newly found constant
            #   features;
            # - [n_total_constant:f_i[ holds features that haven't been drawn
            #   yet and aren't constant a priori.
            # - [f_i:n_features[ holds features that have been drawn
            #   and aren't constant.

            # Draw a feature at random
            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                           random_state)

            if f_j < n_known_constants:
                # f_j in the interval [n_drawn_constants, n_known_constants[
                tmp = features[f_j]
                features[f_j] = features[n_drawn_constants]
                features[n_drawn_constants] = tmp

                n_drawn_constants += 1

            else:
                # f_j in the interval [n_known_constants, f_i - n_found_constants[
                f_j += n_found_constants
                # f_j in the interval [n_total_constants, f_i[

                current.feature = features[f_j]
                self.extract_nnz(current.feature,
                                 &end_negative, &start_positive,
                                 &is_samples_sorted)

                # Sort the positive and negative parts of `Xf`
                sort(Xf + start, samples + start, end_negative - start)
                sort(Xf + start_positive, samples + start_positive,
                     end - start_positive)

                # Update index_to_samples to take into account the sort
                for p in range(start, end_negative):
                    index_to_samples[samples[p]] = p
                for p in range(start_positive, end):
                    index_to_samples[samples[p]] = p

                # Add one or two zeros in Xf, if there is any
                if end_negative < start_positive:
                    start_positive -= 1
                    Xf[start_positive] = 0.

                    if end_negative != start_positive:
                        Xf[end_negative] = 0.
                        end_negative += 1

                if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
                    features[f_j] = features[n_total_constants]
                    features[n_total_constants] = current.feature

                    n_found_constants += 1
                    n_total_constants += 1

                else:
                    f_i -= 1
                    features[f_i], features[f_j] = features[f_j], features[f_i]

                    # Evaluate all splits
                    self.criterion.reset()
                    p = start

                    while p < end:
                        if p + 1 != end_negative:
                            p_next = p + 1
                        else:
                            p_next = start_positive

                        while (p_next < end and
                               Xf[p_next] <= Xf[p] + FEATURE_THRESHOLD):
                            p = p_next
                            if p + 1 != end_negative:
                                p_next = p + 1
                            else:
                                p_next = start_positive

                        # (p_next >= end) or (X[samples[p_next], current.feature] >
                        #                     X[samples[p], current.feature])
                        p_prev = p
                        p = p_next
                        # (p >= end) or (X[samples[p], current.feature] >
                        #                X[samples[p_prev], current.feature])

                        if p < end:
                            current.pos = p

                            # Reject if min_samples_leaf is not guaranteed
                            if (((current.pos - start) < min_samples_leaf) or
                                    ((end - current.pos) < min_samples_leaf)):
                                continue

                            self.criterion.update(current.pos)

                            # Reject if min_weight_leaf is not satisfied
                            if ((self.criterion.weighted_n_left < min_weight_leaf) or
                                    (self.criterion.weighted_n_right < min_weight_leaf)):
                                continue

                            current.improvement = self.criterion.impurity_improvement(impurity)
                            if current.improvement > best.improvement:
                                self.criterion.children_impurity(&current.impurity_left,
                                                                 &current.impurity_right)

                                current.threshold = (Xf[p_prev] + Xf[p]) / 2.0
                                if current.threshold == Xf[p]:
                                    current.threshold = Xf[p_prev]

                                best = current

        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
        if best.pos < end:
            self.extract_nnz(best.feature, &end_negative, &start_positive,
                             &is_samples_sorted)

            self._partition(best.threshold, end_negative, start_positive,
                            best.pos)

        # Respect invariant for constant features: the original order of
        # elements in features[:n_known_constants] must be preserved for
        # sibling and child nodes
        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)

        # Copy newly found constant features
        memcpy(constant_features + n_known_constants,
               features + n_known_constants,
               sizeof(SIZE_t) * n_found_constants)

        # Return values
        split[0] = best
        n_constant_features[0] = n_total_constants


cdef class RandomSparseSplitter(BaseSparseSplitter):
    """Splitter for finding a random split, using the sparse data."""

    def __reduce__(self):
        return (RandomSparseSplitter, (self.criterion,
                                       self.max_features,
                                       self.min_samples_leaf,
                                       self.min_weight_leaf,
                                       self.random_state), self.__getstate__())

    cdef void node_split(self, double impurity, SplitRecord* split,
                         SIZE_t* n_constant_features) nogil:
        """Find a random split on node samples[start:end], using sparse
        features.
        """
        # Find the best split
        cdef SIZE_t* samples = self.samples
        cdef SIZE_t start = self.start
        cdef SIZE_t end = self.end

        cdef INT32_t* X_indices = self.X_indices
        cdef INT32_t* X_indptr = self.X_indptr
        cdef DTYPE_t* X_data = self.X_data

        cdef SIZE_t* features = self.features
        cdef SIZE_t* constant_features = self.constant_features
        cdef SIZE_t n_features = self.n_features

        cdef DTYPE_t* Xf = self.feature_values
        cdef SIZE_t* sorted_samples = self.sorted_samples
        cdef SIZE_t* index_to_samples = self.index_to_samples
        cdef SIZE_t max_features = self.max_features
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef UINT32_t* random_state = &self.rand_r_state

        cdef SplitRecord best, current
        _init_split(&best, end)

        cdef DTYPE_t current_feature_value

        cdef SIZE_t f_i = n_features
        cdef SIZE_t f_j, p, tmp
        cdef SIZE_t n_visited_features = 0
        # Number of features discovered to be constant during the split search
        cdef SIZE_t n_found_constants = 0
        # Number of features known to be constant and drawn without replacement
        cdef SIZE_t n_drawn_constants = 0
        cdef SIZE_t n_known_constants = n_constant_features[0]
        # n_total_constants = n_known_constants + n_found_constants
        cdef SIZE_t n_total_constants = n_known_constants
        cdef SIZE_t partition_end

        cdef DTYPE_t min_feature_value
        cdef DTYPE_t max_feature_value

        cdef bint is_samples_sorted = 0  # indicates whether sorted_samples
                                         # is initialized

        # We assume implicitly that end_positive = end and
        # start_negative = start
        cdef SIZE_t start_positive
        cdef SIZE_t end_negative

        # Sample up to max_features without replacement using a
        # Fisher-Yates-based algorithm (using the local variables `f_i` and
        # `f_j` to compute a permutation of the `features` array).
        #
        # Skip the CPU intensive evaluation of the impurity criterion for
        # features that were already detected as constant (hence not suitable
        # for good splitting) by ancestor nodes and save the information on
        # newly discovered constant features to spare computation on descendant
        # nodes.
        while (f_i > n_total_constants and  # Stop early if remaining features
                                            # are constant
                (n_visited_features < max_features or
                 # At least one drawn feature must be non constant
                 n_visited_features <= n_found_constants + n_drawn_constants)):

            n_visited_features += 1

            # Loop invariant: elements of features in
            # - [:n_drawn_constant[ holds drawn and known constant features;
            # - [n_drawn_constant:n_known_constant[ holds known constant
            #   features that haven't been drawn yet;
            # - [n_known_constant:n_total_constant[ holds newly found constant
            #   features;
            # - [n_total_constant:f_i[ holds features that haven't been drawn
            #   yet and aren't constant a priori.
            # - [f_i:n_features[ holds features that have been drawn
            #   and aren't constant.

            # Draw a feature at random
            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
                           random_state)

            if f_j < n_known_constants:
                # f_j in the interval [n_drawn_constants, n_known_constants[
                tmp = features[f_j]
                features[f_j] = features[n_drawn_constants]
                features[n_drawn_constants] = tmp

                n_drawn_constants += 1

            else:
                # f_j in the interval [n_known_constants, f_i - n_found_constants[
                f_j += n_found_constants
                # f_j in the interval [n_total_constants, f_i[

                current.feature = features[f_j]

                self.extract_nnz(current.feature,
                                 &end_negative, &start_positive,
                                 &is_samples_sorted)

                # Add one or two zeros in Xf, if there is any
                if end_negative < start_positive:
                    start_positive -= 1
                    Xf[start_positive] = 0.

                    if end_negative != start_positive:
                        Xf[end_negative] = 0.
                        end_negative += 1

                # Find min, max in Xf[start:end_negative]
                min_feature_value = Xf[start]
                max_feature_value = min_feature_value

                for p in range(start, end_negative):
                    current_feature_value = Xf[p]

                    if current_feature_value < min_feature_value:
                        min_feature_value = current_feature_value
                    elif current_feature_value > max_feature_value:
                        max_feature_value = current_feature_value

                # Update min, max given Xf[start_positive:end]
                for p in range(start_positive, end):
                    current_feature_value = Xf[p]

                    if current_feature_value < min_feature_value:
                        min_feature_value = current_feature_value
                    elif current_feature_value > max_feature_value:
                        max_feature_value = current_feature_value

                if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
                    features[f_j] = features[n_total_constants]
                    features[n_total_constants] = current.feature

                    n_found_constants += 1
                    n_total_constants += 1

                else:
                    f_i -= 1
                    features[f_i], features[f_j] = features[f_j], features[f_i]

                    # Draw a random threshold
                    current.threshold = rand_uniform(min_feature_value,
                                                     max_feature_value,
                                                     random_state)

                    if current.threshold == max_feature_value:
                        current.threshold = min_feature_value

                    # Partition
                    current.pos = self._partition(current.threshold,
                                                  end_negative,
                                                  start_positive,
                                                  start_positive +
                                                  (Xf[start_positive] == 0.))

                    # Reject if min_samples_leaf is not guaranteed
                    if (((current.pos - start) < min_samples_leaf) or
                            ((end - current.pos) < min_samples_leaf)):
                        continue

                    # Evaluate split
                    self.criterion.reset()
                    self.criterion.update(current.pos)

                    # Reject if min_weight_leaf is not satisfied
                    if ((self.criterion.weighted_n_left < min_weight_leaf) or
                            (self.criterion.weighted_n_right < min_weight_leaf)):
                        continue

                    current.improvement = self.criterion.impurity_improvement(impurity)

                    if current.improvement > best.improvement:
                        self.criterion.children_impurity(&current.impurity_left,
                                                         &current.impurity_right)
                        best = current

        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
        if best.pos < end and current.feature != best.feature:
            self.extract_nnz(best.feature, &end_negative, &start_positive,
                             &is_samples_sorted)

            self._partition(best.threshold, end_negative, start_positive,
                            best.pos)

        # Respect invariant for constant features: the original order of
        # elements in features[:n_known_constants] must be preserved for
        # sibling and child nodes
        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)

        # Copy newly found constant features
        memcpy(constant_features + n_known_constants,
               features + n_known_constants,
               sizeof(SIZE_t) * n_found_constants)

        # Return values
        split[0] = best
        n_constant_features[0] = n_total_constants


# =============================================================================
# Tree builders
# =============================================================================

cdef class TreeBuilder:
    """Interface for different tree building strategies."""

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=None):
        """Build a decision tree from the training set (X, y)."""
        pass

    cdef inline _check_input(self, object X, np.ndarray y,
                             np.ndarray sample_weight):
        """Check input dtype, layout and format."""
        if X.dtype != DTYPE:
            # since we have to copy, we make it Fortran-ordered for efficiency
            X = np.asfortranarray(X, dtype=DTYPE)

        if y.dtype != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if (sample_weight is not None and
                (sample_weight.dtype != DOUBLE or
                 not sample_weight.flags.contiguous)):
            sample_weight = np.asarray(sample_weight, dtype=DOUBLE,
                                       order="C")

        return X, y, sample_weight


# Depth first builder ---------------------------------------------------------

cdef class DepthFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in depth-first fashion."""

    def __cinit__(self, Splitter splitter, SIZE_t min_samples_split,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  SIZE_t max_depth):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=None):
        """Build a decision tree from the training set (X, y)."""

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        cdef DOUBLE_t* sample_weight_ptr = NULL
        if sample_weight is not None:
            sample_weight_ptr = <DOUBLE_t*> sample_weight.data

        # Initial capacity
        cdef int init_capacity

        if tree.max_depth <= 10:
            init_capacity = (2 ** (tree.max_depth + 1)) - 1
        else:
            init_capacity = 2047

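        # A complete binary tree of depth d has 2 ** (d + 1) - 1 nodes, so
        # for max_depth = 10 this preallocates 2047 nodes; deeper (or
        # unbounded) trees start from the same 2047 nodes and grow by
        # doubling in _resize_c as nodes are added.
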
        tree._resize(init_capacity)

        # Parameters
        cdef Splitter splitter = self.splitter
        cdef SIZE_t max_depth = self.max_depth
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef SIZE_t min_samples_split = self.min_samples_split

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight_ptr)

        cdef SIZE_t start
        cdef SIZE_t end
        cdef SIZE_t depth
        cdef SIZE_t parent
        cdef bint is_left
        cdef SIZE_t n_node_samples = splitter.n_samples
        cdef double weighted_n_samples = splitter.weighted_n_samples
        cdef double weighted_n_node_samples
        cdef SplitRecord split
        cdef SIZE_t node_id

        cdef double threshold
        cdef double impurity = INFINITY
        cdef SIZE_t n_constant_features
        cdef bint is_leaf
        cdef bint first = 1
        cdef SIZE_t max_depth_seen = -1
        cdef int rc = 0

        cdef Stack stack = Stack(INITIAL_STACK_SIZE)
        cdef StackRecord stack_record

        # push root node onto stack
        rc = stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0)
        if rc == -1:
            # got return code -1 - out-of-memory
            raise MemoryError()

        with nogil:
            while not stack.is_empty():
                stack.pop(&stack_record)

                start = stack_record.start
                end = stack_record.end
                depth = stack_record.depth
                parent = stack_record.parent
                is_left = stack_record.is_left
                impurity = stack_record.impurity
                n_constant_features = stack_record.n_constant_features

                n_node_samples = end - start
                splitter.node_reset(start, end, &weighted_n_node_samples)

                is_leaf = ((depth >= max_depth) or
                           (n_node_samples < min_samples_split) or
                           (n_node_samples < 2 * min_samples_leaf) or
                           (weighted_n_node_samples < min_weight_leaf))

                if first:
                    impurity = splitter.node_impurity()
                    first = 0

                is_leaf = is_leaf or (impurity <= MIN_IMPURITY_SPLIT)

                if not is_leaf:
                    splitter.node_split(impurity, &split, &n_constant_features)
                    is_leaf = is_leaf or (split.pos >= end)

                node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
                                         split.threshold, impurity, n_node_samples,
                                         weighted_n_node_samples)

                if is_leaf:
                    # Don't store value for internal nodes
                    splitter.node_value(tree.value +
                                        node_id * tree.value_stride)

                else:
                    # Push right child on stack
                    rc = stack.push(split.pos, end, depth + 1, node_id, 0,
                                    split.impurity_right, n_constant_features)
                    if rc == -1:
                        break

                    # Push left child on stack
                    rc = stack.push(start, split.pos, depth + 1, node_id, 1,
                                    split.impurity_left, n_constant_features)
                    if rc == -1:
                        break

                if depth > max_depth_seen:
                    max_depth_seen = depth

            if rc >= 0:
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen
        if rc == -1:
            raise MemoryError()


# Best first builder ----------------------------------------------------------

cdef inline int _add_to_frontier(PriorityHeapRecord* rec,
                                 PriorityHeap frontier) nogil:
    """Adds record ``rec`` to the priority queue ``frontier``; returns -1
    on memory-error."""
    return frontier.push(rec.node_id, rec.start, rec.end, rec.pos, rec.depth,
                         rec.is_leaf, rec.improvement, rec.impurity,
                         rec.impurity_left, rec.impurity_right)


cdef class BestFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in best-first fashion.

    The best node to expand is given by the node at the frontier that has the
    highest impurity improvement.

    NOTE: this TreeBuilder will ignore ``tree.max_depth``.
    """
    cdef SIZE_t max_leaf_nodes

    def __cinit__(self, Splitter splitter, SIZE_t min_samples_split,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  SIZE_t max_depth, SIZE_t max_leaf_nodes):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=None):
        """Build a decision tree from the training set (X, y)."""

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        cdef DOUBLE_t* sample_weight_ptr = NULL
        if sample_weight is not None:
            sample_weight_ptr = <DOUBLE_t*> sample_weight.data

        # Parameters
        cdef Splitter splitter = self.splitter
        cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes
        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
        cdef double min_weight_leaf = self.min_weight_leaf
        cdef SIZE_t min_samples_split = self.min_samples_split

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight_ptr)

        cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE)
        cdef PriorityHeapRecord record
        cdef PriorityHeapRecord split_node_left
        cdef PriorityHeapRecord split_node_right

        cdef SIZE_t n_node_samples = splitter.n_samples
        cdef SIZE_t max_split_nodes = max_leaf_nodes - 1
        cdef bint is_leaf
        cdef SIZE_t max_depth_seen = -1
        cdef int rc = 0
        cdef Node* node

        # Initial capacity
        cdef SIZE_t init_capacity = max_split_nodes + max_leaf_nodes
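        # A binary tree with max_leaf_nodes leaves has at most
        # max_leaf_nodes - 1 internal (split) nodes, so
        # init_capacity = max_split_nodes + max_leaf_nodes
        #               = 2 * max_leaf_nodes - 1
        # is enough to hold the fully grown tree without resizing.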
        tree._resize(init_capacity)

        with nogil:
            # add root to frontier
            rc = self._add_split_node(splitter, tree, 0, n_node_samples,
                                      INFINITY, IS_FIRST, IS_LEFT, NULL, 0,
                                      &split_node_left)
            if rc >= 0:
                rc = _add_to_frontier(&split_node_left, frontier)
        if rc == -1:
            raise MemoryError()

        with nogil:
            while not frontier.is_empty():
                frontier.pop(&record)

                node = &tree.nodes[record.node_id]
                is_leaf = (record.is_leaf or max_split_nodes <= 0)

                if is_leaf:
                    # Node is not expandable; set node as leaf
                    node.left_child = _TREE_LEAF
                    node.right_child = _TREE_LEAF
                    node.feature = _TREE_UNDEFINED
                    node.threshold = _TREE_UNDEFINED

                else:
                    # Node is expandable

                    # Decrement number of split nodes available
                    max_split_nodes -= 1

                    # Compute left split node
                    rc = self._add_split_node(splitter, tree,
                                              record.start, record.pos,
                                              record.impurity_left,
                                              IS_NOT_FIRST, IS_LEFT, node,
                                              record.depth + 1,
                                              &split_node_left)
                    if rc == -1:
                        break

                    # tree.nodes may have changed
                    node = &tree.nodes[record.node_id]

                    # Compute right split node
                    rc = self._add_split_node(splitter, tree, record.pos,
                                              record.end,
                                              record.impurity_right,
                                              IS_NOT_FIRST, IS_NOT_LEFT, node,
                                              record.depth + 1,
                                              &split_node_right)
                    if rc == -1:
                        break

                    # Add nodes to queue
                    rc = _add_to_frontier(&split_node_left, frontier)
                    if rc == -1:
                        break

                    rc = _add_to_frontier(&split_node_right, frontier)
                    if rc == -1:
                        break

                if record.depth > max_depth_seen:
                    max_depth_seen = record.depth

            if rc >= 0:
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen

        if rc == -1:
            raise MemoryError()

    cdef inline int _add_split_node(self, Splitter splitter, Tree tree,
                                    SIZE_t start, SIZE_t end, double impurity,
                                    bint is_first, bint is_left, Node* parent,
                                    SIZE_t depth,
                                    PriorityHeapRecord* res) nogil:
        """Adds node w/ partition ``[start, end)`` to the frontier."""
        cdef SplitRecord split
        cdef SIZE_t node_id
        cdef SIZE_t n_node_samples
        cdef SIZE_t n_constant_features = 0
        cdef double weighted_n_samples = splitter.weighted_n_samples
        cdef double weighted_n_node_samples
        cdef bint is_leaf
        cdef SIZE_t n_left, n_right
        cdef double imp_diff

        splitter.node_reset(start, end, &weighted_n_node_samples)

        if is_first:
            impurity = splitter.node_impurity()

        n_node_samples = end - start
        is_leaf = ((depth > self.max_depth) or
                   (n_node_samples < self.min_samples_split) or
                   (n_node_samples < 2 * self.min_samples_leaf) or
                   (weighted_n_node_samples < self.min_weight_leaf) or
                   (impurity <= MIN_IMPURITY_SPLIT))

        if not is_leaf:
            splitter.node_split(impurity, &split, &n_constant_features)
            is_leaf = is_leaf or (split.pos >= end)

        node_id = tree._add_node(parent - tree.nodes
                                 if parent != NULL
                                 else _TREE_UNDEFINED,
                                 is_left, is_leaf,
                                 split.feature, split.threshold, impurity,
                                 n_node_samples, weighted_n_node_samples)
        if node_id == <SIZE_t>(-1):
            return -1

        # compute values also for split nodes (might become leaves later).
        splitter.node_value(tree.value + node_id * tree.value_stride)

        res.node_id = node_id
        res.start = start
        res.end = end
        res.depth = depth
        res.impurity = impurity

        if not is_leaf:
            # is split node
            res.pos = split.pos
            res.is_leaf = 0
            res.improvement = split.improvement
            res.impurity_left = split.impurity_left
            res.impurity_right = split.impurity_right

        else:
            # is leaf => 0 improvement
            res.pos = end
            res.is_leaf = 1
            res.improvement = 0.0
            res.impurity_left = impurity
            res.impurity_right = impurity

        return 0


# =============================================================================
# Tree
# =============================================================================

cdef class Tree:
    """Array-based representation of a binary decision tree.

    The binary tree is represented as a number of parallel arrays. The i-th
    element of each array holds information about the node `i`. Node 0 is the
    tree's root. You can find a detailed description of all arrays in
    `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or
    split nodes. In this case the values of nodes of the other type are
    arbitrary!

    Attributes
    ----------
    node_count : int
        The number of nodes (internal nodes + leaves) in the tree.

    capacity : int
        The current capacity (i.e., size) of the arrays, which is at least as
        great as `node_count`.

    max_depth : int
        The maximal depth of the tree.

    children_left : array of int, shape [node_count]
        children_left[i] holds the node id of the left child of node i.
        For leaves, children_left[i] == TREE_LEAF. Otherwise,
        children_left[i] > i. This child handles the case where
        X[:, feature[i]] <= threshold[i].

    children_right : array of int, shape [node_count]
        children_right[i] holds the node id of the right child of node i.
        For leaves, children_right[i] == TREE_LEAF. Otherwise,
        children_right[i] > i. This child handles the case where
        X[:, feature[i]] > threshold[i].

    feature : array of int, shape [node_count]
        feature[i] holds the feature to split on, for the internal node i.

    threshold : array of double, shape [node_count]
        threshold[i] holds the threshold for the internal node i.

    value : array of double, shape [node_count, n_outputs, max_n_classes]
        Contains the constant prediction value of each node.

    impurity : array of double, shape [node_count]
        impurity[i] holds the impurity (i.e., the value of the splitting
        criterion) at node i.

    n_node_samples : array of int, shape [node_count]
        n_node_samples[i] holds the number of training samples reaching node i.

    weighted_n_node_samples : array of double, shape [node_count]
        weighted_n_node_samples[i] holds the weighted number of training
        samples reaching node i.
    """

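    # Illustration (hypothetical usage, mirroring what _apply_dense does in C
    # below): a single sample `x` can be routed through the parallel arrays
    # from Python as
    #
    #     node_id = 0
    #     while tree.children_left[node_id] != TREE_LEAF:
    #         if x[tree.feature[node_id]] <= tree.threshold[node_id]:
    #             node_id = tree.children_left[node_id]
    #         else:
    #             node_id = tree.children_right[node_id]
    #     prediction = tree.value[node_id]
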
    # Wrap for outside world.
    # WARNING: these reference the current `nodes` and `value` buffers, which
    # must not be freed by a subsequent memory allocation.
    # (i.e. through `_resize` or `__setstate__`)
    property n_classes:
        def __get__(self):
            # it's small; copy for memory safety
            return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs).copy()

    property children_left:
        def __get__(self):
            return self._get_node_ndarray()['left_child'][:self.node_count]

    property children_right:
        def __get__(self):
            return self._get_node_ndarray()['right_child'][:self.node_count]

    property feature:
        def __get__(self):
            return self._get_node_ndarray()['feature'][:self.node_count]

    property threshold:
        def __get__(self):
            return self._get_node_ndarray()['threshold'][:self.node_count]

    property impurity:
        def __get__(self):
            return self._get_node_ndarray()['impurity'][:self.node_count]

    property n_node_samples:
        def __get__(self):
            return self._get_node_ndarray()['n_node_samples'][:self.node_count]

    property weighted_n_node_samples:
        def __get__(self):
            return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count]

    property value:
        def __get__(self):
            return self._get_value_ndarray()[:self.node_count]

    def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes,
                  int n_outputs):
        """Constructor."""
        # Input/Output layout
        self.n_features = n_features
        self.n_outputs = n_outputs
        self.n_classes = NULL
        safe_realloc(&self.n_classes, n_outputs)

        self.max_n_classes = np.max(n_classes)
        self.value_stride = n_outputs * self.max_n_classes

        cdef SIZE_t k
        for k in range(n_outputs):
            self.n_classes[k] = n_classes[k]

        # Inner structures
        self.max_depth = 0
        self.node_count = 0
        self.capacity = 0
        self.value = NULL
        self.nodes = NULL

    def __dealloc__(self):
        """Destructor."""
        # Free all inner structures
        free(self.n_classes)
        free(self.value)
        free(self.nodes)

    def __reduce__(self):
        """Reduce re-implementation, for pickling."""
        return (Tree, (self.n_features,
                       sizet_ptr_to_ndarray(self.n_classes, self.n_outputs),
                       self.n_outputs), self.__getstate__())

    def __getstate__(self):
        """Getstate re-implementation, for pickling."""
        d = {}
        d["node_count"] = self.node_count
        d["nodes"] = self._get_node_ndarray()
        d["values"] = self._get_value_ndarray()
        return d

    def __setstate__(self, d):
        """Setstate re-implementation, for unpickling."""
        self.node_count = d["node_count"]

        if 'nodes' not in d:
            raise ValueError('You have loaded a Tree version which '
                             'cannot be imported')

        node_ndarray = d['nodes']
        value_ndarray = d['values']

        value_shape = (node_ndarray.shape[0], self.n_outputs,
                       self.max_n_classes)
        if (node_ndarray.ndim != 1 or
                node_ndarray.dtype != NODE_DTYPE or
                not node_ndarray.flags.c_contiguous or
                value_ndarray.shape != value_shape or
                not value_ndarray.flags.c_contiguous or
                value_ndarray.dtype != np.float64):
            raise ValueError('Did not recognise loaded array layout')

        self.capacity = node_ndarray.shape[0]
        if self._resize_c(self.capacity) != 0:
            raise MemoryError("resizing tree to %d" % self.capacity)
        memcpy(self.nodes, (<np.ndarray> node_ndarray).data,
               self.capacity * sizeof(Node))
        memcpy(self.value, (<np.ndarray> value_ndarray).data,
               self.capacity * self.value_stride * sizeof(double))

    cdef void _resize(self, SIZE_t capacity) except *:
        """Resize all inner arrays to `capacity`; if `capacity` == -1, then
        double the size of the inner arrays."""
        if self._resize_c(capacity) != 0:
            raise MemoryError()

    # XXX using (size_t)(-1) is ugly, but SIZE_MAX is not available in C89
    # (i.e., older MSVC).
    cdef int _resize_c(self, SIZE_t capacity=<SIZE_t>(-1)) nogil:
        """Guts of _resize. Returns 0 for success, -1 for error."""
        if capacity == self.capacity and self.nodes != NULL:
            return 0

        if capacity == <SIZE_t>(-1):
            if self.capacity == 0:
                capacity = 3  # default initial value
            else:
                capacity = 2 * self.capacity

        # XXX no safe_realloc here because we need to grab the GIL
        cdef void* ptr = realloc(self.nodes, capacity * sizeof(Node))
        if ptr == NULL:
            return -1
        self.nodes = <Node*> ptr
        ptr = realloc(self.value,
                      capacity * self.value_stride * sizeof(double))
        if ptr == NULL:
            return -1
        self.value = <double*> ptr

        # value memory is initialised to 0 to enable classifier argmax
        if capacity > self.capacity:
            memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                   (capacity - self.capacity) * self.value_stride *
                   sizeof(double))

        # if capacity smaller than node_count, adjust the counter
        if capacity < self.node_count:
            self.node_count = capacity

        self.capacity = capacity
        return 0

    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, double threshold, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_node_samples) nogil:
        """Add a node to the tree.

        The new node registers itself as the child of its parent.

        Returns (size_t)(-1) on error.
        """
        cdef SIZE_t node_id = self.node_count

        if node_id >= self.capacity:
            if self._resize_c() != 0:
                return <SIZE_t>(-1)

        cdef Node* node = &self.nodes[node_id]
        node.impurity = impurity
        node.n_node_samples = n_node_samples
        node.weighted_n_node_samples = weighted_n_node_samples

        if parent != _TREE_UNDEFINED:
            if is_left:
                self.nodes[parent].left_child = node_id
            else:
                self.nodes[parent].right_child = node_id

        if is_leaf:
            node.left_child = _TREE_LEAF
            node.right_child = _TREE_LEAF
            node.feature = _TREE_UNDEFINED
            node.threshold = _TREE_UNDEFINED

        else:
            # left_child and right_child will be set later
            node.feature = feature
            node.threshold = threshold

        self.node_count += 1

        return node_id

    cpdef np.ndarray predict(self, object X):
        """Predict target for X."""
        out = self._get_value_ndarray().take(self.apply(X), axis=0,
                                             mode='clip')
        if self.n_outputs == 1:
            out = out.reshape(X.shape[0], self.max_n_classes)
        return out

    cpdef np.ndarray apply(self, object X):
        """Finds the terminal region (=leaf node) for each sample in X."""
        return self._apply_dense(X)

    cdef inline np.ndarray _apply_dense(self, object X):
        """Finds the terminal region (=leaf node) for each sample in X."""

        # Check input
        if not isinstance(X, np.ndarray):
            raise ValueError("X should be in np.ndarray format, got %s"
                             % type(X))

        if X.dtype != DTYPE:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
        cdef np.ndarray X_ndarray = X
        cdef DTYPE_t* X_ptr = <DTYPE_t*> X_ndarray.data
        cdef SIZE_t X_sample_stride = <SIZE_t> X.strides[0] / <SIZE_t> X.itemsize
        cdef SIZE_t X_fx_stride = <SIZE_t> X.strides[1] / <SIZE_t> X.itemsize
        cdef SIZE_t n_samples = X.shape[0]

        # Initialize output
        cdef np.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp)
        cdef SIZE_t* out_ptr = <SIZE_t*> out.data

        # Initialize auxiliary data-structure
        cdef Node* node = NULL
        cdef SIZE_t i = 0

        with nogil:
            for i in range(n_samples):
                node = self.nodes
                # While node not a leaf
                while node.left_child != _TREE_LEAF:
                    # ... and node.right_child != _TREE_LEAF:
                    if X_ptr[X_sample_stride * i +
                             X_fx_stride * node.feature] <= node.threshold:
                        node = &self.nodes[node.left_child]
                    else:
                        node = &self.nodes[node.right_child]

                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset

        return out

    cdef inline np.ndarray _apply_sparse_csr(self, object X):
        """Finds the terminal region (=leaf node) for each sample in sparse X."""

        if X.dtype != DTYPE:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
        cdef np.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data
        cdef np.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices
        cdef np.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr

        cdef DTYPE_t* X_data = <DTYPE_t*>X_data_ndarray.data
        cdef INT32_t* X_indices = <INT32_t*>X_indices_ndarray.data
        cdef INT32_t* X_indptr = <INT32_t*>X_indptr_ndarray.data

        cdef SIZE_t n_samples = X.shape[0]
        cdef SIZE_t n_features = X.shape[1]

        # Initialize output
        cdef np.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,),
                                                       dtype=np.intp)
        cdef SIZE_t* out_ptr = <SIZE_t*> out.data

        # Initialize auxiliary data-structure
        cdef DTYPE_t feature_value = 0.
        cdef Node* node = NULL
        cdef DTYPE_t* X_sample = NULL
        cdef SIZE_t i = 0
        cdef INT32_t k = 0

        # feature_to_sample as a data structure records the last seen sample
        # for each feature; functionally, it is an efficient way to identify
        # which features are nonzero in the present sample.
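        # For instance, after the k-loop below has processed sample i,
        # feature_to_sample[f] == i holds exactly for the features f that are
        # explicitly stored for row i; any stale entry (< i) denotes an
        # implicit zero, so X_sample never has to be reset between samples.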
        cdef SIZE_t* feature_to_sample = NULL

        # (safe_realloc takes an element count, not a byte count)
        safe_realloc(&X_sample, n_features)
        safe_realloc(&feature_to_sample, n_features)

        with nogil:
            memset(feature_to_sample, -1, n_features * sizeof(SIZE_t))

            for i in range(n_samples):
                node = self.nodes

                for k in range(X_indptr[i], X_indptr[i + 1]):
                    feature_to_sample[X_indices[k]] = i
                    X_sample[X_indices[k]] = X_data[k]

                # While node not a leaf
                while node.left_child != _TREE_LEAF:
                    # ... and node.right_child != _TREE_LEAF:
                    if feature_to_sample[node.feature] == i:
                        feature_value = X_sample[node.feature]

                    else:
                        feature_value = 0.

                    if feature_value <= node.threshold:
                        node = &self.nodes[node.left_child]
                    else:
                        node = &self.nodes[node.right_child]

                out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset

        # Free auxiliary arrays
        free(X_sample)
        free(feature_to_sample)

        return out

    cpdef compute_feature_importances(self, normalize=True):
        """Computes the importance of each feature (aka variable)."""
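        # For every internal node t splitting on feature f, the importance of
        # f accumulates the weighted impurity decrease
        #     w(t) * i(t) - w(left(t)) * i(left(t)) - w(right(t)) * i(right(t))
        # where w is weighted_n_node_samples and i is impurity; the totals are
        # divided by w(root) and, if `normalize`, rescaled to sum to one.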
        cdef Node* left
        cdef Node* right
        cdef Node* nodes = self.nodes
        cdef Node* node = nodes
        cdef Node* end_node = node + self.node_count

        cdef double normalizer = 0.

        cdef np.ndarray[np.float64_t, ndim=1] importances
        importances = np.zeros((self.n_features,))
        cdef DOUBLE_t* importance_data = <DOUBLE_t*>importances.data

        with nogil:
            while node != end_node:
                if node.left_child != _TREE_LEAF:
                    # ... and node.right_child != _TREE_LEAF:
                    left = &nodes[node.left_child]
                    right = &nodes[node.right_child]

                    importance_data[node.feature] += (
                        node.weighted_n_node_samples * node.impurity -
                        left.weighted_n_node_samples * left.impurity -
                        right.weighted_n_node_samples * right.impurity)
                node += 1

        importances /= nodes[0].weighted_n_node_samples

        if normalize:
            normalizer = np.sum(importances)

            if normalizer > 0.0:
                # Avoid dividing by zero (e.g., when root is pure)
                importances /= normalizer

        return importances

    cdef np.ndarray _get_value_ndarray(self):
        """Wraps value as a 3-d NumPy array.

        The array keeps a reference to this Tree, which manages the underlying
        memory.
        """
        cdef np.npy_intp shape[3]
        shape[0] = <np.npy_intp> self.node_count
        shape[1] = <np.npy_intp> self.n_outputs
        shape[2] = <np.npy_intp> self.max_n_classes
        cdef np.ndarray arr
        arr = np.PyArray_SimpleNewFromData(3, shape, np.NPY_DOUBLE, self.value)
        Py_INCREF(self)
        arr.base = <PyObject*> self
        return arr

    cdef np.ndarray _get_node_ndarray(self):
        """Wraps nodes as a NumPy struct array.

        The array keeps a reference to this Tree, which manages the underlying
        memory. Individual fields are publicly accessible as properties of the
        Tree.
        """
        cdef np.npy_intp shape[1]
        shape[0] = <np.npy_intp> self.node_count
        cdef np.npy_intp strides[1]
        strides[0] = sizeof(Node)
        cdef np.ndarray arr
        Py_INCREF(NODE_DTYPE)
        arr = PyArray_NewFromDescr(np.ndarray, <np.dtype> NODE_DTYPE, 1, shape,
                                   strides, <void*> self.nodes,
                                   np.NPY_DEFAULT, None)
        Py_INCREF(self)
        arr.base = <PyObject*> self
        return arr


# =============================================================================
# Utils
# =============================================================================

# safe_realloc(&p, n) resizes the allocation of p to n * sizeof(*p) bytes or
# raises a MemoryError. It never calls free, since that's __dealloc__'s job.
#   cdef DTYPE_t *p = NULL
#   safe_realloc(&p, n)
# is equivalent to p = malloc(n * sizeof(*p)) with error checking.
ctypedef fused realloc_ptr:
    # Add pointer types here as needed.
    (DTYPE_t*)
    (SIZE_t*)
    (unsigned char*)

cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except *:
    # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython
    # 0.20.1 to crash.
    cdef size_t nbytes = nelems * sizeof(p[0][0])
    if nbytes / sizeof(p[0][0]) != nelems:
        # Overflow in the multiplication
        raise MemoryError("could not allocate (%d * %d) bytes"
                          % (nelems, sizeof(p[0][0])))
    cdef realloc_ptr tmp = <realloc_ptr>realloc(p[0], nbytes)
    if tmp == NULL:
        raise MemoryError("could not allocate %d bytes" % nbytes)

    p[0] = tmp
    return tmp  # for convenience


def _realloc_test():
    # Helper for tests. Tries to allocate <size_t>(-1) / 2 * sizeof(size_t)
    # bytes, which will always overflow.
    cdef SIZE_t* p = NULL
    safe_realloc(&p, <size_t>(-1) / 2)
    if p != NULL:
        free(p)
        assert False


# rand_r replacement using a 32bit XorShift generator
# See http://www.jstatsoft.org/v08/i14/paper for details
cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil:
    seed[0] ^= <UINT32_t>(seed[0] << 13)
    seed[0] ^= <UINT32_t>(seed[0] >> 17)
    seed[0] ^= <UINT32_t>(seed[0] << 5)

    return seed[0] % (<UINT32_t>RAND_R_MAX + 1)

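# Note: the shift triple (13, 17, 5) is taken from Marsaglia's xorshift paper
# linked above; the seed must be initialized to a non-zero value, since zero
# is a fixed point of the recurrence.

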
cdef inline np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size):
    """Encapsulate data into a 1D numpy array of intp's."""
    cdef np.npy_intp shape[1]
    shape[0] = <np.npy_intp> size
    return np.PyArray_SimpleNewFromData(1, shape, np.NPY_INTP, data)


cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high,
                            UINT32_t* random_state) nogil:
    """Generate a random integer in [low; high)."""
    return low + our_rand_r(random_state) % (high - low)

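# Note: the modulo mapping in rand_int is slightly biased towards smaller
# values whenever (high - low) does not evenly divide RAND_R_MAX + 1; the
# bias is negligible here and accepted in exchange for speed.

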
cdef inline double rand_uniform(double low, double high,
                                UINT32_t* random_state) nogil:
    """Generate a random double in [low; high)."""
    return ((high - low) * <double> our_rand_r(random_state) /
            <double> RAND_R_MAX) + low


cdef inline double log(double x) nogil:
    return ln(x) / ln(2.0)