# Authors: Gilles Louppe
#          Peter Prettenhofer
#          Brian Holt
#          Joel Nothman
#          Arnaud Joly
#
# Licence: BSD 3 clause
# See _tree.pyx for details.

import numpy as np
cimport numpy as np

ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer


# =============================================================================
# Stack data structure
# =============================================================================

# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    SIZE_t start
    SIZE_t end
    SIZE_t depth
    SIZE_t parent
    bint is_left
    double impurity
    SIZE_t n_constant_features

cdef class Stack:
    cdef SIZE_t capacity
    cdef SIZE_t top
    cdef StackRecord* stack_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil


# =============================================================================
# PriorityHeap data structure
# =============================================================================

# A record on the frontier for best-first tree growing
cdef struct PriorityHeapRecord:
    SIZE_t node_id
    SIZE_t start
    SIZE_t end
    SIZE_t pos
    SIZE_t depth
    bint is_leaf
    double impurity
    double impurity_left
    double impurity_right
    double improvement

cdef class PriorityHeap:
    cdef SIZE_t capacity
    cdef SIZE_t heap_ptr
    cdef PriorityHeapRecord* heap_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil


# =============================================================================
# Criterion
# =============================================================================

cdef class Criterion:
    # The criterion computes the impurity of a node and the reduction of
    # impurity of a split on that node. It also computes the output statistics
    # such as the mean in regression and class probabilities in classification.

    # Internal structures
    cdef DOUBLE_t* y                     # Values of y
    cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
    cdef DOUBLE_t* sample_weight         # Sample weights

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
    cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
    cdef SIZE_t end

    cdef SIZE_t n_outputs                # Number of outputs
    cdef SIZE_t n_node_samples           # Number of samples in the node (end-start)
    cdef double weighted_n_samples       # Weighted number of samples (in total)
    cdef double weighted_n_node_samples  # Weighted number of samples in the node
    cdef double weighted_n_left          # Weighted number of samples in the left node
    cdef double weighted_n_right         # Weighted number of samples in the right node

    # The criterion object is maintained such that left and right collected
    # statistics correspond to samples[start:pos] and samples[pos:end].
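
    # A sketch of the intended call sequence (illustrative only; the splitter
    # is the expected caller, and `new_pos` / `impurity` stand for values it
    # computes):
    #
    #   criterion.init(y, y_stride, sample_weight, weighted_n_samples,
    #                  samples, start, end)   # collect node statistics
    #   criterion.reset()                     # pos <- start: every sample
    #                                         # starts in the right child
    #   criterion.update(new_pos)             # move samples[pos:new_pos]
    #                                         # from the right to the left
    #   gain = criterion.impurity_improvement(impurity)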

    # Methods
    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil
    cdef void reset(self) nogil
    cdef void update(self, SIZE_t new_pos) nogil
    cdef double node_impurity(self) nogil
    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil
    cdef void node_value(self, double* dest) nogil
    cdef double impurity_improvement(self, double impurity) nogil


# =============================================================================
# Splitter
# =============================================================================

cdef struct SplitRecord:
    # Data to track sample split
    SIZE_t feature         # Which feature to split on.
    SIZE_t pos             # Split samples array at the given position,
                           # i.e. count of samples below threshold for feature.
                           # pos is >= end if the node is a leaf.
    double threshold       # Threshold to split at.
    double improvement     # Impurity improvement given parent node.
    double impurity_left   # Impurity of the left split.
    double impurity_right  # Impurity of the right split.

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.

    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf

    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t* features                # Feature indices in X
    cdef SIZE_t* constant_features       # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t* feature_values         # temp. array holding feature values

    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node

    cdef DOUBLE_t* y
    cdef SIZE_t y_stride
    cdef DOUBLE_t* sample_weight

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.

    # The 1-d `features` array of size n_features contains the feature
    # indices and allows fast sampling without replacement of features.

    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons.
    # This allows optimization with depth-based tree building.
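
    # A sketch of the expected lifecycle (illustrative only; the tree builder
    # drives these calls):
    #
    #   splitter.init(X, y, sample_weight)                 # once per tree
    #   splitter.node_reset(start, end,
    #                       &weighted_n_node_samples)      # once per node
    #   splitter.node_split(impurity, &split, &n_constant_features)
    #   # on return, samples[start:split.pos] and samples[split.pos:end]
    #   # hold the samples of the left and right child, respectively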

    # Methods
    cdef void init(self, object X, np.ndarray y,
                   DOUBLE_t* sample_weight) except *
    cdef void node_reset(self, SIZE_t start, SIZE_t end,
                         double* weighted_n_node_samples) nogil
    cdef void node_split(self,
                         double impurity,   # Impurity of the node
                         SplitRecord* split,
                         SIZE_t* n_constant_features) nogil
    cdef void node_value(self, double* dest) nogil
    cdef double node_impurity(self) nogil


# =============================================================================
# Tree
# =============================================================================

cdef struct Node:
    # Base storage structure for the nodes in a Tree object

    SIZE_t left_child                    # id of the left child of the node
    SIZE_t right_child                   # id of the right child of the node
    SIZE_t feature                       # Feature used for splitting the node
    DOUBLE_t threshold                   # Threshold value at the node
    DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
    SIZE_t n_node_samples                # Number of samples at the node
    DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node

cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.

    # Input/Output layout
    cdef public SIZE_t n_features        # Number of features in X
    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
    cdef public SIZE_t n_outputs         # Number of outputs in y
    cdef public SIZE_t max_n_classes     # max(n_classes)

    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t node_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
    cdef SIZE_t value_stride             # = n_outputs * max_n_classes

    # Methods
    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, double threshold, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_samples) nogil
    cdef void _resize(self, SIZE_t capacity) except *
    cdef int _resize_c(self, SIZE_t capacity=*) nogil

    cdef np.ndarray _get_value_ndarray(self)
    cdef np.ndarray _get_node_ndarray(self)

    cpdef np.ndarray predict(self, object X)
    cpdef np.ndarray apply(self, object X)
    cdef np.ndarray _apply_dense(self, object X)
    cdef np.ndarray _apply_sparse_csr(self, object X)

    cpdef compute_feature_importances(self, normalize=*)


# =============================================================================
# Tree builder
# =============================================================================

cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.

    cdef Splitter splitter          # Splitting algorithm

    cdef SIZE_t min_samples_split   # Minimum number of samples in an internal node
    cdef SIZE_t min_samples_leaf    # Minimum number of samples in a leaf
    cdef double min_weight_leaf     # Minimum weight in a leaf
    cdef SIZE_t max_depth           # Maximal tree depth

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=*)
    cdef _check_input(self, object X, np.ndarray y,
                      np.ndarray sample_weight)
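

# How the pieces fit together: an illustrative sketch, not part of this
# header. The concrete Criterion, Splitter and TreeBuilder subclasses live in
# _tree.pyx, and the constructor signatures below are assumptions based on
# that module.
#
#   splitter = BestSplitter(criterion, max_features, min_samples_leaf,
#                           min_weight_leaf, random_state)
#   builder = DepthFirstTreeBuilder(splitter, min_samples_split,
#                                   min_samples_leaf, min_weight_leaf,
#                                   max_depth)
#   tree = Tree(n_features, n_classes, n_outputs)
#   builder.build(tree, X, y)       # grows the tree in place
#   proba = tree.predict(X)         # leaf values for each sample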