# triton/python/external/sklearn/_tree.pxd

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Brian Holt <bdholt1@gmail.com>
#          Joel Nothman <joel.nothman@gmail.com>
#          Arnaud Joly <arnaud.v.joly@gmail.com>
#
# Licence: BSD 3 clause
#
# See _tree.pyx for details.

import numpy as np
cimport numpy as np

ctypedef np.npy_float32 DTYPE_t          # Type of X
ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t              # Type for indices and counters
ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer

# =============================================================================
# Stack data structure
# =============================================================================

# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    SIZE_t start
    SIZE_t end
    SIZE_t depth
    SIZE_t parent
    bint is_left
    double impurity
    SIZE_t n_constant_features

cdef class Stack:
    cdef SIZE_t capacity
    cdef SIZE_t top
    cdef StackRecord* stack_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent,
                  bint is_left, double impurity,
                  SIZE_t n_constant_features) nogil
    cdef int pop(self, StackRecord* res) nogil
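
# A minimal usage sketch (illustrative only; the real loop lives in
# _tree.pyx, and INITIAL_STACK_SIZE / _TREE_UNDEFINED / INFINITY are assumed
# names from that module). A depth-first builder pushes the root sample
# range, then pops one record per node until the stack drains:
#
#     cdef Stack stack = Stack(INITIAL_STACK_SIZE)
#     cdef StackRecord record
#     with nogil:
#         stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0)
#         while not stack.is_empty():
#             stack.pop(&record)
#             # split samples[record.start:record.end], then push the two
#             # child ranges with depth = record.depth + 1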

# =============================================================================
# PriorityHeap data structure
# =============================================================================

# A record on the frontier for best-first tree growing
cdef struct PriorityHeapRecord:
    SIZE_t node_id
    SIZE_t start
    SIZE_t end
    SIZE_t pos
    SIZE_t depth
    bint is_leaf
    double impurity
    double impurity_left
    double impurity_right
    double improvement

cdef class PriorityHeap:
    cdef SIZE_t capacity
    cdef SIZE_t heap_ptr
    cdef PriorityHeapRecord* heap_

    cdef bint is_empty(self) nogil
    cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos,
                  SIZE_t depth, bint is_leaf, double improvement,
                  double impurity, double impurity_left,
                  double impurity_right) nogil
    cdef int pop(self, PriorityHeapRecord* res) nogil
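
# An illustrative sketch, not the actual _tree.pyx code (initial capacity
# constant assumed): a best-first builder keeps the expandable frontier in
# the heap, ordered by `improvement`, so the split with the largest
# impurity improvement is always expanded next:
#
#     cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE)
#     cdef PriorityHeapRecord record
#     with nogil:
#         while not frontier.is_empty():
#             frontier.pop(&record)    # frontier node with best improvement
#             # materialize the split for record.node_id, then evaluate and
#             # push its two children with their own improvements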

# =============================================================================
# Criterion
# =============================================================================

cdef class Criterion:
    # The criterion computes the impurity of a node and the reduction of
    # impurity of a split on that node. It also computes the output statistics
    # such as the mean in regression and class probabilities in classification.

    # Internal structures
    cdef DOUBLE_t* y                     # Values of y
    cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
    cdef DOUBLE_t* sample_weight         # Sample weights

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t start                    # samples[start:pos] are the samples in the left node
    cdef SIZE_t pos                      # samples[pos:end] are the samples in the right node
    cdef SIZE_t end

    cdef SIZE_t n_outputs                # Number of outputs
    cdef SIZE_t n_node_samples           # Number of samples in the node (end-start)
    cdef double weighted_n_samples       # Weighted number of samples (in total)
    cdef double weighted_n_node_samples  # Weighted number of samples in the node
    cdef double weighted_n_left          # Weighted number of samples in the left node
    cdef double weighted_n_right         # Weighted number of samples in the right node

    # The criterion object is maintained such that left and right collected
    # statistics correspond to samples[start:pos] and samples[pos:end].

    # Methods
    cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil
    cdef void reset(self) nogil
    cdef void update(self, SIZE_t new_pos) nogil
    cdef double node_impurity(self) nogil
    cdef void children_impurity(self, double* impurity_left,
                                double* impurity_right) nogil
    cdef void node_value(self, double* dest) nogil
    cdef double impurity_improvement(self, double impurity) nogil
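
# A hedged sketch of the intended call protocol (concrete subclasses such
# as the Gini or MSE criteria live in _tree.pyx; the loop below is
# illustrative). A splitter initializes the criterion once per node, then
# sweeps candidate split positions left to right, so statistics are updated
# incrementally rather than recomputed from scratch:
#
#     criterion.init(y, y_stride, sample_weight, weighted_n_samples,
#                    samples, start, end)
#     criterion.reset()                    # pos <- start: all samples right
#     for new_pos in candidate_positions:  # visited in increasing order
#         criterion.update(new_pos)        # move samples[pos:new_pos] left
#         gain = criterion.impurity_improvement(impurity)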

# =============================================================================
# Splitter
# =============================================================================

cdef struct SplitRecord:
    # Data to track sample split
    SIZE_t feature         # Which feature to split on.
    SIZE_t pos             # Split samples array at the given position,
                           # i.e. count of samples below threshold for feature.
                           # pos is >= end if the node is a leaf.
    double threshold       # Threshold to split at.
    double improvement     # Impurity improvement given parent node.
    double impurity_left   # Impurity of the left split.
    double impurity_right  # Impurity of the right split.

cdef class Splitter:
    # The splitter searches in the input space for a feature and a threshold
    # to split the samples samples[start:end].
    #
    # The impurity computations are delegated to a criterion object.

    # Internal structures
    cdef public Criterion criterion      # Impurity criterion
    cdef public SIZE_t max_features      # Number of features to test
    cdef public SIZE_t min_samples_leaf  # Min samples in a leaf
    cdef public double min_weight_leaf   # Minimum weight in a leaf

    cdef object random_state             # Random state
    cdef UINT32_t rand_r_state           # sklearn_rand_r random number state

    cdef SIZE_t* samples                 # Sample indices in X, y
    cdef SIZE_t n_samples                # X.shape[0]
    cdef double weighted_n_samples       # Weighted number of samples
    cdef SIZE_t* features                # Feature indices in X
    cdef SIZE_t* constant_features       # Constant features indices
    cdef SIZE_t n_features               # X.shape[1]
    cdef DTYPE_t* feature_values         # temp. array holding feature values

    cdef SIZE_t start                    # Start position for the current node
    cdef SIZE_t end                      # End position for the current node

    cdef DOUBLE_t* y
    cdef SIZE_t y_stride
    cdef DOUBLE_t* sample_weight

    # The samples vector `samples` is maintained by the Splitter object such
    # that the samples contained in a node are contiguous. With this setting,
    # `node_split` reorganizes the node samples `samples[start:end]` in two
    # subsets `samples[start:pos]` and `samples[pos:end]`.
    #
    # The 1-d `features` array of size n_features contains the feature
    # indices and allows fast sampling without replacement of features.
    #
    # The 1-d `constant_features` array of size n_features holds in
    # `constant_features[:n_constant_features]` the feature ids with
    # constant values for all the samples that reached a specific node.
    # The value `n_constant_features` is given by the parent node to its
    # child nodes. The content of the range `[n_constant_features:]` is left
    # undefined, but preallocated for performance reasons.
    # This allows optimization with depth-based tree building.

    # Methods
    cdef void init(self, object X, np.ndarray y,
                   DOUBLE_t* sample_weight) except *
    cdef void node_reset(self, SIZE_t start, SIZE_t end,
                         double* weighted_n_node_samples) nogil
    cdef void node_split(self,
                         double impurity,   # Impurity of the node
                         SplitRecord* split,
                         SIZE_t* n_constant_features) nogil
    cdef void node_value(self, double* dest) nogil
    cdef double node_impurity(self) nogil
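
# A hedged sketch of how a tree builder drives the splitter (illustrative;
# the concrete best/random and dense/sparse splitters are in _tree.pyx):
#
#     splitter.init(X, y, sample_weight_ptr)   # once per tree
#     splitter.node_reset(start, end, &weighted_n_node_samples)
#     splitter.node_split(impurity, &split, &n_constant_features)
#     # afterwards samples[start:split.pos] and samples[split.pos:end] hold
#     # the left and right children; split.pos >= end signals a leaf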

# =============================================================================
# Tree
# =============================================================================

cdef struct Node:
    # Base storage structure for the nodes in a Tree object

    SIZE_t left_child                    # id of the left child of the node
    SIZE_t right_child                   # id of the right child of the node
    SIZE_t feature                       # Feature used for splitting the node
    DOUBLE_t threshold                   # Threshold value at the node
    DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
    SIZE_t n_node_samples                # Number of samples at the node
    DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node

cdef class Tree:
    # The Tree object is a binary tree structure constructed by the
    # TreeBuilder. The tree structure is used for predictions and
    # feature importances.

    # Input/Output layout
    cdef public SIZE_t n_features        # Number of features in X
    cdef SIZE_t* n_classes               # Number of classes in y[:, k]
    cdef public SIZE_t n_outputs         # Number of outputs in y
    cdef public SIZE_t max_n_classes     # max(n_classes)

    # Inner structures: values are stored separately from node structure,
    # since size is determined at runtime.
    cdef public SIZE_t max_depth         # Max depth of the tree
    cdef public SIZE_t node_count        # Counter for node IDs
    cdef public SIZE_t capacity          # Capacity of tree, in terms of nodes
    cdef Node* nodes                     # Array of nodes
    cdef double* value                   # (capacity, n_outputs, max_n_classes) array of values
    cdef SIZE_t value_stride             # = n_outputs * max_n_classes

    # Methods
    cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                          SIZE_t feature, double threshold, double impurity,
                          SIZE_t n_node_samples,
                          double weighted_n_samples) nogil
    cdef void _resize(self, SIZE_t capacity) except *
    cdef int _resize_c(self, SIZE_t capacity=*) nogil

    cdef np.ndarray _get_value_ndarray(self)
    cdef np.ndarray _get_node_ndarray(self)

    cpdef np.ndarray predict(self, object X)
    cpdef np.ndarray apply(self, object X)
    cdef np.ndarray _apply_dense(self, object X)
    cdef np.ndarray _apply_sparse_csr(self, object X)

    cpdef compute_feature_importances(self, normalize=*)
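
# A minimal Python-level sketch: a fitted Tree is normally reached through
# an estimator's `tree_` attribute, and its prediction methods expect
# DTYPE_t (float32) input; the example below is illustrative:
#
#     clf = DecisionTreeClassifier().fit(X, y)
#     values = clf.tree_.predict(X_test32)  # leaf value for each sample
#     leaves = clf.tree_.apply(X_test32)    # leaf node id for each sample
#     imp = clf.tree_.compute_feature_importances()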

# =============================================================================
# Tree builder
# =============================================================================

cdef class TreeBuilder:
    # The TreeBuilder recursively builds a Tree object from training samples,
    # using a Splitter object for splitting internal nodes and assigning
    # values to leaves.
    #
    # This class controls the various stopping criteria and the node splitting
    # evaluation order, e.g. depth-first or best-first.

    cdef Splitter splitter              # Splitting algorithm

    cdef SIZE_t min_samples_split       # Minimum number of samples in an internal node
    cdef SIZE_t min_samples_leaf        # Minimum number of samples in a leaf
    cdef double min_weight_leaf         # Minimum weight in a leaf
    cdef SIZE_t max_depth               # Maximal tree depth

    cpdef build(self, Tree tree, object X, np.ndarray y,
                np.ndarray sample_weight=*)
    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
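
# A hedged end-to-end sketch (the concrete builders, e.g. a depth-first
# builder, are defined in _tree.pyx; the constructor arguments below mirror
# the attributes above, but the exact signature is an assumption):
#
#     builder = DepthFirstTreeBuilder(splitter, min_samples_split,
#                                     min_samples_leaf, min_weight_leaf,
#                                     max_depth)
#     builder.build(tree, X, y)           # sample_weight defaults to None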