Tune: Further file hierarchy improvements
python/isaac/external/__init__.py (new vendored file, 0 lines)
python/isaac/external/sklearn/__init__.py (new vendored file, 12 lines)
@@ -0,0 +1,12 @@
"""
The :mod:`sklearn.tree` module includes decision tree-based models for
classification and regression.
"""

from .tree import DecisionTreeClassifier
from .tree import DecisionTreeRegressor
from .tree import ExtraTreeClassifier
from .tree import ExtraTreeRegressor

__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]
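Once the compiled `_tree` extension is in place, the re-exports above make the vendored estimators importable at the package level. A minimal sketch (the `isaac.external.sklearn` import path is inferred from the file paths in this commit, not stated elsewhere):

    # Hypothetical usage of the vendored package-level re-exports.
    from isaac.external.sklearn import DecisionTreeRegressor

    reg = DecisionTreeRegressor(max_depth=3, random_state=0)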
python/isaac/external/sklearn/forest.py (new vendored file, 1664 lines)

Diff suppressed because the file is too large.
python/isaac/external/sklearn/setup.py (new vendored file, 22 lines)
@@ -0,0 +1,22 @@
import os

import numpy
from numpy.distutils.misc_util import Configuration


def configuration(parent_package="", top_path=None):
    config = Configuration("tree", parent_package, top_path)
    libraries = []
    if os.name == 'posix':
        libraries.append('m')
    config.add_extension("_tree",
                         sources=["_tree.c"],
                         include_dirs=[numpy.get_include()],
                         libraries=libraries,
                         extra_compile_args=["-O3"])
    return config


if __name__ == "__main__":
    from numpy.distutils.core import setup
    setup(**configuration().todict())
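For completeness, this file is consumed through numpy.distutils rather than plain distutils; building the extension is the usual `python setup.py build_ext --inplace` run from this directory. A minimal sketch of inspecting the extension declaration without compiling it (assuming `_tree.c` has already been generated next to this file):

    # Sketch only: mirrors configuration() above to show what gets registered.
    import numpy
    from numpy.distutils.misc_util import Configuration

    config = Configuration("tree", "", None)
    config.add_extension("_tree", sources=["_tree.c"],
                         include_dirs=[numpy.get_include()])
    print(sorted(config.todict()))  # 'ext_modules' now lists the _tree extension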
python/isaac/external/sklearn/tree.py (new vendored file, 835 lines)
@@ -0,0 +1,835 @@
"""
This module gathers tree-based methods, including decision, regression and
randomized trees. Single and multi-output problems are both handled.
"""

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Brian Holt <bdholt1@gmail.com>
#          Noel Dawe <noel@dawe.me>
#          Satrajit Gosh <satrajit.ghosh@gmail.com>
#          Joly Arnaud <arnaud.v.joly@gmail.com>
#          Fares Hedayati <fares.hedayati@gmail.com>
#
# Licence: BSD 3 clause

# MODIFICATIONS:
#  - Removed base classes
#  - Incorporated required functions from six, utils

from __future__ import division

import numbers
from abc import abstractmethod

import numpy as np

from .utils import (NotFittedError, check_is_fitted, compute_sample_weight,
                    check_array, check_random_state, ClassifierMixin,
                    RegressorMixin, string_types)
from .utils import BaseEstimator

from ._tree import Criterion
from ._tree import Splitter
from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
from ._tree import Tree
from . import _tree

__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
           "ExtraTreeClassifier", "ExtraTreeRegressor"]


# =============================================================================
# Types and constants
# =============================================================================

DTYPE = _tree.DTYPE
DOUBLE = _tree.DOUBLE

CRITERIA_CLF = {"gini": _tree.Gini, "entropy": _tree.Entropy}
CRITERIA_REG = {"mse": _tree.MSE, "friedman_mse": _tree.FriedmanMSE}

DENSE_SPLITTERS = {"best": _tree.BestSplitter,
                   "presort-best": _tree.PresortBestSplitter,
                   "random": _tree.RandomSplitter}

SPARSE_SPLITTERS = {"best": _tree.BestSparseSplitter,
                    "random": _tree.RandomSparseSplitter}

# =============================================================================
# Base decision tree
# =============================================================================

class BaseDecisionTree(BaseEstimator):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self,
                 criterion,
                 splitter,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 min_weight_fraction_leaf,
                 max_features,
                 max_leaf_nodes,
                 random_state,
                 class_weight=None):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.class_weight = class_weight

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None

        self.tree_ = None
        self.max_features_ = None

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression). In the regression case, use ``dtype=np.float64`` and
            ``order='C'`` for maximum efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allows bypassing several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X, dtype=DTYPE)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # unlike y[:, np.newaxis]
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            for k in range(self.n_outputs_):
                classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                     else self.max_depth)
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

        if isinstance(self.max_features, string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')

        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1, int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero.")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {0} must be either smaller than "
                              "0 or larger than 1").format(max_leaf_nodes))

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(
                    sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))
        else:
            min_weight_leaf = 0.

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        SPLITTERS = DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                self.max_features_,
                                                self.min_samples_leaf,
                                                min_weight_leaf,
                                                random_state)

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                            self.min_samples_leaf,
                                            min_weight_leaf,
                                            max_depth)
        else:
            builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                           self.min_samples_leaf,
                                           min_weight_leaf,
                                           max_depth,
                                           max_leaf_nodes)

        builder.build(self.tree_, X, y, sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X, check_input=True):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X is
        returned. For a regression model, the predicted value based on X is
        returned.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : boolean, (default=True)
            Allows bypassing several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes, or the predicted values.
        """
        if check_input:
            X = check_array(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise NotFittedError("Tree not initialized. Perform a fit first.")

        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s"
                             % (self.n_features_, n_features))

        proba = self.tree_.predict(X)

        # Classification
        if isinstance(self, ClassifierMixin):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)

            else:
                predictions = np.zeros((n_samples, self.n_outputs_))

                for k in range(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1),
                        axis=0)

                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]

            else:
                return proba[:, :, 0]

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.tree_ is None:
            raise NotFittedError("Estimator not fitted, call `fit` before"
                                 " `feature_importances_`.")

        return self.tree_.compute_feature_importances()


# =============================================================================
# Public estimators
# =============================================================================

class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
    """A decision tree classifier.

    Parameters
    ----------
    criterion : string, optional (default="gini")
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "entropy" for the information gain.

    splitter : string, optional (default="best")
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_features : int, float, string or None, optional (default=None)
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=sqrt(n_features)`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_depth : int or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.
        Ignored if ``max_leaf_nodes`` is not None.

    min_samples_split : int, optional (default=2)
        The minimum number of samples required to split an internal node.

    min_samples_leaf : int, optional (default=1)
        The minimum number of samples required to be at a leaf node.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the input samples required to be at a
        leaf node.

    max_leaf_nodes : int or None, optional (default=None)
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.
        If not None then ``max_depth`` will be ignored.

    class_weight : dict, list of dicts, "auto" or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        The "auto" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data.

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    tree_ : Tree object
        The underlying Tree object.

    max_features_ : int,
        The inferred value of max_features.

    classes_ : array of shape = [n_classes] or a list of such arrays
        The classes labels (single output problem),
        or a list of arrays of class labels (multi-output problem).

    n_classes_ : int or list
        The number of classes (for single output problems),
        or a list containing the number of classes for each
        output (for multi-output problems).

    feature_importances_ : array of shape = [n_features]
        The feature importances. The higher, the more important the
        feature. The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance [4]_.

    See also
    --------
    DecisionTreeRegressor

    References
    ----------

    .. [1] http://en.wikipedia.org/wiki/Decision_tree_learning

    .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.

    .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
           Learning", Springer, 2009.

    .. [4] L. Breiman, and A. Cutler, "Random Forests",
           http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.cross_validation import cross_val_score
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> clf = DecisionTreeClassifier(random_state=0)
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    ...                             # doctest: +SKIP
    ...
    array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
            0.93...,  0.93...,  1.     ,  0.93...,  1.     ])
    """
    def __init__(self,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 class_weight=None):
        super(DecisionTreeClassifier, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state)

    def predict_proba(self, X, check_input=True):
        """Predict class probabilities of the input samples X.

        The predicted class probability is the fraction of samples of the same
        class in a leaf.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : boolean, (default=True)
            Allows bypassing several input checks.
            Don't use this parameter unless you know what you are doing.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, 'n_outputs_')
        if check_input:
            X = check_array(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise NotFittedError("Tree not initialized. Perform a fit first.")

        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is %s and "
                             "input n_features is %s"
                             % (self.n_features_, n_features))

        proba = self.tree_.predict(X)

        if self.n_outputs_ == 1:
            proba = proba[:, :self.n_classes_]
            normalizer = proba.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba /= normalizer

            return proba

        else:
            all_proba = []

            for k in range(self.n_outputs_):
                proba_k = proba[:, k, :self.n_classes_[k]]
                normalizer = proba_k.sum(axis=1)[:, np.newaxis]
                normalizer[normalizer == 0.0] = 1.0
                proba_k /= normalizer
                all_proba.append(proba_k)

            return all_proba

    def predict_log_proba(self, X):
        """Predict class log-probabilities of the input samples X.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return np.log(proba)

        else:
            for k in range(self.n_outputs_):
                proba[k] = np.log(proba[k])

            return proba


class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
    """A decision tree regressor.

    Parameters
    ----------
    criterion : string, optional (default="mse")
        The function to measure the quality of a split. The only supported
        criterion is "mse" for the mean squared error.

    splitter : string, optional (default="best")
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_features : int, float, string or None, optional (default=None)
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a percentage and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_depth : int or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.
        Ignored if ``max_leaf_nodes`` is not None.

    min_samples_split : int, optional (default=2)
        The minimum number of samples required to split an internal node.

    min_samples_leaf : int, optional (default=1)
        The minimum number of samples required to be at a leaf node.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the input samples required to be at a
        leaf node.

    max_leaf_nodes : int or None, optional (default=None)
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.
        If not None then ``max_depth`` will be ignored.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    tree_ : Tree object
        The underlying Tree object.

    max_features_ : int,
        The inferred value of max_features.

    feature_importances_ : array of shape = [n_features]
        The feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the
        (normalized) total reduction of the criterion brought
        by that feature. It is also known as the Gini importance [4]_.

    See also
    --------
    DecisionTreeClassifier

    References
    ----------

    .. [1] http://en.wikipedia.org/wiki/Decision_tree_learning

    .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.

    .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
           Learning", Springer, 2009.

    .. [4] L. Breiman, and A. Cutler, "Random Forests",
           http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

    Examples
    --------
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.cross_validation import cross_val_score
    >>> from sklearn.tree import DecisionTreeRegressor
    >>> boston = load_boston()
    >>> regressor = DecisionTreeRegressor(random_state=0)
    >>> cross_val_score(regressor, boston.data, boston.target, cv=10)
    ...                    # doctest: +SKIP
    ...
    array([ 0.61...,  0.57..., -0.34...,  0.41...,  0.75...,
            0.07...,  0.29...,  0.33..., -1.42..., -1.77...])
    """
    def __init__(self,
                 criterion="mse",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None):
        super(DecisionTreeRegressor, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state)


class ExtraTreeClassifier(DecisionTreeClassifier):
    """An extremely randomized tree classifier.

    Extra-trees differ from classic decision trees in the way they are built.
    When looking for the best split to separate the samples of a node into two
    groups, random splits are drawn for each of the `max_features` randomly
    selected features and the best split among those is chosen. When
    `max_features` is set to 1, this amounts to building a totally random
    decision tree.

    Warning: Extra-trees should only be used within ensemble methods.

    See also
    --------
    ExtraTreeRegressor, ExtraTreesClassifier, ExtraTreesRegressor

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.
    """
    def __init__(self,
                 criterion="gini",
                 splitter="random",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 random_state=None,
                 max_leaf_nodes=None,
                 class_weight=None):
        super(ExtraTreeClassifier, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state)


class ExtraTreeRegressor(DecisionTreeRegressor):
    """An extremely randomized tree regressor.

    Extra-trees differ from classic decision trees in the way they are built.
    When looking for the best split to separate the samples of a node into two
    groups, random splits are drawn for each of the `max_features` randomly
    selected features and the best split among those is chosen. When
    `max_features` is set to 1, this amounts to building a totally random
    decision tree.

    Warning: Extra-trees should only be used within ensemble methods.

    See also
    --------
    ExtraTreeClassifier, ExtraTreesClassifier, ExtraTreesRegressor

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.
    """
    def __init__(self,
                 criterion="mse",
                 splitter="random",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 random_state=None,
                 max_leaf_nodes=None):
        super(ExtraTreeRegressor, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state)
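As a quick end-to-end check of the vendored module, a minimal sketch (it assumes the compiled `_tree` extension is importable and uses the `isaac.external.sklearn` path implied by this commit):

    import numpy as np
    from isaac.external.sklearn.tree import DecisionTreeClassifier

    # A tiny XOR-style dataset; an unpruned tree fits it exactly.
    X = np.array([[0., 0.], [1., 1.], [0., 1.], [1., 0.]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])

    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(clf.predict(X))        # recovers the training labels
    print(clf.predict_proba(X))  # per-class leaf frequencies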
python/isaac/external/sklearn/utils.py (new vendored file, 911 lines)
@@ -0,0 +1,911 @@
import sys
import inspect
import warnings
import numbers

import numpy as np

################################ six ########################################
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

if PY3:
    string_types = str
    _iteritems = "items"

else:
    string_types = basestring
    _iteritems = "iteritems"


def iteritems(d, **kw):
    """Return an iterator over the (key, value) pairs of a dictionary."""
    return iter(getattr(d, _iteritems)(**kw))

################################ utils ########################################

# Parse the numpy version string into a comparable tuple of ints.
np_version = tuple(int(x) for x in np.__version__.split('.')[:3]
                   if x.isdigit())

if np_version < (1, 6, 2):
    # Allow bincount to accept empty arrays
    # https://github.com/numpy/numpy/commit/40f0844846a9d7665616b142407a3d74cb65a040
    def bincount(x, weights=None, minlength=None):
        if len(x) > 0:
            return np.bincount(x, weights, minlength)
        else:
            if minlength is None:
                minlength = 0
            minlength = np.asscalar(np.asarray(minlength, dtype=np.intp))
            return np.zeros(minlength, dtype=np.intp)

else:
    from numpy import bincount

class DataConversionWarning(UserWarning):
    """A warning on implicit data conversions happening in the code"""
    pass


class NotFittedError(ValueError, AttributeError):
    """Exception class to raise if estimator is used before fitting

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """


def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    "all_or_any" of the passed attributes and raises a NotFittedError with the
    given message.

    Parameters
    ----------
    estimator : estimator instance.
        estimator instance for which the check is performed.

    attributes : attribute name(s) given as string or a list/tuple of strings
        Eg. : ["coef_", "estimator_", ...], "coef_"

    msg : string
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this method."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default all
        Specify whether all or any of the given attributes must exist.
    """
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this method.")

    if not hasattr(estimator, 'fit'):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]

    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
        raise NotFittedError(msg % {'name': type(estimator).__name__})

def compute_sample_weight(class_weight, y, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "auto", or None, optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        The "auto" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data.

        For multi-output, the weights of each column of y will be multiplied.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array of original class labels per sample.

    indices : array-like, shape (n_subsample,), or None
        Array of indices to be used in a subsample. Can be of length less than
        n_samples in the case of a subsample, or equal to n_samples in the
        case of a bootstrap subsample with repeated indices. If None, the
        sample weight will be calculated over the full sample. Only "auto" is
        supported for class_weight if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray, shape (n_samples,)
        Array with sample weights as applied to the original y
    """

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if isinstance(class_weight, string_types):
        if class_weight != 'auto':
            raise ValueError('The only valid preset for class_weight is '
                             '"auto". Given "%s".' % class_weight)
    elif (indices is not None and
            not isinstance(class_weight, string_types)):
        raise ValueError('The only valid class_weight for subsampling is '
                         '"auto". Given "%s".' % class_weight)
    elif n_outputs > 1:
        if (not hasattr(class_weight, "__iter__") or
                isinstance(class_weight, dict)):
            raise ValueError("For multi-output, class_weight should be a "
                             "list of dicts, or a valid string.")
        if len(class_weight) != n_outputs:
            raise ValueError("For multi-output, number of elements in "
                             "class_weight should match number of outputs.")

    expanded_class_weight = []
    for k in range(n_outputs):

        y_full = y[:, k]
        classes_full = np.unique(y_full)
        classes_missing = None

        if class_weight == 'auto' or n_outputs == 1:
            class_weight_k = class_weight
        else:
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y[indices, k]
            classes_subsample = np.unique(y_subsample)

            weight_k = np.choose(np.searchsorted(classes_subsample,
                                                 classes_full),
                                 compute_class_weight(class_weight_k,
                                                      classes_subsample,
                                                      y_subsample),
                                 mode='clip')

            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(class_weight_k,
                                            classes_full,
                                            y_full)

        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero
            weight_k[np.in1d(y_full, list(classes_missing))] = 0.

        expanded_class_weight.append(weight_k)

    expanded_class_weight = np.prod(expanded_class_weight,
                                    axis=0,
                                    dtype=np.float64)

    return expanded_class_weight

def _assert_all_finite(X):
    """Like assert_all_finite, but only for ndarray."""
    X = np.asanyarray(X)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method.
    if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
            and not np.isfinite(X).all()):
        raise ValueError("Input contains NaN, infinity"
                         " or a value too large for %r." % X.dtype)

def check_array(array, accept_sparse=None, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is converted to an at least 2d numpy array.
    If the dtype of the array is object, attempt converting to float,
    raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, list of string or None (default=None)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. None means that sparse matrix input will raise an error.
        If the input is sparse but not in the allowed format, it will be
        converted to the first listed format.

    dtype : string, type or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    ensure_2d : boolean (default=True)
        Whether to make X at least 2d.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    """
    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # store whether originally we wanted numeric dtype
    dtype_numeric = dtype == "numeric"

    if ensure_2d:
        array = np.atleast_2d(array)
    if dtype_numeric:
        if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None
    array = np.array(array, dtype=dtype, order=order, copy=copy)
    # make sure we actually converted to numeric:
    if dtype_numeric and array.dtype.kind == "O":
        array = array.astype(np.float64)
    if not allow_nd and array.ndim >= 3:
        raise ValueError("Found array with dim %d. Expected <= 2" %
                         array.ndim)
    if force_all_finite:
        _assert_all_finite(array)

    shape_repr = _shape_repr(array.shape)
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required."
                             % (n_samples, shape_repr, ensure_min_samples))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required."
                             % (n_features, shape_repr, ensure_min_features))
    return array

def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance

    If seed is None, return the RandomState singleton used by np.random.
    If seed is an int, return a new RandomState instance seeded with seed.
    If seed is already a RandomState instance, return it.
    Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)

def _shape_repr(shape):
    """Return a platform independent representation of an array shape

    Under Python 2, the `long` type introduces an 'L' suffix when using the
    default %r format for tuples of integers (typically used to store the shape
    of an array).

    Under Windows 64 bit (and Python 2), the `long` type is used by default
    in numpy shapes even when the integer dimensions are well below 32 bit.
    The platform specific type causes string messages or doctests to change
    from one platform to another which is not desirable.

    Under Python 3, there is no more `long` type so the `L` suffix is never
    introduced in string representation.

    >>> _shape_repr((1, 2))
    '(1, 2)'
    >>> one = 2 ** 64 / 2 ** 64  # force an upcast to `long` under Python 2
    >>> _shape_repr((one, 2 * one))
    '(1, 2)'
    >>> _shape_repr((1,))
    '(1,)'
    >>> _shape_repr(())
    '()'
    """
    if len(shape) == 0:
        return "()"
    joined = ", ".join("%d" % e for e in shape)
    if len(shape) == 1:
        # special notation for singleton tuples
        joined += ','
    return "(%s)" % joined

def _num_samples(x):
    """Return number of samples in array-like x."""
    if hasattr(x, 'fit'):
        # Don't get num_samples from an ensemble's length!
        raise TypeError('Expected sequence or array-like, got '
                        'estimator %s' % x)
    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError("Expected sequence or array-like, got %s" %
                            type(x))
    if hasattr(x, 'shape'):
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        return x.shape[0]
    else:
        return len(x)

################################ metrics ########################################
def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
        return np.average(sample_score, weights=sample_weight)
    elif sample_weight is not None:
        return np.dot(sample_score, sample_weight)
    else:
        return sample_score.sum()


def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    score : float
        If ``normalize == True``, return the fraction of correctly classified
        samples (float), else return the number of correctly classified
        samples (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    See also
    --------
    jaccard_similarity_score, hamming_loss, zero_one_loss

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_similarity_score`` function.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import accuracy_score
    >>> y_pred = [0, 2, 1, 3]
    >>> y_true = [0, 1, 2, 3]
    >>> accuracy_score(y_true, y_pred)
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False)
    2

    In the multilabel case with binary label indicators:

    >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
    0.5
    """

    # Compute accuracy for each possible representation
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    if y_type.startswith('multilabel'):
        differing_labels = count_nonzero(y_true - y_pred, axis=1)
        score = differing_labels == 0
    else:
        score = y_true == y_pred

    return _weighted_sum(score, sample_weight, normalize)

def r2_score(y_true, y_pred, sample_weight=None):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0, lower values are worse.

    Parameters
    ----------
    y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
        Estimated target values.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    z : float
        The R^2 score.

    Notes
    -----
    This is not a symmetric function.

    Unlike most other scores, R^2 score may be negative (it need not actually
    be the square of a quantity R).

    References
    ----------
    .. [1] `Wikipedia entry on the Coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

    Examples
    --------
    >>> from sklearn.metrics import r2_score
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.948...
    >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
    >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
    >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
    0.938...

    """
    y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
        weight = sample_weight[:, np.newaxis]
    else:
        weight = 1.

    numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)

    if denominator == 0.0:
        if numerator == 0.0:
            return 1.0
        else:
            # arbitrarily set to zero to avoid -inf scores; having a constant
            # y_true is not interesting for scoring a regression anyway
            return 0.0

    return 1 - numerator / denominator


################################ base #########################################
###############################################################################

class BaseEstimator(object):
    """Base class for all estimators in scikit-learn

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator"""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        args, varargs, kw, default = inspect.getargspec(init)
        if varargs is not None:
            raise RuntimeError("scikit-learn estimators should always "
                               "specify their parameters in the signature"
                               " of their __init__ (no varargs)."
                               " %s doesn't follow this convention."
                               % (cls, ))
        # Remove 'self'
        # XXX: This is going to fail if the init is a staticmethod, but
        # who would do this?
        args.pop(0)
        args.sort()
        return args

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            # We need deprecation warnings to always be on in order to
            # catch deprecated param values.
            # This is set in utils/__init__.py but it gets overwritten
            # when running under python3 somehow.
            warnings.simplefilter("always", DeprecationWarning)
            try:
                with warnings.catch_warnings(record=True) as w:
                    value = getattr(self, key, None)
                if len(w) and w[0].category == DeprecationWarning:
                    # if the parameter is deprecated, don't show it
                    continue
            finally:
                warnings.filters.pop(0)

            # XXX: should we rather test if instance of estimator?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The former have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        Returns
        -------
        self
        """
        if not params:
            # Simple optimisation to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)
        for key, value in iteritems(params):
            split = key.split('__', 1)
            if len(split) > 1:
                # nested objects case
                name, sub_name = split
                if name not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s' %
                                     (name, self))
                sub_object = valid_params[name]
                sub_object.set_params(**{sub_name: value})
            else:
                # simple objects case
                if key not in valid_params:
                    raise ValueError('Invalid parameter %s for estimator %s'
                                     % (key, self.__class__.__name__))
                setattr(self, key, value)
        return self

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
                                               offset=len(class_name),),)


class MetaEstimatorMixin(object):
    """Mixin class for all meta estimators in scikit-learn."""
    # this is just a tag for the moment


class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
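
# --- Illustrative sketch (not part of the vendored sklearn code) ------------
# For single-output labels, the accuracy used by ClassifierMixin.score reduces
# to a (weighted) mean of exact matches. The hypothetical helper below shows
# that reduction with plain NumPy; it is not the accuracy_score implementation.
def _example_mean_accuracy(y_true, y_pred, sample_weight=None):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # 1.0 for every sample predicted exactly right, 0.0 otherwise.
    correct = (y_true == y_pred).astype(np.float64)
    return np.average(correct, weights=sample_weight)
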

class RegressorMixin(object):
    """Mixin class for all regression estimators in scikit-learn."""

    def score(self, X, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0; lower values are worse, and the score
        can be negative.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        return r2_score(y, self.predict(X), sample_weight=sample_weight)
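
# --- Illustrative sketch (not part of the vendored sklearn code) ------------
# The R^2 definition quoted in RegressorMixin.score can be computed directly
# for the single-output, unweighted case. This hypothetical helper mirrors the
# formula from the docstring; it is not the r2_score implementation.
def _example_r2(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    u = ((y_true - y_pred) ** 2).sum()           # residual sum of squares
    v = ((y_true - y_true.mean()) ** 2).sum()    # total sum of squares
    return 1.0 - u / v
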

######################### Preprocessing ################################


class OneHotEncoder:
    """Encode categorical integer features using a one-hot aka one-of-K scheme.

    The input to this transformer should be a matrix of integers, denoting
    the values taken on by categorical (discrete) features. The output will be
    a sparse matrix where each column corresponds to one possible value of one
    feature. It is assumed that input features take on values in the range
    [0, n_values).

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Parameters
    ----------
    n_values : 'auto', int or array of ints
        Number of values per feature.

        - 'auto' : determine value range from training data.
        - int : maximum value for all features.
        - array : maximum value per feature.

    categorical_features : "all" or array of indices or mask
        Specify what features are treated as categorical.

        - 'all' (default): All features are treated as categorical.
        - array of indices: Array of categorical feature indices.
        - mask: Array of length n_features and with dtype=bool.

        Non-categorical features are always stacked to the right of the matrix.

    dtype : number type, default=np.float
        Desired dtype of output.

    sparse : boolean, default=True
        Will return sparse matrix if set True else will return an array.

    handle_unknown : str, 'error' or 'ignore'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform.

    Attributes
    ----------
    active_features_ : array
        Indices for active features, meaning values that actually occur
        in the training set. Only available when n_values is ``'auto'``.

    feature_indices_ : array of shape (n_features,)
        Indices to feature ranges.
        Feature ``i`` in the original data is mapped to features
        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
        (and then potentially masked by `active_features_` afterwards)

    n_values_ : array of shape (n_features,)
        Maximum number of values per feature.

    Examples
    --------
    Given a dataset with three features and four samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder
    >>> enc = OneHotEncoder()
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
[1, 0, 2]])  # doctest: +ELLIPSIS
    OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
           handle_unknown='error', n_values='auto', sparse=True)
    >>> enc.n_values_
    array([2, 3, 4])
    >>> enc.feature_indices_
    array([0, 2, 5, 9])
    >>> enc.transform([[0, 1, 1]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])

    See also
    --------
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
        encoding of dictionary items or strings.
    """
    def __init__(self, n_values="auto", categorical_features="all",
                 dtype=np.float, sparse=True, handle_unknown='error'):
        self.n_values = n_values
        self.categorical_features = categorical_features
        self.dtype = dtype
        self.sparse = sparse
        self.handle_unknown = handle_unknown
    def fit(self, X, y=None):
        """Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Input array of type int.

        Returns
        -------
        self
        """
        self.fit_transform(X)
        return self
    def _fit_transform(self, X):
        """Assumes X contains only categorical features."""
        X = check_array(X, dtype=np.int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape
        if self.n_values == 'auto':
            n_values = np.max(X, axis=0) + 1
        elif isinstance(self.n_values, numbers.Integral):
            if (np.max(X, axis=0) >= self.n_values).any():
                raise ValueError("Feature out of bounds for n_values=%d"
                                 % self.n_values)
            n_values = np.empty(n_features, dtype=np.int)
            n_values.fill(self.n_values)
        else:
            try:
                n_values = np.asarray(self.n_values, dtype=int)
            except (ValueError, TypeError):
                raise TypeError("Wrong type for parameter `n_values`. Expected"
                                " 'auto', int or array of ints, got %r"
                                % type(self.n_values))
            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self.n_values_ = n_values
        n_values = np.hstack([[0], n_values])
        indices = np.cumsum(n_values)
        self.feature_indices_ = indices

        column_indices = (X + indices[:-1]).ravel()
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)
        # Original (sparse) construction, kept for reference:
        # data = np.ones(n_samples * n_features)
        # out = sparse.coo_matrix((data, (row_indices, column_indices)),
        #                         shape=(n_samples, indices[-1]),
        #                         dtype=self.dtype).tocsr()
        # This vendored version builds the one-hot matrix densely instead.
        out = np.zeros((n_samples, indices[-1]), dtype=self.dtype)
        out[row_indices, column_indices] = 1

        if self.n_values == 'auto':
            mask = np.array(out.sum(axis=0)).ravel() != 0
            active_features = np.where(mask)[0]
            out = out[:, active_features]
            self.active_features_ = active_features

        # ``out`` is already a dense ndarray here, so return it as-is.
        return out
    def fit_transform(self, X, y=None):
        """Fit OneHotEncoder to X, then transform X.

        Equivalent to self.fit(X).transform(X), but more convenient and more
        efficient. See fit for the parameters, transform for the return value.
        """
        return _transform_selected(X, self._fit_transform,
                                   self.categorical_features, copy=True)
    def _transform(self, X):
        """Assumes X contains only categorical features."""
        X = check_array(X, dtype=np.int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        indices = self.feature_indices_
        if n_features != indices.shape[0] - 1:
            raise ValueError("X has different shape than during fitting."
                             " Expected %d, got %d."
                             % (indices.shape[0] - 1, n_features))

        # We use only those categorical features of X that are known from fit,
        # i.e. less than n_values_, using mask.
        # This means that if self.handle_unknown is "ignore", the row_indices
        # and col_indices corresponding to the unknown categorical feature are
        # ignored.
        mask = (X < self.n_values_).ravel()
        if np.any(~mask):
            if self.handle_unknown not in ['error', 'ignore']:
                raise ValueError("handle_unknown should be either 'error' or "
                                 "'ignore', got %s" % self.handle_unknown)
            if self.handle_unknown == 'error':
                raise ValueError("unknown categorical feature present %s "
                                 "during transform." % X[~mask])

        column_indices = (X + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(np.sum(mask))
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.n_values == 'auto':
            out = out[:, self.active_features_]

        return out if self.sparse else out.toarray()
    def transform(self, X):
        """Transform X using one-hot encoding.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Input array of type int.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
            Transformed input.
        """
        return _transform_selected(X, self._transform,
                                   self.categorical_features, copy=True)
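
# --- Illustrative sketch (not part of the vendored sklearn code) ------------
# The dense construction in OneHotEncoder._fit_transform offsets each
# categorical value by the cumulative number of values of the preceding
# features and scatters ones at those columns. This standalone NumPy sketch
# repeats that arithmetic on a small hypothetical input.
def _example_dense_one_hot():
    X = np.array([[0, 0, 3],
                  [1, 1, 0],
                  [0, 2, 1],
                  [1, 0, 2]])
    n_samples, n_features = X.shape
    n_values = np.max(X, axis=0) + 1                  # [2, 3, 4]
    indices = np.cumsum(np.hstack([[0], n_values]))   # [0, 2, 5, 9]
    column_indices = (X + indices[:-1]).ravel()
    row_indices = np.repeat(np.arange(n_samples), n_features)
    out = np.zeros((n_samples, indices[-1]))
    out[row_indices, column_indices] = 1
    # First row: feature 0 takes value 0, feature 1 takes value 0 and
    # feature 2 takes value 3, so out[0] is [1, 0, 1, 0, 0, 0, 0, 0, 1].
    return out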