Files
triton/python/isaac/autotuning/external/sklearn/utils.py

912 lines
34 KiB
Python
Raw Normal View History

2015-08-13 18:16:55 -07:00
import sys
import inspect
import warnings
import numbers
import numpy as np
################################ six ########################################
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
if PY3:
string_types = str
_iteritems = "items"
else:
string_types = basestring
_iteritems = "iteritems"
def iteritems(d, **kw):
"""Return an iterator over the (key, value) pairs of a dictionary."""
return iter(getattr(d, _iteritems)(**kw))
################################ utils ########################################
if np.version < (1, 6, 2):
# Allow bincount to accept empty arrays
# https://github.com/numpy/numpy/commit/40f0844846a9d7665616b142407a3d74cb65a040
def bincount(x, weights=None, minlength=None):
if len(x) > 0:
return np.bincount(x, weights, minlength)
else:
if minlength is None:
minlength = 0
minlength = np.asscalar(np.asarray(minlength, dtype=np.intp))
return np.zeros(minlength, dtype=np.intp)
else:
from numpy import bincount
class DataConversionWarning(UserWarning):
"""A warning on implicit data conversions happening in the code"""
pass
class NotFittedError(ValueError, AttributeError):
"""Exception class to raise if estimator is used before fitting
This class inherits from both ValueError and AttributeError to help with
exception handling and backward compatibility.
"""
def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
"""Perform is_fitted validation for estimator.
Checks if the estimator is fitted by verifying the presence of
"all_or_any" of the passed attributes and raises a NotFittedError with the
given message.
Parameters
----------
estimator : estimator instance.
estimator instance for which the check is performed.
attributes : attribute name(s) given as string or a list/tuple of strings
Eg. : ["coef_", "estimator_", ...], "coef_"
msg : string
The default error message is, "This %(name)s instance is not fitted
yet. Call 'fit' with appropriate arguments before using this method."
For custom messages if "%(name)s" is present in the message string,
it is substituted for the estimator name.
Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
all_or_any : callable, {all, any}, default all
Specify whether all or any of the given attributes must exist.
"""
if msg is None:
msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
"appropriate arguments before using this method.")
if not hasattr(estimator, 'fit'):
raise TypeError("%s is not an estimator instance." % (estimator))
if not isinstance(attributes, (list, tuple)):
attributes = [attributes]
if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
raise NotFittedError(msg % {'name': type(estimator).__name__})
def compute_sample_weight(class_weight, y, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
----------
class_weight : dict, list of dicts, "auto", or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.
For multi-output, the weights of each column of y will be multiplied.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Array of original class labels per sample.
indices : array-like, shape (n_subsample,), or None
Array of indices to be used in a subsample. Can be of length less than
n_samples in the case of a subsample, or equal to n_samples in the
case of a bootstrap subsample with repeated indices. If None, the
sample weight will be calculated over the full sample. Only "auto" is
supported for class_weight if this is provided.
Returns
-------
sample_weight_vect : ndarray, shape (n_samples,)
Array with sample weights as applied to the original y
"""
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
if isinstance(class_weight, string_types):
if class_weight != 'auto':
raise ValueError('The only valid preset for class_weight is '
'"auto". Given "%s".' % class_weight)
elif (indices is not None and
not isinstance(class_weight, string_types)):
raise ValueError('The only valid class_weight for subsampling is '
'"auto". Given "%s".' % class_weight)
elif n_outputs > 1:
if (not hasattr(class_weight, "__iter__") or
isinstance(class_weight, dict)):
raise ValueError("For multi-output, class_weight should be a "
"list of dicts, or a valid string.")
if len(class_weight) != n_outputs:
raise ValueError("For multi-output, number of elements in "
"class_weight should match number of outputs.")
expanded_class_weight = []
for k in range(n_outputs):
y_full = y[:, k]
classes_full = np.unique(y_full)
classes_missing = None
if class_weight == 'auto' or n_outputs == 1:
class_weight_k = class_weight
else:
class_weight_k = class_weight[k]
if indices is not None:
# Get class weights for the subsample, covering all classes in
# case some labels that were present in the original data are
# missing from the sample.
y_subsample = y[indices, k]
classes_subsample = np.unique(y_subsample)
weight_k = np.choose(np.searchsorted(classes_subsample,
classes_full),
compute_class_weight(class_weight_k,
classes_subsample,
y_subsample),
mode='clip')
classes_missing = set(classes_full) - set(classes_subsample)
else:
weight_k = compute_class_weight(class_weight_k,
classes_full,
y_full)
weight_k = weight_k[np.searchsorted(classes_full, y_full)]
if classes_missing:
# Make missing classes' weight zero
weight_k[in1d(y_full, list(classes_missing))] = 0.
expanded_class_weight.append(weight_k)
expanded_class_weight = np.prod(expanded_class_weight,
axis=0,
dtype=np.float64)
return expanded_class_weight
def _assert_all_finite(X):
"""Like assert_all_finite, but only for ndarray."""
X = np.asanyarray(X)
# First try an O(n) time, O(1) space solution for the common case that
# everything is finite; fall back to O(n) space np.isfinite to prevent
# false positives from overflow in sum method.
if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum())
and not np.isfinite(X).all()):
raise ValueError("Input contains NaN, infinity"
" or a value too large for %r." % X.dtype)
def check_array(array, accept_sparse=None, dtype="numeric", order=None,
copy=False, force_all_finite=True, ensure_2d=True,
allow_nd=False, ensure_min_samples=1, ensure_min_features=1):
"""Input validation on an array, list, sparse matrix or similar.
By default, the input is converted to an at least 2nd numpy array.
If the dtype of the array is object, attempt converting to float,
raising on failure.
Parameters
----------
array : object
Input object to check / convert.
accept_sparse : string, list of string or None (default=None)
String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. None means that sparse matrix input will raise an error.
If the input is sparse but not in the allowed format, it will be
converted to the first listed format.
dtype : string, type or None (default="numeric")
Data type of result. If None, the dtype of the input is preserved.
If "numeric", dtype is preserved unless array.dtype is object.
order : 'F', 'C' or None (default=None)
Whether an array will be forced to be fortran or c-style.
copy : boolean (default=False)
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.
force_all_finite : boolean (default=True)
Whether to raise an error on np.inf and np.nan in X.
ensure_2d : boolean (default=True)
Whether to make X at least 2d.
allow_nd : boolean (default=False)
Whether to allow X.ndim > 2.
ensure_min_samples : int (default=1)
Make sure that the array has a minimum number of samples in its first
axis (rows for a 2D array). Setting to 0 disables this check.
ensure_min_features : int (default=1)
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when the input data has effectively 2
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
disables this check.
Returns
-------
X_converted : object
The converted and validated X.
"""
if isinstance(accept_sparse, str):
accept_sparse = [accept_sparse]
# store whether originally we wanted numeric dtype
dtype_numeric = dtype == "numeric"
if ensure_2d:
array = np.atleast_2d(array)
if dtype_numeric:
if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
# if input is object, convert to float.
dtype = np.float64
else:
dtype = None
array = np.array(array, dtype=dtype, order=order, copy=copy)
# make sure we actually converted to numeric:
if dtype_numeric and array.dtype.kind == "O":
array = array.astype(np.float64)
if not allow_nd and array.ndim >= 3:
raise ValueError("Found array with dim %d. Expected <= 2" %
array.ndim)
if force_all_finite:
_assert_all_finite(array)
shape_repr = _shape_repr(array.shape)
if ensure_min_samples > 0:
n_samples = _num_samples(array)
if n_samples < ensure_min_samples:
raise ValueError("Found array with %d sample(s) (shape=%s) while a"
" minimum of %d is required."
% (n_samples, shape_repr, ensure_min_samples))
if ensure_min_features > 0 and array.ndim == 2:
n_features = array.shape[1]
if n_features < ensure_min_features:
raise ValueError("Found array with %d feature(s) (shape=%s) while"
" a minimum of %d is required."
% (n_features, shape_repr, ensure_min_features))
return array
def check_random_state(seed):
"""Turn seed into a np.random.RandomState instance
If seed is None, return the RandomState singleton used by np.random.
If seed is an int, return a new RandomState instance seeded with seed.
If seed is already a RandomState instance, return it.
Otherwise raise ValueError.
"""
if seed is None or seed is np.random:
return np.random.mtrand._rand
if isinstance(seed, (numbers.Integral, np.integer)):
return np.random.RandomState(seed)
if isinstance(seed, np.random.RandomState):
return seed
raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
' instance' % seed)
def _shape_repr(shape):
"""Return a platform independent reprensentation of an array shape
Under Python 2, the `long` type introduces an 'L' suffix when using the
default %r format for tuples of integers (typically used to store the shape
of an array).
Under Windows 64 bit (and Python 2), the `long` type is used by default
in numpy shapes even when the integer dimensions are well below 32 bit.
The platform specific type causes string messages or doctests to change
from one platform to another which is not desirable.
Under Python 3, there is no more `long` type so the `L` suffix is never
introduced in string representation.
>>> _shape_repr((1, 2))
'(1, 2)'
>>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2
>>> _shape_repr((one, 2 * one))
'(1, 2)'
>>> _shape_repr((1,))
'(1,)'
>>> _shape_repr(())
'()'
"""
if len(shape) == 0:
return "()"
joined = ", ".join("%d" % e for e in shape)
if len(shape) == 1:
# special notation for singleton tuples
joined += ','
return "(%s)" % joined
def _num_samples(x):
"""Return number of samples in array-like x."""
if hasattr(x, 'fit'):
# Don't get num_samples from an ensembles length!
raise TypeError('Expected sequence or array-like, got '
'estimator %s' % x)
if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
if hasattr(x, '__array__'):
x = np.asarray(x)
else:
raise TypeError("Expected sequence or array-like, got %s" %
type(x))
if hasattr(x, 'shape'):
if len(x.shape) == 0:
raise TypeError("Singleton array %r cannot be considered"
" a valid collection." % x)
return x.shape[0]
else:
return len(x)
################################ metrics ########################################
def _weighted_sum(sample_score, sample_weight, normalize=False):
if normalize:
return np.average(sample_score, weights=sample_weight)
elif sample_weight is not None:
return np.dot(sample_score, sample_weight)
else:
return sample_score.sum()
def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
the set of labels predicted for a sample must *exactly* match the
corresponding set of labels in y_true.
Parameters
----------
y_true : 1d array-like, or label indicator array / sparse matrix
Ground truth (correct) labels.
y_pred : 1d array-like, or label indicator array / sparse matrix
Predicted labels, as returned by a classifier.
normalize : bool, optional (default=True)
If ``False``, return the number of correctly classified samples.
Otherwise, return the fraction of correctly classified samples.
sample_weight : array-like of shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
If ``normalize == True``, return the correctly classified samples
(float), else it returns the number of correctly classified samples
(int).
The best performance is 1 with ``normalize == True`` and the number
of samples with ``normalize == False``.
See also
--------
jaccard_similarity_score, hamming_loss, zero_one_loss
Notes
-----
In binary and multiclass classification, this function is equal
to the ``jaccard_similarity_score`` function.
Examples
--------
>>> import numpy as np
>>> from sklearn.metrics import accuracy_score
>>> y_pred = [0, 2, 1, 3]
>>> y_true = [0, 1, 2, 3]
>>> accuracy_score(y_true, y_pred)
0.5
>>> accuracy_score(y_true, y_pred, normalize=False)
2
In the multilabel case with binary label indicators:
>>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
0.5
"""
# Compute accuracy for each possible representation
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
if y_type.startswith('multilabel'):
differing_labels = count_nonzero(y_true - y_pred, axis=1)
score = differing_labels == 0
else:
score = y_true == y_pred
return _weighted_sum(score, sample_weight, normalize)
def r2_score(y_true, y_pred, sample_weight=None):
"""R^2 (coefficient of determination) regression score function.
Best possible score is 1.0, lower values are worse.
Parameters
----------
y_true : array-like of shape = [n_samples] or [n_samples, n_outputs]
Ground truth (correct) target values.
y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs]
Estimated target values.
sample_weight : array-like of shape = [n_samples], optional
Sample weights.
Returns
-------
z : float
The R^2 score.
Notes
-----
This is not a symmetric function.
Unlike most other scores, R^2 score may be negative (it need not actually
be the square of a quantity R).
References
----------
.. [1] `Wikipedia entry on the Coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
Examples
--------
>>> from sklearn.metrics import r2_score
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS
0.948...
>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
>>> y_pred = [[0, 2], [-1, 2], [8, -5]]
>>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS
0.938...
"""
y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred)
if sample_weight is not None:
sample_weight = column_or_1d(sample_weight)
weight = sample_weight[:, np.newaxis]
else:
weight = 1.
numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64)
denominator = (weight * (y_true - np.average(
y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64)
if denominator == 0.0:
if numerator == 0.0:
return 1.0
else:
# arbitrary set to zero to avoid -inf scores, having a constant
# y_true is not interesting for scoring a regression anyway
return 0.0
return 1 - numerator / denominator
################################ base #########################################
###############################################################################
class BaseEstimator(object):
"""Base class for all estimators in scikit-learn
Notes
-----
All estimators should specify all the parameters that can be set
at the class level in their ``__init__`` as explicit keyword
arguments (no ``*args`` or ``**kwargs``).
"""
@classmethod
def _get_param_names(cls):
"""Get parameter names for the estimator"""
# fetch the constructor or the original constructor before
# deprecation wrapping if any
init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
if init is object.__init__:
# No explicit constructor to introspect
return []
# introspect the constructor arguments to find the model parameters
# to represent
args, varargs, kw, default = inspect.getargspec(init)
if varargs is not None:
raise RuntimeError("scikit-learn estimators should always "
"specify their parameters in the signature"
" of their __init__ (no varargs)."
" %s doesn't follow this convention."
% (cls, ))
# Remove 'self'
# XXX: This is going to fail if the init is a staticmethod, but
# who would do this?
args.pop(0)
args.sort()
return args
def get_params(self, deep=True):
"""Get parameters for this estimator.
Parameters
----------
deep: boolean, optional
If True, will return the parameters for this estimator and
contained subobjects that are estimators.
Returns
-------
params : mapping of string to any
Parameter names mapped to their values.
"""
out = dict()
for key in self._get_param_names():
# We need deprecation warnings to always be on in order to
# catch deprecated param values.
# This is set in utils/__init__.py but it gets overwritten
# when running under python3 somehow.
warnings.simplefilter("always", DeprecationWarning)
try:
with warnings.catch_warnings(record=True) as w:
value = getattr(self, key, None)
if len(w) and w[0].category == DeprecationWarning:
# if the parameter is deprecated, don't show it
continue
finally:
warnings.filters.pop(0)
# XXX: should we rather test if instance of estimator?
if deep and hasattr(value, 'get_params'):
deep_items = value.get_params().items()
out.update((key + '__' + k, val) for k, val in deep_items)
out[key] = value
return out
def set_params(self, **params):
"""Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The former have parameters of the form
``<component>__<parameter>`` so that it's possible to update each
component of a nested object.
Returns
-------
self
"""
if not params:
# Simple optimisation to gain speed (inspect is slow)
return self
valid_params = self.get_params(deep=True)
for key, value in iteritems(params):
split = key.split('__', 1)
if len(split) > 1:
# nested objects case
name, sub_name = split
if not name in valid_params:
raise ValueError('Invalid parameter %s for estimator %s' %
(name, self))
sub_object = valid_params[name]
sub_object.set_params(**{sub_name: value})
else:
# simple objects case
if not key in valid_params:
raise ValueError('Invalid parameter %s ' 'for estimator %s'
% (key, self.__class__.__name__))
setattr(self, key, value)
return self
def __repr__(self):
class_name = self.__class__.__name__
return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
offset=len(class_name),),)
class MetaEstimatorMixin(object):
"""Mixin class for all meta estimators in scikit-learn."""
# this is just a tag for the moment
class ClassifierMixin(object):
"""Mixin class for all classifiers in scikit-learn."""
def score(self, X, y, sample_weight=None):
"""Returns the mean accuracy on the given test data and labels.
In multi-label classification, this is the subset accuracy
which is a harsh metric since you require for each sample that
each label set be correctly predicted.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True labels for X.
sample_weight : array-like, shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
Mean accuracy of self.predict(X) wrt. y.
"""
return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
class RegressorMixin(object):
"""Mixin class for all regression estimators in scikit-learn."""
def score(self, X, y, sample_weight=None):
"""Returns the coefficient of determination R^2 of the prediction.
The coefficient R^2 is defined as (1 - u/v), where u is the regression
sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual
sum of squares ((y_true - y_true.mean()) ** 2).sum().
Best possible score is 1.0, lower values are worse.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples.
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True values for X.
sample_weight : array-like, shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
R^2 of self.predict(X) wrt. y.
"""
return r2_score(y, self.predict(X), sample_weight=sample_weight)
######################### Preprocessing ################################
class OneHotEncoder:
"""Encode categorical integer features using a one-hot aka one-of-K scheme.
The input to this transformer should be a matrix of integers, denoting
the values taken on by categorical (discrete) features. The output will be
a sparse matrix where each column corresponds to one possible value of one
feature. It is assumed that input features take on values in the range
[0, n_values).
This encoding is needed for feeding categorical data to many scikit-learn
estimators, notably linear models and SVMs with the standard kernels.
Parameters
----------
n_values : 'auto', int or array of ints
Number of values per feature.
- 'auto' : determine value range from training data.
- int : maximum value for all features.
- array : maximum value per feature.
categorical_features: "all" or array of indices or mask
Specify what features are treated as categorical.
- 'all' (default): All features are treated as categorical.
- array of indices: Array of categorical feature indices.
- mask: Array of length n_features and with dtype=bool.
Non-categorical features are always stacked to the right of the matrix.
dtype : number type, default=np.float
Desired dtype of output.
sparse : boolean, default=True
Will return sparse matrix if set True else will return an array.
handle_unknown : str, 'error' or 'ignore'
Whether to raise an error or ignore if a unknown categorical feature is
present during transform.
Attributes
----------
active_features_ : array
Indices for active features, meaning values that actually occur
in the training set. Only available when n_values is ``'auto'``.
feature_indices_ : array of shape (n_features,)
Indices to feature ranges.
Feature ``i`` in the original data is mapped to features
from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
(and then potentially masked by `active_features_` afterwards)
n_values_ : array of shape (n_features,)
Maximum number of values per feature.
Examples
--------
Given a dataset with three features and two samples, we let the encoder
find the maximum value per feature and transform the data to a binary
one-hot encoding.
>>> from sklearn.preprocessing import OneHotEncoder
>>> enc = OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
[1, 0, 2]]) # doctest: +ELLIPSIS
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
handle_unknown='error', n_values='auto', sparse=True)
>>> enc.n_values_
array([2, 3, 4])
>>> enc.feature_indices_
array([0, 2, 5, 9])
>>> enc.transform([[0, 1, 1]]).toarray()
array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]])
See also
--------
sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
dictionary items (also handles string-valued features).
sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
encoding of dictionary items or strings.
"""
def __init__(self, n_values="auto", categorical_features="all",
dtype=np.float, sparse=True, handle_unknown='error'):
self.n_values = n_values
self.categorical_features = categorical_features
self.dtype = dtype
self.sparse = sparse
self.handle_unknown = handle_unknown
def fit(self, X, y=None):
"""Fit OneHotEncoder to X.
Parameters
----------
X : array-like, shape=(n_samples, n_feature)
Input array of type int.
Returns
-------
self
"""
self.fit_transform(X)
return self
def _fit_transform(self, X):
"""Assumes X contains only categorical features."""
X = check_array(X, dtype=np.int)
if np.any(X < 0):
raise ValueError("X needs to contain only non-negative integers.")
n_samples, n_features = X.shape
if self.n_values == 'auto':
n_values = np.max(X, axis=0) + 1
elif isinstance(self.n_values, numbers.Integral):
if (np.max(X, axis=0) >= self.n_values).any():
raise ValueError("Feature out of bounds for n_values=%d"
% self.n_values)
n_values = np.empty(n_features, dtype=np.int)
n_values.fill(self.n_values)
else:
try:
n_values = np.asarray(self.n_values, dtype=int)
except (ValueError, TypeError):
raise TypeError("Wrong type for parameter `n_values`. Expected"
" 'auto', int or array of ints, got %r"
% type(X))
if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
raise ValueError("Shape mismatch: if n_values is an array,"
" it has to be of shape (n_features,).")
self.n_values_ = n_values
n_values = np.hstack([[0], n_values])
indices = np.cumsum(n_values)
self.feature_indices_ = indices
column_indices = (X + indices[:-1]).ravel()
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
n_features)
#data = np.ones(n_samples * n_features)
#out = sparse.coo_matrix((data, (row_indices, column_indices)),
# shape=(n_samples, indices[-1]),
# dtype=self.dtype).tocsr()
data = np.zeros(n_samples, indices[-1], dtype = self.dtype)
data[row_indices, column_indices] = 1
print out
if self.n_values == 'auto':
mask = np.array(out.sum(axis=0)).ravel() != 0
active_features = np.where(mask)[0]
out = out[:, active_features]
self.active_features_ = active_features
return out if self.sparse else out.toarray()
def fit_transform(self, X, y=None):
"""Fit OneHotEncoder to X, then transform X.
Equivalent to self.fit(X).transform(X), but more convenient and more
efficient. See fit for the parameters, transform for the return value.
"""
return _transform_selected(X, self._fit_transform,
self.categorical_features, copy=True)
def _transform(self, X):
"""Assumes X contains only categorical features."""
X = check_array(X, dtype=np.int)
if np.any(X < 0):
raise ValueError("X needs to contain only non-negative integers.")
n_samples, n_features = X.shape
indices = self.feature_indices_
if n_features != indices.shape[0] - 1:
raise ValueError("X has different shape than during fitting."
" Expected %d, got %d."
% (indices.shape[0] - 1, n_features))
# We use only those catgorical features of X that are known using fit.
# i.e lesser than n_values_ using mask.
# This means, if self.handle_unknown is "ignore", the row_indices and
# col_indices corresponding to the unknown categorical feature are
# ignored.
mask = (X < self.n_values_).ravel()
if np.any(~mask):
if self.handle_unknown not in ['error', 'ignore']:
raise ValueError("handle_unknown should be either error or "
"unknown got %s" % self.handle_unknown)
if self.handle_unknown == 'error':
raise ValueError("unknown categorical feature present %s "
"during transform." % X[~mask])
column_indices = (X + indices[:-1]).ravel()[mask]
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
n_features)[mask]
data = np.ones(np.sum(mask))
out = sparse.coo_matrix((data, (row_indices, column_indices)),
shape=(n_samples, indices[-1]),
dtype=self.dtype).tocsr()
if self.n_values == 'auto':
out = out[:, self.active_features_]
return out if self.sparse else out.toarray()
def transform(self, X):
"""Transform X using one-hot encoding.
Parameters
----------
X : array-like, shape=(n_samples, n_features)
Input array of type int.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
Transformed input.
"""
return _transform_selected(X, self._transform,
self.categorical_features, copy=True)