diff --git a/python/isaac/external/utils.py b/python/isaac/external/utils.py
new file mode 100644
index 000000000..bc2c4053d
--- /dev/null
+++ b/python/isaac/external/utils.py
@@ -0,0 +1,911 @@
+import sys
+import inspect
+import warnings
+import numbers
+
+import numpy as np
+
+################################ six ########################################
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+    string_types = str
+    _iteritems = "items"
+
+else:
+    string_types = basestring
+    _iteritems = "iteritems"
+
+
+def iteritems(d, **kw):
+    """Return an iterator over the (key, value) pairs of a dictionary."""
+    return iter(getattr(d, _iteritems)(**kw))
+
+################################ utils ########################################
+
+# Parse np.__version__ (e.g. "1.9.2") into a tuple of ints for comparison;
+# np.version itself is a module and cannot be compared against a tuple.
+np_version = tuple(int(x) for x in np.__version__.split('.')[:3] if x.isdigit())
+
+if np_version < (1, 6, 2):
+    # Allow bincount to accept empty arrays
+    # https://github.com/numpy/numpy/commit/40f0844846a9d7665616b142407a3d74cb65a040
+    def bincount(x, weights=None, minlength=None):
+        if len(x) > 0:
+            return np.bincount(x, weights, minlength)
+        else:
+            if minlength is None:
+                minlength = 0
+            minlength = np.asscalar(np.asarray(minlength, dtype=np.intp))
+            return np.zeros(minlength, dtype=np.intp)
+
+else:
+    from numpy import bincount
+
+class DataConversionWarning(UserWarning):
+    """A warning on implicit data conversions happening in the code"""
+    pass
+
+class NotFittedError(ValueError, AttributeError):
+    """Exception class to raise if estimator is used before fitting
+
+    This class inherits from both ValueError and AttributeError to help with
+    exception handling and backward compatibility.
+    """
+
+def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
+    """Perform is_fitted validation for estimator.
+
+    Checks if the estimator is fitted by verifying the presence of
+    "all_or_any" of the passed attributes and raises a NotFittedError with the
+    given message.
+
+    Parameters
+    ----------
+    estimator : estimator instance.
+        estimator instance for which the check is performed.
+
+    attributes : attribute name(s) given as string or a list/tuple of strings
+        Eg. : ["coef_", "estimator_", ...], "coef_"
+
+    msg : string
+        The default error message is, "This %(name)s instance is not fitted
+        yet. Call 'fit' with appropriate arguments before using this method."
+
+        For custom messages if "%(name)s" is present in the message string,
+        it is substituted for the estimator name.
+
+        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
+
+    all_or_any : callable, {all, any}, default all
+        Specify whether all or any of the given attributes must exist.
+    """
+    if msg is None:
+        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
+               "appropriate arguments before using this method.")
+
+    if not hasattr(estimator, 'fit'):
+        raise TypeError("%s is not an estimator instance." % (estimator))
+
+    if not isinstance(attributes, (list, tuple)):
+        attributes = [attributes]
+
+    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
+        raise NotFittedError(msg % {'name': type(estimator).__name__})
+
+def compute_sample_weight(class_weight, y, indices=None):
+    """Estimate sample weights by class for unbalanced datasets.
+
+    Parameters
+    ----------
+    class_weight : dict, list of dicts, "auto", or None, optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one. For
+        multi-output problems, a list of dicts can be provided in the same
+        order as the columns of y.
+ + The "auto" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data. + + For multi-output, the weights of each column of y will be multiplied. + + y : array-like, shape = [n_samples] or [n_samples, n_outputs] + Array of original class labels per sample. + + indices : array-like, shape (n_subsample,), or None + Array of indices to be used in a subsample. Can be of length less than + n_samples in the case of a subsample, or equal to n_samples in the + case of a bootstrap subsample with repeated indices. If None, the + sample weight will be calculated over the full sample. Only "auto" is + supported for class_weight if this is provided. + + Returns + ------- + sample_weight_vect : ndarray, shape (n_samples,) + Array with sample weights as applied to the original y + """ + + y = np.atleast_1d(y) + if y.ndim == 1: + y = np.reshape(y, (-1, 1)) + n_outputs = y.shape[1] + + if isinstance(class_weight, string_types): + if class_weight != 'auto': + raise ValueError('The only valid preset for class_weight is ' + '"auto". Given "%s".' % class_weight) + elif (indices is not None and + not isinstance(class_weight, string_types)): + raise ValueError('The only valid class_weight for subsampling is ' + '"auto". Given "%s".' % class_weight) + elif n_outputs > 1: + if (not hasattr(class_weight, "__iter__") or + isinstance(class_weight, dict)): + raise ValueError("For multi-output, class_weight should be a " + "list of dicts, or a valid string.") + if len(class_weight) != n_outputs: + raise ValueError("For multi-output, number of elements in " + "class_weight should match number of outputs.") + + expanded_class_weight = [] + for k in range(n_outputs): + + y_full = y[:, k] + classes_full = np.unique(y_full) + classes_missing = None + + if class_weight == 'auto' or n_outputs == 1: + class_weight_k = class_weight + else: + class_weight_k = class_weight[k] + + if indices is not None: + # Get class weights for the subsample, covering all classes in + # case some labels that were present in the original data are + # missing from the sample. + y_subsample = y[indices, k] + classes_subsample = np.unique(y_subsample) + + weight_k = np.choose(np.searchsorted(classes_subsample, + classes_full), + compute_class_weight(class_weight_k, + classes_subsample, + y_subsample), + mode='clip') + + classes_missing = set(classes_full) - set(classes_subsample) + else: + weight_k = compute_class_weight(class_weight_k, + classes_full, + y_full) + + weight_k = weight_k[np.searchsorted(classes_full, y_full)] + + if classes_missing: + # Make missing classes' weight zero + weight_k[in1d(y_full, list(classes_missing))] = 0. + + expanded_class_weight.append(weight_k) + + expanded_class_weight = np.prod(expanded_class_weight, + axis=0, + dtype=np.float64) + + return expanded_class_weight + +def _assert_all_finite(X): + """Like assert_all_finite, but only for ndarray.""" + X = np.asanyarray(X) + # First try an O(n) time, O(1) space solution for the common case that + # everything is finite; fall back to O(n) space np.isfinite to prevent + # false positives from overflow in sum method. + if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) + and not np.isfinite(X).all()): + raise ValueError("Input contains NaN, infinity" + " or a value too large for %r." 
% X.dtype) + +def check_array(array, accept_sparse=None, dtype="numeric", order=None, + copy=False, force_all_finite=True, ensure_2d=True, + allow_nd=False, ensure_min_samples=1, ensure_min_features=1): + """Input validation on an array, list, sparse matrix or similar. + + By default, the input is converted to an at least 2nd numpy array. + If the dtype of the array is object, attempt converting to float, + raising on failure. + + Parameters + ---------- + array : object + Input object to check / convert. + + accept_sparse : string, list of string or None (default=None) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. None means that sparse matrix input will raise an error. + If the input is sparse but not in the allowed format, it will be + converted to the first listed format. + + dtype : string, type or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean (default=True) + Whether to raise an error on np.inf and np.nan in X. + + ensure_2d : boolean (default=True) + Whether to make X at least 2d. + + allow_nd : boolean (default=False) + Whether to allow X.ndim > 2. + + ensure_min_samples : int (default=1) + Make sure that the array has a minimum number of samples in its first + axis (rows for a 2D array). Setting to 0 disables this check. + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when the input data has effectively 2 + dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 + disables this check. + + Returns + ------- + X_converted : object + The converted and validated X. + """ + if isinstance(accept_sparse, str): + accept_sparse = [accept_sparse] + + # store whether originally we wanted numeric dtype + dtype_numeric = dtype == "numeric" + + if ensure_2d: + array = np.atleast_2d(array) + if dtype_numeric: + if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O": + # if input is object, convert to float. + dtype = np.float64 + else: + dtype = None + array = np.array(array, dtype=dtype, order=order, copy=copy) + # make sure we actually converted to numeric: + if dtype_numeric and array.dtype.kind == "O": + array = array.astype(np.float64) + if not allow_nd and array.ndim >= 3: + raise ValueError("Found array with dim %d. Expected <= 2" % + array.ndim) + if force_all_finite: + _assert_all_finite(array) + + shape_repr = _shape_repr(array.shape) + if ensure_min_samples > 0: + n_samples = _num_samples(array) + if n_samples < ensure_min_samples: + raise ValueError("Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required." + % (n_samples, shape_repr, ensure_min_samples)) + + if ensure_min_features > 0 and array.ndim == 2: + n_features = array.shape[1] + if n_features < ensure_min_features: + raise ValueError("Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required." 
+ % (n_features, shape_repr, ensure_min_features)) + return array + +def check_random_state(seed): + """Turn seed into a np.random.RandomState instance + + If seed is None, return the RandomState singleton used by np.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. + """ + if seed is None or seed is np.random: + return np.random.mtrand._rand + if isinstance(seed, (numbers.Integral, np.integer)): + return np.random.RandomState(seed) + if isinstance(seed, np.random.RandomState): + return seed + raise ValueError('%r cannot be used to seed a numpy.random.RandomState' + ' instance' % seed) + +def _shape_repr(shape): + """Return a platform independent reprensentation of an array shape + + Under Python 2, the `long` type introduces an 'L' suffix when using the + default %r format for tuples of integers (typically used to store the shape + of an array). + + Under Windows 64 bit (and Python 2), the `long` type is used by default + in numpy shapes even when the integer dimensions are well below 32 bit. + The platform specific type causes string messages or doctests to change + from one platform to another which is not desirable. + + Under Python 3, there is no more `long` type so the `L` suffix is never + introduced in string representation. + + >>> _shape_repr((1, 2)) + '(1, 2)' + >>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2 + >>> _shape_repr((one, 2 * one)) + '(1, 2)' + >>> _shape_repr((1,)) + '(1,)' + >>> _shape_repr(()) + '()' + """ + if len(shape) == 0: + return "()" + joined = ", ".join("%d" % e for e in shape) + if len(shape) == 1: + # special notation for singleton tuples + joined += ',' + return "(%s)" % joined + +def _num_samples(x): + """Return number of samples in array-like x.""" + if hasattr(x, 'fit'): + # Don't get num_samples from an ensembles length! + raise TypeError('Expected sequence or array-like, got ' + 'estimator %s' % x) + if not hasattr(x, '__len__') and not hasattr(x, 'shape'): + if hasattr(x, '__array__'): + x = np.asarray(x) + else: + raise TypeError("Expected sequence or array-like, got %s" % + type(x)) + if hasattr(x, 'shape'): + if len(x.shape) == 0: + raise TypeError("Singleton array %r cannot be considered" + " a valid collection." % x) + return x.shape[0] + else: + return len(x) +################################ metrics ######################################## +def _weighted_sum(sample_score, sample_weight, normalize=False): + if normalize: + return np.average(sample_score, weights=sample_weight) + elif sample_weight is not None: + return np.dot(sample_score, sample_weight) + else: + return sample_score.sum() + +def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): + """Accuracy classification score. + + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + normalize : bool, optional (default=True) + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. 
+ + Returns + ------- + score : float + If ``normalize == True``, return the correctly classified samples + (float), else it returns the number of correctly classified samples + (int). + + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + + See also + -------- + jaccard_similarity_score, hamming_loss, zero_one_loss + + Notes + ----- + In binary and multiclass classification, this function is equal + to the ``jaccard_similarity_score`` function. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred) + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False) + 2 + + In the multilabel case with binary label indicators: + >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + + # Compute accuracy for each possible representation + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if y_type.startswith('multilabel'): + differing_labels = count_nonzero(y_true - y_pred, axis=1) + score = differing_labels == 0 + else: + score = y_true == y_pred + + return _weighted_sum(score, sample_weight, normalize) + +def r2_score(y_true, y_pred, sample_weight=None): + """R^2 (coefficient of determination) regression score function. + + Best possible score is 1.0, lower values are worse. + + Parameters + ---------- + y_true : array-like of shape = [n_samples] or [n_samples, n_outputs] + Ground truth (correct) target values. + + y_pred : array-like of shape = [n_samples] or [n_samples, n_outputs] + Estimated target values. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + z : float + The R^2 score. + + Notes + ----- + This is not a symmetric function. + + Unlike most other scores, R^2 score may be negative (it need not actually + be the square of a quantity R). + + References + ---------- + .. [1] `Wikipedia entry on the Coefficient of determination + `_ + + Examples + -------- + >>> from sklearn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.938... + + """ + y_type, y_true, y_pred = _check_reg_targets(y_true, y_pred) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, np.newaxis] + else: + weight = 1. + + numerator = (weight * (y_true - y_pred) ** 2).sum(dtype=np.float64) + denominator = (weight * (y_true - np.average( + y_true, axis=0, weights=sample_weight)) ** 2).sum(dtype=np.float64) + + if denominator == 0.0: + if numerator == 0.0: + return 1.0 + else: + # arbitrary set to zero to avoid -inf scores, having a constant + # y_true is not interesting for scoring a regression anyway + return 0.0 + + return 1 - numerator / denominator + + +################################ base ######################################### +############################################################################### + + +class BaseEstimator(object): + """Base class for all estimators in scikit-learn + + Notes + ----- + All estimators should specify all the parameters that can be set + at the class level in their ``__init__`` as explicit keyword + arguments (no ``*args`` or ``**kwargs``). 
+ """ + + @classmethod + def _get_param_names(cls): + """Get parameter names for the estimator""" + # fetch the constructor or the original constructor before + # deprecation wrapping if any + init = getattr(cls.__init__, 'deprecated_original', cls.__init__) + if init is object.__init__: + # No explicit constructor to introspect + return [] + + # introspect the constructor arguments to find the model parameters + # to represent + args, varargs, kw, default = inspect.getargspec(init) + if varargs is not None: + raise RuntimeError("scikit-learn estimators should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s doesn't follow this convention." + % (cls, )) + # Remove 'self' + # XXX: This is going to fail if the init is a staticmethod, but + # who would do this? + args.pop(0) + args.sort() + return args + + def get_params(self, deep=True): + """Get parameters for this estimator. + + Parameters + ---------- + deep: boolean, optional + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + out = dict() + for key in self._get_param_names(): + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", DeprecationWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if len(w) and w[0].category == DeprecationWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + + # XXX: should we rather test if instance of estimator? + if deep and hasattr(value, 'get_params'): + deep_items = value.get_params().items() + out.update((key + '__' + k, val) for k, val in deep_items) + out[key] = value + return out + + def set_params(self, **params): + """Set the parameters of this estimator. + + The method works on simple estimators as well as on nested objects + (such as pipelines). The former have parameters of the form + ``__`` so that it's possible to update each + component of a nested object. + + Returns + ------- + self + """ + if not params: + # Simple optimisation to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + for key, value in iteritems(params): + split = key.split('__', 1) + if len(split) > 1: + # nested objects case + name, sub_name = split + if not name in valid_params: + raise ValueError('Invalid parameter %s for estimator %s' % + (name, self)) + sub_object = valid_params[name] + sub_object.set_params(**{sub_name: value}) + else: + # simple objects case + if not key in valid_params: + raise ValueError('Invalid parameter %s ' 'for estimator %s' + % (key, self.__class__.__name__)) + setattr(self, key, value) + return self + + def __repr__(self): + class_name = self.__class__.__name__ + return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), + offset=len(class_name),),) + +class MetaEstimatorMixin(object): + """Mixin class for all meta estimators in scikit-learn.""" + # this is just a tag for the moment + + +class ClassifierMixin(object): + """Mixin class for all classifiers in scikit-learn.""" + + def score(self, X, y, sample_weight=None): + """Returns the mean accuracy on the given test data and labels. 
+ + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. + + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + +class RegressorMixin(object): + """Mixin class for all regression estimators in scikit-learn.""" + + def score(self, X, y, sample_weight=None): + """Returns the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the regression + sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual + sum of squares ((y_true - y_true.mean()) ** 2).sum(). + Best possible score is 1.0, lower values are worse. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True values for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + """ + return r2_score(y, self.predict(X), sample_weight=sample_weight) + +######################### Preprocessing ################################ + +class OneHotEncoder: + """Encode categorical integer features using a one-hot aka one-of-K scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by categorical (discrete) features. The output will be + a sparse matrix where each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : maximum value for all features. + - array : maximum value per feature. + + categorical_features: "all" or array of indices or mask + Specify what features are treated as categorical. + + - 'all' (default): All features are treated as categorical. + - array of indices: Array of categorical feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-categorical features are always stacked to the right of the matrix. + + dtype : number type, default=np.float + Desired dtype of output. + + sparse : boolean, default=True + Will return sparse matrix if set True else will return an array. + + handle_unknown : str, 'error' or 'ignore' + Whether to raise an error or ignore if a unknown categorical feature is + present during transform. + + Attributes + ---------- + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. Only available when n_values is ``'auto'``. + + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. 
+ Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and two samples, we let the encoder + find the maximum value per feature and transform the data to a binary + one-hot encoding. + + >>> from sklearn.preprocessing import OneHotEncoder + >>> enc = OneHotEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OneHotEncoder(categorical_features='all', dtype=<... 'float'>, + handle_unknown='error', n_values='auto', sparse=True) + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 2, 5, 9]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot + encoding of dictionary items or strings. + """ + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float, sparse=True, handle_unknown='error'): + self.n_values = n_values + self.categorical_features = categorical_features + self.dtype = dtype + self.sparse = sparse + self.handle_unknown = handle_unknown + + def fit(self, X, y=None): + """Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_feature) + Input array of type int. + + Returns + ------- + self + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only categorical features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if self.n_values == 'auto': + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + if (np.max(X, axis=0) >= self.n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self.n_values) + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. 
Expected"
+                                " 'auto', int or array of ints, got %r"
+                                % type(X))
+            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
+                raise ValueError("Shape mismatch: if n_values is an array,"
+                                 " it has to be of shape (n_features,).")
+
+        self.n_values_ = n_values
+        n_values = np.hstack([[0], n_values])
+        indices = np.cumsum(n_values)
+        self.feature_indices_ = indices
+
+        column_indices = (X + indices[:-1]).ravel()
+        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+                                n_features)
+        # Dense construction of the one-hot indicator matrix. The original
+        # scipy.sparse construction is kept below for reference; scipy is
+        # not imported by this module.
+        # data = np.ones(n_samples * n_features)
+        # out = sparse.coo_matrix((data, (row_indices, column_indices)),
+        #                         shape=(n_samples, indices[-1]),
+        #                         dtype=self.dtype).tocsr()
+        out = np.zeros((n_samples, indices[-1]), dtype=self.dtype)
+        out[row_indices, column_indices] = 1
+
+        if self.n_values == 'auto':
+            mask = np.array(out.sum(axis=0)).ravel() != 0
+            active_features = np.where(mask)[0]
+            out = out[:, active_features]
+            self.active_features_ = active_features
+
+        # `out` is already a dense ndarray here, so it is returned as-is.
+        return out
+
+    def fit_transform(self, X, y=None):
+        """Fit OneHotEncoder to X, then transform X.
+
+        Equivalent to self.fit(X).transform(X), but more convenient and more
+        efficient. See fit for the parameters, transform for the return value.
+        """
+        return _transform_selected(X, self._fit_transform,
+                                   self.categorical_features, copy=True)
+
+    def _transform(self, X):
+        """Assumes X contains only categorical features."""
+        X = check_array(X, dtype=np.int)
+        if np.any(X < 0):
+            raise ValueError("X needs to contain only non-negative integers.")
+        n_samples, n_features = X.shape
+
+        indices = self.feature_indices_
+        if n_features != indices.shape[0] - 1:
+            raise ValueError("X has different shape than during fitting."
+                             " Expected %d, got %d."
+                             % (indices.shape[0] - 1, n_features))
+
+        # We use only those categorical features of X that were seen during
+        # fit, i.e. values less than n_values_, selected by the mask below.
+        # This means that if self.handle_unknown is "ignore", the row_indices
+        # and column_indices corresponding to unknown categorical features
+        # are ignored.
+        mask = (X < self.n_values_).ravel()
+        if np.any(~mask):
+            if self.handle_unknown not in ['error', 'ignore']:
+                raise ValueError("handle_unknown should be either 'error' or "
+                                 "'ignore', got %s" % self.handle_unknown)
+            if self.handle_unknown == 'error':
+                raise ValueError("unknown categorical feature present %s "
+                                 "during transform." % X[~mask])
+
+        column_indices = (X + indices[:-1]).ravel()[mask]
+        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
+                                n_features)[mask]
+        # Mirror the dense construction used in _fit_transform instead of
+        # building a scipy.sparse matrix, since scipy is not imported here.
+        out = np.zeros((n_samples, indices[-1]), dtype=self.dtype)
+        out[row_indices, column_indices] = 1
+        if self.n_values == 'auto':
+            out = out[:, self.active_features_]
+
+        return out
+
+    def transform(self, X):
+        """Transform X using one-hot encoding.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input array of type int.
+
+        Returns
+        -------
+        X_out : 2-d array
+            Transformed input.
+        """
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features, copy=True)
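Below is a short, illustrative usage sketch of the validation helpers added in this file (check_random_state, check_array, check_is_fitted). It is not part of the diff; it assumes the module is importable as isaac.external.utils once the package is built, and DummyEstimator is a made-up stand-in for any object exposing a fit method.

    import numpy as np
    from isaac.external.utils import (check_array, check_is_fitted,
                                      check_random_state, NotFittedError)

    rng = check_random_state(0)          # int seed -> np.random.RandomState
    X = check_array(rng.rand(5, 3))      # validated, at-least-2D numeric array

    class DummyEstimator(object):
        """Minimal object with a fit method, for illustration only."""
        def fit(self, X):
            self.coef_ = X.mean(axis=0)
            return self

    est = DummyEstimator()
    try:
        check_is_fitted(est, 'coef_')    # 'coef_' does not exist yet
    except NotFittedError as exc:
        print(exc)                       # "This DummyEstimator instance is not fitted yet. ..."
    est.fit(X)
    check_is_fitted(est, 'coef_')        # passes silently once 'coef_' exists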