# Source code for medusa.classification_utils

from sklearn.preprocessing import OneHotEncoder
import numpy as np


def categorical_labels(one_hot_labels):
    """
    Converts one-hot encoded labels into categorical (integer) labels.

    Parameters
    ----------
    one_hot_labels: numpy array or list
        Two-dimensional labels [observations x classes].

    Returns
    -------
    cat_labels: numpy array
        For each observation (row), the column index of its maximum value.
        If a row has several maxima, the first one is returned.
    """
    return np.argmax(one_hot_labels, axis=1)


def one_hot_labels(categorical_labels):
    """
    Converts categorical labels into one-hot encoded labels.

    Parameters
    ----------
    categorical_labels: numpy array or list
        Categorical labels, one per observation. The input is flattened into
        a single column before being encoded.

    Returns
    -------
    one_hot: numpy array
        Dense array [observations x classes] with one column per distinct
        label found in the input.
    """
    # np.asarray lets plain lists through; they have no reshape method
    labels_column = np.asarray(categorical_labels).reshape(-1, 1)
    enc = OneHotEncoder(handle_unknown='ignore')
    # fit_transform returns a sparse matrix, so convert it to a dense array
    return enc.fit_transform(labels_column).toarray()


def k_fold_split(x, y, k, keys=None, shuffle=False):
    """
    Special implementation of k-fold splitting that allows splitting the
    dataset into k folds for cross validation according to a keys array.

    It returns a list with the dataset for each iteration (k iterations).

    Parameters
    ----------
    x: numpy array or list
        Training set data. Axis 0 represents each observation. Features could
        have one or more dimensions. For instance, [observations x eeg samples],
        [observations x eeg samples x channels]
    y: numpy array or list
        Training set labels.
    k: int
        Number of folds to split the dataset. If k is greater than the number
        of unique keys, some folds will have an empty test set.
    keys: numpy array or list
        Keys to split the dataset. If None, the dataset is split considering
        each observation independently. If not None, each position of the keys
        array identifies the set that owns the observation. For instance, this
        is useful to split the dataset by subjects or trials.
    shuffle: boolean
        True if you want to shuffle the keys randomly before building the
        folds. The global numpy random state is used.

    Returns
    -------
    sets: list
        List that contains a dict with the train and test set for each iteration
        of the k-fold algorithm. Each dict has the keys "x_train", "y_train",
        "x_test" and "y_test".

    Raises
    ------
    ValueError
        If x, y and keys do not have the same length along axis 0.

    Examples
    --------
    >>> k_fold_iter = k_fold_split(x, y, k)
    >>> k_fold_acc = 0
    >>> for fold in k_fold_iter:
    ...     model.fit(fold["x_train"], fold["y_train"])
    ...     y_test_pred = model.predict(fold["x_test"])
    ...     k_fold_acc += np.sum(y_test_pred == fold["y_test"])/len(fold["y_test"])
    >>> k_fold_acc = k_fold_acc/len(k_fold_iter)

    """
    # Convert to numpy arrays (no copy is made if they already are arrays; the
    # inputs are never modified and the returned sets are always new arrays)
    x = np.asarray(x)
    y = np.asarray(y)
    # If keys is None, each observation is treated independently
    if keys is None:
        keys = np.arange(len(x))
    else:
        keys = np.asarray(keys)
    if keys.shape[0] != x.shape[0] or keys.shape[0] != y.shape[0]:
        raise ValueError("Dimensions of x, y and keys arrays must match along"
                         " axis 0.")
    # Divide the unique key values in k folds
    keys_values = np.unique(keys)
    if shuffle:
        np.random.shuffle(keys_values)
    keys_folds = np.array_split(keys_values, k)
    # Divide the dataset
    k_fold_iter = list()
    for fold_keys in keys_folds:
        # Observations whose key belongs to this fold form the test set; all
        # the others form the train set
        test_mask = np.isin(keys, fold_keys)
        train_mask = ~test_mask
        # Save train and test sets of this iteration
        k_fold_iter.append({
            "x_train": x[train_mask],
            "y_train": y[train_mask],
            "x_test": x[test_mask],
            "y_test": y[test_mask],
        })
    return k_fold_iter