# Source code for bcselector.datasets

import json
from os.path import dirname, join

import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer


def _discretize(vector, **kwargs):
    """Bin a single vector into ordinal codes with KBinsDiscretizer.

    Parameters
    ----------
    vector : np.array
        Values to discretize. The vector is reshaped into one column before
        being handed to the discretizer.
    kwargs
        Extra keyword arguments forwarded to the
        sklearn.preprocessing.KBinsDiscretizer constructor. ``encode`` is
        always set to ``'ordinal'`` by this helper.

    Returns
    -------
    discretized_vector: np.array
        The ``fit_transform`` result flattened back to one dimension.
    """
    binner = KBinsDiscretizer(encode='ordinal', **kwargs)
    # The discretizer is fed a 2-D input, so present the vector as one column.
    column = vector.reshape(-1, 1)
    binned_column = binner.fit_transform(column)
    return binned_column.reshape(-1)


def load_mimic3(as_frame=True, discretize_data=True, **kwargs):
    """Load and return the mimic3 dataset.

    The mimic3 dataset is a medical dataset with multiple target variables.
    Dataset is available at Physiobank [1]_.
    Costs of features were collected in article [2]_.

    =================   ==============
    Samples total                 6591
    Dimensionality                 306
    Target variables                10
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=True
        If True, the data is a pandas DataFrame including columns with
        appropriate names. The target is a pandas DataFrame with multiple
        target variables.
    discretize_data: bool, default=True
        If True, every column of the returned data is discretized with
        sklearn.preprocessing.KBinsDiscretizer.
    **kwargs
        Arguments passed to sklearn.preprocessing.KBinsDiscretizer constructor.
        Only used when `discretize_data=True`.

    Returns
    -------
    data : {np.ndarray, pd.DataFrame} of shape (6591, 306)
        The data matrix. If `as_frame=True`, `data` will be a pd.DataFrame.
    target: {np.ndarray, pd.DataFrame} of shape (6591, 10)
        The binary classification target variables. If `as_frame=True`,
        `target` will be a pd.DataFrame.
    costs: {dict, list}
        Cost of every feature in data. If `as_frame=True`, `costs` will be
        the dict loaded from the costs file; otherwise a list of its values.

    References
    ----------
    .. [1] MIMIC-III, a freely accessible critical care database.
           Johnson AEW, Pollard TJ, Shen L, Lehman LH, Feng M, Ghassemi M,
           Moody B, Szolovits P, Celi LA, and Mark RG.
           Scientific Data (2016). DOI: 10.1038/sdata.2016.35.
           Available at: http://www.nature.com/articles/sdata201635.
    .. [2] Paweł Teisseyre, Damien Zufferey, and Marta Słomka.
           Cost-sensitive classifier chains: Selecting low-cost features in
           multi-label classification. Pattern Recognition, 86, 09 2018.

    Examples
    --------
    >>> from bcselector.datasets import load_mimic3
    >>> data, target, costs = load_mimic3()
    """
    # The dataset files live in a directory next to this module.
    dataset_dir = join(dirname(__file__), 'data', 'mimic3')

    # Load data
    data = pd.read_csv(join(dataset_dir, 'mimic3.csv'))
    targets = pd.read_csv(join(dataset_dir, 'mimic3_targets.csv'))
    # Pin the encoding so decoding does not depend on the platform locale.
    with open(join(dataset_dir, 'mimic3_costs.json'), 'r', encoding='utf-8') as j:
        costs = json.load(j)

    if discretize_data:
        # Discretize each column independently (axis=0 walks the columns).
        data_discretized = np.apply_along_axis(
            func1d=_discretize, axis=0, arr=data.values, **kwargs)
        data = pd.DataFrame(data_discretized, columns=data.columns)

    if as_frame:
        return data, targets, costs
    return data.values, targets.values, list(costs.values())


def load_hepatitis(as_frame=True, discretize_data=True, **kwargs):
    """Load and return the hepatitis dataset.

    The hepatitis dataset is a small medical dataset with single target
    variable. Dataset is collected from UCI repository [3]_.

    =================   ==============
    Samples total                  155
    Dimensionality                  19
    Target variables                 1
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=True
        If True, the data is a pandas DataFrame including columns with
        appropriate names and the target is the ``'Class'`` column returned
        as a pandas Series.
    discretize_data: bool, default=True
        If True, every column of the returned data is discretized with
        sklearn.preprocessing.KBinsDiscretizer.
    **kwargs
        Arguments passed to sklearn.preprocessing.KBinsDiscretizer constructor.
        Only used when `discretize_data=True`.

    Returns
    -------
    data : {np.ndarray, pd.DataFrame} of shape (155, 19)
        The data matrix. If `as_frame=True`, `data` will be a pd.DataFrame.
    target: {np.ndarray, pd.Series}
        The binary classification target variable. If `as_frame=True`,
        `target` will be the ``'Class'`` pd.Series; otherwise it is the
        two-dimensional array of the whole target file.
    costs: {dict, list}
        Cost of every feature in data. If `as_frame=True`, `costs` will be
        the dict loaded from the costs file; otherwise a list of its values.

    References
    ----------
    .. [3] Dua, D. and Graff, C. (2019). UCI Machine Learning Repository
           [http://archive.ics.uci.edu/ml]. Irvine, CA: University of
           California, School of Information and Computer Science.

    Examples
    --------
    >>> from bcselector.datasets import load_hepatitis
    >>> data, target, costs = load_hepatitis()
    """
    # The dataset files live in a directory next to this module.
    dataset_dir = join(dirname(__file__), 'data', 'hepatitis')

    # Load data
    data = pd.read_csv(join(dataset_dir, 'hepatitis.csv'))
    targets = pd.read_csv(join(dataset_dir, 'hepatitis_target.csv'))
    # Pin the encoding so decoding does not depend on the platform locale.
    with open(join(dataset_dir, 'hepatitis_costs.json'), 'r', encoding='utf-8') as j:
        costs = json.load(j)

    if discretize_data:
        # Discretize each column independently (axis=0 walks the columns).
        data_discretized = np.apply_along_axis(
            func1d=_discretize, axis=0, arr=data.values, **kwargs)
        data = pd.DataFrame(data_discretized, columns=data.columns)

    if as_frame:
        return data, targets['Class'], costs
    # NOTE(review): this branch returns the full 2-D targets array rather than
    # the flattened 'Class' column; kept as-is for backward compatibility.
    return data.values, targets.values, list(costs.values())