Source code for responsibly.dataset.core

import abc


class Dataset(abc.ABC):
    """Base class for datasets.

    Concrete datasets subclass this and implement ``__init__``,
    ``_load_data``, ``_preprocess`` and ``_validate``; the subclass
    ``__init__`` is expected to call this one via ``super()``.

    Attributes
        - `df` - :class:`pandas.DataFrame` that holds the actual data.
        - `target` - Column name of the variable to predict
          (ground truth)
        - `sensitive_attributes` - Column names of the sensitive
          attributes
        - `prediction` - Column name of the prediction (optional)
    """

    @abc.abstractmethod
    def __init__(self, target, sensitive_attributes, prediction=None):
        """Load, preprocess and validate the dataset.

        The data is loaded and preprocessed *before* ``target``,
        ``sensitive_attributes`` and ``prediction`` are stored on the
        instance, so ``_load_data`` and ``_preprocess`` cannot rely on
        those attributes being set.

        :param target: Column name of the variable to predict
                       (ground truth)
        :param sensitive_attributes: Column names of the sensitive
                                     attributes
        :param prediction: Column name of the prediction (optional)
        :type target: str
        :type sensitive_attributes: list
        :type prediction: str
        :raises AssertionError: if ``_validate`` rejects the columns.
        """
        self.df = self._load_data()
        self._preprocess()

        # Human-readable name used by ``__str__``: the first line of the
        # concrete class's docstring.  ``__doc__`` is not inherited, so a
        # subclass without a docstring has ``None`` here (and an empty
        # docstring has no lines at all); fall back to the class name
        # instead of crashing.
        doc_lines = (self.__doc__ or '').strip().splitlines()
        self._name = doc_lines[0] if doc_lines else type(self).__name__

        self.target = target
        self.sensitive_attributes = sensitive_attributes
        self.prediction = prediction

        self._validate()

    def __str__(self):
        """Return a one-line summary: name, shape, sensitive attributes."""
        return ('<{} {} rows, {} columns'
                ' in which {{{}}} are sensitive attributes>'
                .format(self._name,
                        len(self.df),
                        len(self.df.columns),
                        ', '.join(self.sensitive_attributes)))

    @abc.abstractmethod
    def _load_data(self):
        """Return the raw data; the result is stored as ``self.df``."""

    @abc.abstractmethod
    def _preprocess(self):
        """Clean or transform ``self.df`` after it has been loaded."""

    @abc.abstractmethod
    def _validate(self):
        """Check that the configured columns exist in ``self.df``.

        Subclasses must override this method and may call
        ``super()._validate()`` to reuse these checks.

        :raises AssertionError: if ``target`` or any of the
                                ``sensitive_attributes`` is not a column
                                of ``self.df``.
        """
        # Raise explicitly rather than using ``assert`` statements:
        # asserts are removed under ``python -O``, which would silently
        # disable the validation.  ``AssertionError`` is kept so existing
        # callers that catch it keep working.
        if self.target not in self.df.columns:
            raise AssertionError(
                'the target label \'{}\' should be in the columns'
                .format(self.target))

        missing = [attr for attr in self.sensitive_attributes
                   if attr not in self.df.columns]
        if missing:
            raise AssertionError(
                'the sensitive attributes {{{}}} should be in the columns'
                .format(','.join(missing)))

        # NOTE: the check below is disabled.  It refers to a
        # ``SENSITIVE_ATTRIBUTES`` name that is not defined in the
        # visible part of this module; kept for reference only.
        #
        # assert all(attr in SENSITIVE_ATTRIBUTES
        #            for attr in self.sensitive_attributes),\
        #     ('the sensitive attributes {} can be only from {}.'  # noqa
        #      .format(self.sensitive_attributes, SENSITIVE_ATTRIBUTES))