# Source code for responsibly.dataset.core
import abc
class Dataset(abc.ABC):
    """Base class for datasets.

    The constructor and the three pipeline hooks (:meth:`_load_data`,
    :meth:`_preprocess`, :meth:`_validate`) are abstract, so a concrete
    dataset must override all of them.  Calling ``super().__init__(...)``
    from the subclass constructor runs the load / preprocess / validate
    pipeline defined here.

    Attributes
        - `df` - :class:`pandas.DataFrame` that holds the actual data.
        - `target` - Column name of the variable to predict
                    (ground truth)
        - `sensitive_attributes` - Column names of the
                                sensitive attributes
        - `prediction` - Column name of the
                        prediction (optional)
    """

    @abc.abstractmethod
    def __init__(self, target, sensitive_attributes, prediction=None):
        """Load, preprocess and validate the dataset.

        The steps run in a fixed order: :meth:`_load_data` provides `df`,
        :meth:`_preprocess` is called, the column names are stored on the
        instance and finally :meth:`_validate` is called.

        :param target: Column name of the variable
                    to predict (ground truth)
        :param sensitive_attributes: Column names of the
                                    sensitive attributes
        :param prediction: Column name of the
                           prediction (optional)
        :type target: str
        :type sensitive_attributes: list
        :type prediction: str
        :raises AssertionError: if the base :meth:`_validate` checks fail.
        """
        self.df = self._load_data()
        self._preprocess()

        # The display name is the first line of the concrete class'
        # docstring.  A class without a docstring has ``__doc__`` set to
        # ``None`` (and an empty or blank first line is not a usable name),
        # so fall back to the class name instead of crashing.
        doc_lines = (self.__doc__ or '').splitlines()
        if doc_lines and doc_lines[0].strip():
            self._name = doc_lines[0]
        else:
            self._name = type(self).__name__

        self.target = target
        self.sensitive_attributes = sensitive_attributes
        self.prediction = prediction
        self._validate()

    def __str__(self):
        """Return a one-line summary: name, shape and sensitive attributes."""
        return ('<{} {} rows, {} columns'
                ' in which {{{}}} are sensitive attributes>'
                .format(self._name,
                        len(self.df),
                        len(self.df.columns),
                        # ``str`` keeps non-string column labels printable.
                        ', '.join(str(attr)
                                  for attr in self.sensitive_attributes)))

    @abc.abstractmethod
    def _load_data(self):
        """Return the raw data; the result is stored in `df`."""

    @abc.abstractmethod
    def _preprocess(self):
        """Preprocess `df`; called right after :meth:`_load_data`."""

    @abc.abstractmethod
    def _validate(self):
        """Check that the configured columns exist in `df`.

        Although abstract, this method has a body: overriding
        implementations can run these base checks through
        ``super()._validate()``.

        :raises AssertionError: if `target` or any of the
                                `sensitive_attributes` is not a column
                                of `df`.
        """
        # Explicit raises are used instead of ``assert`` statements so that
        # the checks still run under ``python -O``.  ``AssertionError`` is
        # kept as the exception type for backward compatibility.
        if self.target not in self.df.columns:
            raise AssertionError(
                'the target label \'{}\' should be in the columns'
                .format(self.target))

        missing_attributes = [attr for attr in self.sensitive_attributes
                              if attr not in self.df.columns]
        if missing_attributes:
            raise AssertionError(
                'the sensitive attributes {{{}}} should be in the columns'
                .format(','.join(str(attr) for attr in missing_attributes)))

        # NOTE(review): disabled check kept for reference;
        # SENSITIVE_ATTRIBUTES is not defined in the visible part of
        # this module - confirm before re-enabling.
        # assert all(attr in SENSITIVE_ATTRIBUTES
        #           for attr in self.sensitive_attributes),\
        # ('the sensitive attributes {} can be only from {}.'  # noqa
        #  .format(self.sensitive_attributes, SENSITIVE_ATTRIBUTES))