# Source code for responsibly.dataset.compas
__all__ = ['COMPASDataset']
import numpy as np
import pandas as pd
from pkg_resources import resource_filename
from responsibly.dataset.core import Dataset
# Location of the bundled ProPublica CSV, resolved relative to this package
# so that it is found regardless of the current working directory.
COMPAS_PATH = resource_filename(__name__, 'compas-scores-two-years.csv')
class COMPASDataset(Dataset):
    """ProPublica Recidivism/COMPAS Dataset.

    See :class:`~responsibly.dataset.Dataset` for a description of
    the arguments and attributes.

    References:
        https://github.com/propublica/compas-analysis
    """

    def __init__(self):
        """Configure the target, sensitive and prediction column names.

        ``y_pred`` and ``score_factor`` do not exist in the raw CSV; they
        are derived from ``score_text`` in :meth:`_preprocess`.
        """
        super().__init__(target='is_recid',
                         sensitive_attributes=['race', 'sex'],
                         prediction=['y_pred', 'score_factor',
                                     'score_text'])

    def _load_data(self):
        """Read the bundled COMPAS CSV file into a DataFrame.

        Returns:
            pandas.DataFrame: the raw content of ``COMPAS_PATH``.
        """
        return pd.read_csv(COMPAS_PATH)

    def _preprocess(self):
        """Perform the same preprocessing as the original analysis.

        Rows are filtered with the conditions below, then the columns
        ``length_of_stay``, ``score_factor`` and ``y_pred`` are added and
        the jail in/out columns are converted to datetimes. The result
        replaces ``self.df``.

        https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        """
        raw = self.df

        # NOTE(review): ``pandas.read_csv`` parses the literal 'N/A' as NaN
        # by default, so the ``score_text != 'N/A'`` condition may never
        # exclude a row here -- confirm whether a ``notna()`` check is
        # what is actually intended.
        keep = ((raw['days_b_screening_arrest'] <= 30)
                & (raw['days_b_screening_arrest'] >= -30)
                & (raw['is_recid'] != -1)
                & (raw['c_charge_degree'] != 'O')
                & (raw['score_text'] != 'N/A'))

        # Take an explicit copy: assigning new columns onto a boolean-mask
        # selection of another frame triggers SettingWithCopyWarning.
        df = raw[keep].copy()

        df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
        df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
        df['length_of_stay'] = df['c_jail_out'] - df['c_jail_in']

        # Binarize the score text: everything that is not 'Low' is treated
        # as a high score.
        df['score_factor'] = np.where(df['score_text'] != 'Low',
                                      'HighScore', 'LowScore')
        df['y_pred'] = (df['score_factor'] == 'HighScore')

        self.df = df

    def _validate(self):
        """Check that the preprocessed data has the expected dimensions.

        Runs the parent class validation first, then verifies the number
        of rows and columns of ``self.df``.

        Raises:
            AssertionError: if ``self.df`` does not have exactly 6172 rows
                and 56 columns.
        """
        super()._validate()

        # Raise explicitly instead of using ``assert`` statements so the
        # checks are not stripped when Python runs with ``-O``.
        n_rows = len(self.df)
        if n_rows != 6172:
            raise AssertionError('the number of rows should be 6172,'
                                 ' but it is {}.'.format(n_rows))

        n_columns = len(self.df.columns)
        if n_columns != 56:
            raise AssertionError('the number of columns should be 56,'
                                 ' but it is {}.'.format(n_columns))