# Source code for responsibly.dataset.fico

# Names exported by ``from <this module> import *``.
__all__ = ['build_FICO_dataset']

import numpy as np
import pandas as pd
from pkg_resources import resource_filename
from sklearn.metrics import auc

# Paths of the CSV data files that ship inside this package.
# NOTE(review): the file-name arguments of CDF_BY_RACE_PATH and
# TOTAL_BY_RACE_PATH were cut off in the text this module was recovered
# from; the names used here follow the naming of the surviving
# performance file -- confirm them against the packaged data files.
CDF_BY_RACE_PATH = resource_filename(__name__,
                                     'transrisk_cdf_by_race_ssa.csv')

PERFORMANCE_BY_RACE_PATH = resource_filename(__name__,
                                             'transrisk_performance_by_race_ssa.csv')  # pylint: disable=line-too-long

TOTAL_BY_RACE_PATH = resource_filename(__name__,
                                       'totals.csv')

def _cleanup_frame(frame):
    """Rename and re-order columns."""
    frame = frame.rename(columns={'Non- Hispanic white': 'White'})
    frame = frame.reindex(['Asian', 'Black', 'Hispanic', 'White'],
    return frame

def _read_totals():
    """Return a dict mapping each group to its total number of people.

    The totals file is read with its first column as the index, its
    columns are normalised by ``_cleanup_frame``, and the value in the
    ``'SSA'`` row is taken for every group column.
    """
    raw_totals = pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0)
    frame = _cleanup_frame(raw_totals)

    totals_by_group = {}
    for group in frame.columns:
        totals_by_group[group] = frame[group]['SSA']
    return totals_by_group

def _parse_data():
    """Parse the FICO score tables (CDF and performance per group).

    Both CSV files are read with their first column as the index and
    their columns are normalised by ``_cleanup_frame``.  All values are
    divided by 100 before being returned (the files presumably store
    percentages -- TODO confirm).

    :return: Tuple ``(cdfs, performance)`` of data frames, where
             ``performance`` is the complement (100 - value) of what the
             performance file stores.
    """
    cdfs = _cleanup_frame(pd.read_csv(CDF_BY_RACE_PATH, index_col=0))
    raw_performance = _cleanup_frame(pd.read_csv(PERFORMANCE_BY_RACE_PATH,
                                                 index_col=0))
    # NOTE(review): the complement is taken so that the result can be used
    # as the share of non-defaulters; this assumes the file stores the
    # share of defaulters -- confirm against the data.
    performance = 100 - raw_performance
    return (cdfs / 100, performance / 100)

def _load_data():
    """Load everything needed to build the dataset.

    :return: Tuple ``(totals, cdfs, performance)``: the result of
             ``_read_totals()`` followed by the two frames returned by
             ``_parse_data()``.
    """
    group_totals = _read_totals()
    cdfs, performance = _parse_data()
    return (group_totals, cdfs, performance)

def _get_pdfs(cdfs_df):
    cdf_vs = np.concatenate([[np.zeros_like(cdfs_df.values[0])],
    pdf_vs = (cdf_vs[1:] - cdf_vs[:-1])
    pdfs_df = pd.DataFrame(pdf_vs,
                           columns=cdfs_df.columns, index=cdfs_df.index)
    return pdfs_df

def _calc_tpr_fpr(pdfs_df, performance_df):
    dfs = []
    for value in [performance_df, 1 - performance_df]:
        proportion_per_score = value * pdfs_df

        proportion_over_all_scores = proportion_per_score.sum(axis=0)

        cum_prop_per_score = proportion_per_score[::-1].cumsum(axis=0)[::-1]

        rate = cum_prop_per_score / proportion_over_all_scores

        # by sklean convention, thresholds[0]
        # represents no instances being predicted positive
        # and is arbitrarily set to max(y_score) + 1
        rate.loc[max(rate.index) + 1] = [0] * len(rate.columns)


    tpr_df, fpr_df = dfs  # pylint: disable=unbalanced-tuple-unpacking
    return tpr_df, fpr_df

def _build_rocs(fpr_df, tpr_df):
    rocs = {}
    for group in fpr_df.columns:
        fprs = fpr_df[group].values[::-1]
        tprs = tpr_df[group].values[::-1]
        thresholds = fpr_df.index[::-1]

        rocs[group] = (fprs,

    return rocs

def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on a proprietary model created
    by FICO, hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome and prediction
    information per individual, but summarized statistics for each
    FICO score and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `total`       | Total number of individuals                          |
    +---------------+------------------------------------------------------+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt.

    """
    # NOTE(review): the URLs that belonged to the references in the
    # docstring were lost from the recovered text -- restore them.
    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    total = sum(totals.values())
    # Share of each group in the whole population.
    proportions = {group: group_total / total
                   for group, group_total in totals.items()}

    # Per group: sum over rows of (mass at the row * fraction of
    # non-defaulters at the row).
    base_rates = (pdfs_df * performance_df).sum()
    # Overall rate: group base rates weighted by the group proportions.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)
    aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _) in rocs.items()}

    return {'total': total,
            'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}