Machine learning fairness#

Machine learning fairness is an important part of modern-day data modeling. Here we give an introduction to making models more fair and equitable.

Machine learning is a powerful tool that has revolutionized many industries by enabling computers to learn from data and make predictions or decisions.

However, as machine learning algorithms become increasingly ubiquitous in our daily lives, concerns about fairness and equity have emerged. Machine learning fairness refers to the idea that machine learning models should not perpetuate or exacerbate existing biases or discrimination. Fairness means that the model treats all individuals or groups fairly, regardless of race, gender, ethnicity, or other protected characteristics.

This notebook will provide an overview of the key concepts and challenges in machine learning fairness, as well as some techniques commonly used to address them.

How To#

from sklearn.model_selection import train_test_split
import pandas as pd

# Read the housing table, then discard every row that has a missing value.
raw = pd.read_csv("data/housing.csv")
df = raw.dropna()
# Trailing expression so the notebook displays the first rows.
df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
# Predictors: every column except the coordinates, the categorical
# ocean_proximity column and the regression target itself.
features = df.drop(
    columns=["longitude", "latitude", "ocean_proximity", "median_house_value"]
)
target = df.median_house_value

# First split: half of the rows go to training, stratified by ocean_proximity.
x_train, x_, y_train, y_ = train_test_split(
    features, target, test_size=.5, stratify=df.ocean_proximity
)

# Second split: the held-out half is divided evenly into validation and test
# sets (this split is not stratified).
x_val, x_test, y_val, y_test = train_test_split(x_, y_, test_size=.5)
from sklearn.ensemble import RandomForestRegressor
# Build the regressor with default hyper-parameters and train it in place.
model = RandomForestRegressor()
model.fit(x_train, y_train)
# Trailing expression so the notebook displays the validation score.
model.score(x_val, y_val)
0.6539347643708056
from sklearn.model_selection import cross_val_score
def _report_split(split_name, estimator, features, target, class_index):
    """Print per-class scores of ``estimator`` on one data split.

    Prints, in order:
      1. the already-fitted estimator's score on the rows of ``features``
         that belong to ``class_index``;
      2. the array of cross-validation scores on those same rows;
      3. the mean and standard deviation of that array.

    Note that ``cross_val_score`` fits fresh clones of the estimator on the
    folds of the subset, so (2) and (3) describe a model trained on this
    class alone, not the fitted ``estimator`` itself.

    Args:
        split_name: Label used in the error message ("Validation" / "Test").
        estimator: Fitted scikit-learn estimator.
        features: Feature frame of the split.
        target: Target series of the split, indexed like ``features``.
        class_index: Index labels of all rows in ``df`` belonging to the class.
    """
    try:
        rows = features.index.intersection(class_index)
        class_features = features.loc[rows, :]
        class_target = target.loc[rows]

        print(estimator.score(class_features, class_target))

        scores = cross_val_score(estimator, class_features, class_target)
        print(scores)
        print(scores.mean(), " +- ", scores.std(), "\n")
    except ValueError as err:
        # scikit-learn raises ValueError when a class has too few (or zero)
        # samples in this split to predict on or to cut into CV folds.
        # Report why instead of silently swallowing every exception.
        print(f"Error in {split_name}: {err}")


for cls in df.ocean_proximity.unique():
    print(cls)
    # Index labels of every row of this class; computed once per class and
    # shared by the validation and test reports.
    class_index = df.index[df.ocean_proximity == cls]
    _report_split("Validation", model, x_val, y_val, class_index)
    _report_split("Test", model, x_test, y_test, class_index)
NEAR BAY
0.587686050802992
[0.46969838 0.45067197 0.48109773 0.428077   0.54191136]
0.4742912881440667  +-  0.03829759361646877 

0.6447497512417075
[0.58484457 0.59526471 0.63347549 0.43758752 0.57670932]
0.5655763225974818  +-  0.06688503393005851 

<1H OCEAN
0.6216868940511435
[0.60358274 0.62461935 0.62871463 0.61373241 0.63721671]
0.6215731672712625  +-  0.011748473416741366 

0.5920464685385907
[0.64567012 0.61500231 0.5595641  0.61634899 0.55311785]
0.5979406745596677  +-  0.03574888013497276 

INLAND
0.19422233382374166
[0.36300777 0.41052694 0.50897827 0.5265259  0.50476789]
0.462761355637806  +-  0.06425835499103538 

0.3109013894688205
[0.53474599 0.45803812 0.64085195 0.61749888 0.5743819 ]
0.5651033664550061  +-  0.0647360260589818 

NEAR OCEAN
0.5729438171550865
[0.54920562 0.65400732 0.62004661 0.5629213  0.49143857]
0.5755238855953249  +-  0.056681105327491974 

0.5972330716059948
[0.6604076  0.63663473 0.60370088 0.54050457 0.43262552]
0.5747746617749105  +-  0.08170337823809312 

ISLAND
-5.073788281236961
Error in Validation
Error in Test

Calculate Residuals#

from yellowbrick.regressor import residuals_plot, prediction_error
# Draw the residuals plot for the regressor using the train and test splits.
# Trailing expression, so the notebook also shows the returned visualizer.
residuals_plot(model, x_train, y_train, x_test, y_test)
../_images/cfaa5e8d65bc4214fa276ab19dafb8afbeab9b278958d09521034fc80ba4b7e9.png
ResidualsPlot(ax=<AxesSubplot:title={'center':'Residuals for RandomForestRegressor Model'}, xlabel='Predicted Value', ylabel='Residuals'>,
              line_color=None, model=None, test_alpha=None, test_color=None,
              train_alpha=None, train_color=None)
# Draw the prediction-error plot (y against predicted y) for the regressor.
prediction_error(model, x_train, y_train, x_test, y_test)
../_images/10811f17f0c50476eb2e65153d345807edd490f1a45d9c03b01cef6605caa859.png
PredictionError(ax=<AxesSubplot:title={'center':'Prediction Error for RandomForestRegressor'}, xlabel='$y$', ylabel='$\\hat{y}$'>,
                model=None)

Confusion Matrix for Classifiers#

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2. Prefer its replacement, `ConfusionMatrixDisplay.from_estimator`, which
# accepts the same (estimator, X, y, normalize=...) arguments, and fall back
# to the old function only on releases that predate it. The name
# `plot_confusion_matrix` is kept so later cells continue to work.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
else:
    from sklearn.metrics import plot_confusion_matrix

# Classification task: predict ocean_proximity from the remaining columns
# (coordinates are dropped; median_house_value is now a feature).
x_train, x_, y_train, y_ = train_test_split(
    df.drop(["longitude", "latitude", "ocean_proximity"], axis=1),
    df.ocean_proximity,
    test_size=.5,
    stratify=df.ocean_proximity,
)

# Divide the held-out half evenly into validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(x_, y_, test_size=.5)

model = RandomForestClassifier().fit(x_train, y_train)
# Trailing expression so the notebook displays the confusion matrix.
plot_confusion_matrix(model, x_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5f5c4d0580>
../_images/39568d358520a3ff8989db28d0ddba1035b9aa6014dbe922e8f5a4a5097893d4.png
# Same confusion matrix, but with normalize="all" each cell is shown as a
# fraction of all test samples instead of a raw count.
plot_confusion_matrix(model, x_test, y_test, normalize="all")
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5f29d2d9a0>
../_images/cd27c4c26531ed139c7e6b4584d23f5ab4366c76285b97e74e703b46b4e499fe.png

Other Visualizations that are important#

# yellowbrick helper functions; each call below renders one diagnostic figure
# for the classifier.
from yellowbrick.classifier import confusion_matrix, classification_report, precision_recall_curve, roc_auc
# Confusion matrix of true class against predicted class.
confusion_matrix(model, x_train, y_train, x_test, y_test)
../_images/4267ce0adc79601d6c9c4ad41613ecdb89421b66a6e49aed05012b8cbdbda1bf.png
ConfusionMatrix(ax=<AxesSubplot:title={'center':'RandomForestClassifier Confusion Matrix'}, xlabel='Predicted Class', ylabel='True Class'>,
                cmap=<matplotlib.colors.ListedColormap object at 0x7f5f290714f0>,
                model=None)
# This is the yellowbrick `classification_report` (imported from
# yellowbrick.classifier), which renders the report as a figure.
classification_report(model, x_train, y_train, x_test, y_test)
../_images/a3b90ef70a8f11e4448a2d3f124c90a6b45eec96c95a100789870e14cae89759.png
ClassificationReport(ax=<AxesSubplot:title={'center':'RandomForestClassifier Classification Report'}>,
                     cmap=<matplotlib.colors.ListedColormap object at 0x7f5f285f8430>,
                     model=None)
# Import under an alias: the bare name `classification_report` is already
# bound to the yellowbrick visualizer imported from yellowbrick.classifier,
# and rebinding it would break that cell if it were run again.
from sklearn.metrics import classification_report as sk_classification_report

# zero_division=0 reports 0.00 (without a warning) for any class whose
# precision is undefined because it was never predicted, which can happen
# for very rare classes.
print(sk_classification_report(y_test, model.predict(x_test), zero_division=0))
              precision    recall  f1-score   support

   <1H OCEAN       0.64      0.87      0.74      2226
      INLAND       0.80      0.80      0.80      1691
      ISLAND       0.00      0.00      0.00         2
    NEAR BAY       0.50      0.23      0.32       554
  NEAR OCEAN       0.44      0.08      0.14       636

    accuracy                           0.68      5109
   macro avg       0.47      0.40      0.40      5109
weighted avg       0.65      0.68      0.64      5109
# Draw the precision-recall curve for the classifier.
precision_recall_curve(model, x_train, y_train, x_test, y_test)
../_images/a199576fab92bed64c24e63381803ac549bc38ee280c438bc249a435c93c79cb.png
PrecisionRecallCurve(ax=<AxesSubplot:title={'center':'Precision-Recall Curve for RandomForestClassifier'}, xlabel='Recall', ylabel='Precision'>,
                     iso_f1_values={0.2, 0.4, 0.6, 0.8}, model=None)
# Draw the ROC curves for the classifier.
roc_auc(model, x_train, y_train, x_test, y_test)
../_images/38b5c19d407c633ffd71ce980e823076d14a442055046c135609502ae9fcc990.png
ROCAUC(ax=<AxesSubplot:title={'center':'ROC Curves for RandomForestClassifier'}, xlabel='False Positive Rate', ylabel='True Postive Rate'>,
       model=None)

Exercise#

Modify the code to generate dummy models (for example, scikit-learn's `DummyRegressor` or `DummyClassifier`) for each class.

Additional Resources#