# --- Imports and Global Configuration ---
import os
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import pandas as pd
from ucimlrepo import fetch_ucirepo

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

FIG_DIR = Path("docs/assets/figures")
DL_DIR = Path("docs/assets/downloads")
FIG_DIR.mkdir(parents=True, exist_ok=True)
DL_DIR.mkdir(parents=True, exist_ok=True)

plt.rcParams["figure.figsize"] = (6, 4)
plt.rcParams["figure.dpi"] = 120

# !pip install ucimlrepo

from ucimlrepo import fetch_ucirepo 
import pandas as pd

# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
print(adult.variables)

              name     role         type      demographic  \
0              age  Feature      Integer              Age   
1        workclass  Feature  Categorical           Income   
2           fnlwgt  Feature      Integer             None   
3        education  Feature  Categorical  Education Level   
4    education-num  Feature      Integer  Education Level   
5   marital-status  Feature  Categorical            Other   
6       occupation  Feature  Categorical            Other   
7     relationship  Feature  Categorical            Other   
8             race  Feature  Categorical             Race   
9              sex  Feature       Binary              Sex   
10    capital-gain  Feature      Integer             None   
11    capital-loss  Feature      Integer             None   
12  hours-per-week  Feature      Integer             None   
13  native-country  Feature  Categorical            Other   
14          income   Target       Binary           Income   

                                          description units missing_values  
0                                                 N/A  None             no  
1   Private, Self-emp-not-inc, Self-emp-inc, Feder...  None            yes  
2                                                None  None             no  
3    Bachelors, Some-college, 11th, HS-grad, Prof-...  None             no  
4                                                None  None             no  
5   Married-civ-spouse, Divorced, Never-married, S...  None             no  
6   Tech-support, Craft-repair, Other-service, Sal...  None            yes  
7   Wife, Own-child, Husband, Not-in-family, Other...  None             no  
8   White, Asian-Pac-Islander, Amer-Indian-Eskimo,...  None             no  
9                                       Female, Male.  None             no  
10                                               None  None             no  
11                                               None  None             no  
12                                               None  None             no  
13  United-States, Cambodia, England, Puerto-Rico,...  None            yes  
14                                       >50K, <=50K.  None             no

print(X)
print(y)

       age         workclass  fnlwgt  education  education-num  \
0       39         State-gov   77516  Bachelors             13   
1       50  Self-emp-not-inc   83311  Bachelors             13   
2       38           Private  215646    HS-grad              9   
3       53           Private  234721       11th              7   
4       28           Private  338409  Bachelors             13   
...    ...               ...     ...        ...            ...   
48837   39           Private  215419  Bachelors             13   
48838   64               NaN  321403    HS-grad              9   
48839   38           Private  374983  Bachelors             13   
48840   44           Private   83891  Bachelors             13   
48841   35      Self-emp-inc  182148  Bachelors             13   

           marital-status         occupation    relationship  \
0           Never-married       Adm-clerical   Not-in-family   
1      Married-civ-spouse    Exec-managerial         Husband   
2                Divorced  Handlers-cleaners   Not-in-family   
3      Married-civ-spouse  Handlers-cleaners         Husband   
4      Married-civ-spouse     Prof-specialty            Wife   
...                   ...                ...             ...   
48837            Divorced     Prof-specialty   Not-in-family   
48838             Widowed                NaN  Other-relative   
48839  Married-civ-spouse     Prof-specialty         Husband   
48840            Divorced       Adm-clerical       Own-child   
48841  Married-civ-spouse    Exec-managerial         Husband   

                     race     sex  capital-gain  capital-loss  hours-per-week  \
0                   White    Male          2174             0              40   
1                   White    Male             0             0              13   
2                   White    Male             0             0              40   
3                   Black    Male             0             0              40   
4                   Black  Female             0             0              40   
...                   ...     ...           ...           ...             ...   
48837               White  Female             0             0              36   
48838               Black    Male             0             0              40   
48839               White    Male             0             0              50   
48840  Asian-Pac-Islander    Male          5455             0              40   
48841               White    Male             0             0              60   

      native-country  
0      United-States  
1      United-States  
2      United-States  
3      United-States  
4               Cuba  
...              ...  
48837  United-States  
48838  United-States  
48839  United-States  
48840  United-States  
48841  United-States  

[48842 rows x 14 columns]
       income
0       <=50K
1       <=50K
2       <=50K
3       <=50K
4       <=50K
...       ...
48837  <=50K.
48838  <=50K.
48839  <=50K.
48840  <=50K.
48841   >50K.

[48842 rows x 1 columns]

df = pd.concat([X, y], axis=1)
df

df.info()
df.describe(include="all")
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

import matplotlib.pyplot
# matplotlib.use("Agg")  # "headless" backend for CI
import matplotlib.pyplot as plt


df['income'].value_counts(normalize=True).plot(kind="bar")
matplotlib.pyplot.title("Income Distribution (>50K vs <=50K)")
plt.show()

# --- CLEANING THE income VARIABLE ---

# Check original unique values
print("Original unique values in income:")
print(df['income'].unique())

# 1. Convert everything to uppercase to avoid issues with 'K' vs 'k'
# 2. Remove extra leading/trailing whitespace
# 3. Remove '.' characters that appear in some values
df['income'] = df['income'].str.upper().str.strip().str.replace('.', '', regex=False)

# Check unique values after cleaning
print("\nUnique values after cleaning:")
print(df['income'].unique())

# --- VISUALIZATION OF THE DISTRIBUTION ---
df['income'].value_counts(normalize=True).plot(kind="bar")
plt.figure(figsize=(5, 3))
plt.title("Income Distribution (>50K vs <=50K)")
plt.ylabel("Proportion")
plt.xlabel("Income")
plt.show()

Original unique values in income:
['<=50K' '>50K' '<=50K.' '>50K.']

Unique values after cleaning:
['<=50K' '>50K']

# =========================================================
# CLEANING CATEGORICAL VARIABLES
# =========================================================

import numpy as np
import pandas as pd

# -----------------------------
# 1) Normalize the TARGET variable (income)
# -----------------------------
if 'income' in df.columns:
    print("Original unique values in income:", df['income'].unique())
    df['income'] = (
        df['income']
        .astype(str)
        .str.upper()
        .str.strip()
        .str.replace('.', '', regex=False)
    )
    print("Unique values after cleaning income:", df['income'].unique())
    assert set(df['income'].unique()) <= {">50K", "<=50K"}, "Income contains values outside {>50K, <=50K}"

# -----------------------------
# 2) Categorical variables to clean
# -----------------------------
cat_cols = ["workclass", "marital-status", "occupation", "relationship", "native-country"]

# -----------------------------
# 3) Replace "?" with NaN and strip whitespace
# -----------------------------
for col in cat_cols:
    print(f"\n[{col}] unique values (before):", df[col].unique()[:15], "...")
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()                
        .replace({"?": np.nan})    
    )
    print(f"[{col}] unique values (after):", df[col].dropna().unique()[:15], "...")
    print(f"[{col}] missing values after:", df[col].isna().sum())

# -----------------------------
# 4) Group infrequent countries into "Other"
# -----------------------------
COUNTRY_MIN_COUNT = 200

if "native-country" in df.columns:
    country_counts = df["native-country"].value_counts(dropna=True)
    common_countries = country_counts[country_counts >= COUNTRY_MIN_COUNT].index

    df["native-country"] = df["native-country"].where(df["native-country"].isna() | df["native-country"].isin(common_countries), "Other")

    print("\nSummary of native-country after grouping:")
    print(df["native-country"].value_counts(dropna=False).head(15))
    print(f"Total 'common' countries: {len(common_countries)}")
    print(f"Records labeled as 'Other': {(df['native-country'] == 'Other').sum()}")

# -----------------------------
# 5) Top categories per column
# -----------------------------
print("\n=== Top categories per variable (post-cleaning) ===")
for col in cat_cols:
    vc = df[col].value_counts(dropna=False).head(10)
    print(f"\n{col}:\n{vc}")

try:
    out_path = DL_DIR / "adult_clean.csv"
    df.to_csv(out_path, index=False)
    print(f"\n✅ Clean dataset saved at: {out_path}")
except Exception as e:
    print("\nNote: CSV was not saved because DL_DIR does not exist in this environment. Error:", e)

Original unique values in income: ['<=50K' '>50K']
Unique values after cleaning income: ['<=50K' '>50K']

[workclass] unique values (before): ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked' nan] ...
[workclass] unique values (after): ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked' 'nan'] ...
[workclass] missing values after: 1836

[marital-status] unique values (before): ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed'] ...

[marital-status] unique values (after): ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed'] ...
[marital-status] missing values after: 0

[occupation] unique values (before): ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv'] ...
[occupation] unique values (after): ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv' 'nan'] ...
[occupation] missing values after: 1843

[relationship] unique values (before): ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative'] ...
[relationship] unique values (after): ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative'] ...
[relationship] missing values after: 0

[native-country] unique values (before): ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy'] ...
[native-country] unique values (after): ['United-States' 'Cuba' 'Jamaica' 'India' 'Mexico' 'South' 'Puerto-Rico'
 'Honduras' 'England' 'Canada' 'Germany' 'Iran' 'Philippines' 'Italy'
 'Poland'] ...
[native-country] missing values after: 583

Summary of native-country after grouping:
native-country
United-States    43832
Other             2701
Mexico             951
NaN                583
Philippines        295
nan                274
Germany            206
Name: count, dtype: int64
Total 'common' countries: 5
Records labeled as 'Other': 2701

=== Top categories per variable (post-cleaning) ===

workclass:
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
NaN                  1836
Self-emp-inc         1695
Federal-gov          1432
nan                   963
Without-pay            21
Never-worked           10
Name: count, dtype: int64

marital-status:
marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64

occupation:
occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
NaN                  1843
Name: count, dtype: int64

relationship:
relationship
Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: count, dtype: int64

native-country:
native-country
United-States    43832
Other             2701
Mexico             951
NaN                583
Philippines        295
nan                274
Germany            206
Name: count, dtype: int64

✅ Clean dataset saved at: docs/assets/downloads/adult_clean.csv

fig, axes = plt.subplots(2, 2, figsize=(12, 10))  

# --- Sex ---
df['sex'].value_counts(normalize=True).plot(kind="bar", ax=axes[0,0])
axes[0,0].set_title("Distribution of Sex")
axes[0,0].set_ylabel("Proportion")

# --- Race ---
df['race'].value_counts(normalize=True).plot(kind="bar", ax=axes[0,1])
axes[0,1].set_title("Distribution of Race")
axes[0,1].set_ylabel("Proportion")

# --- Education ---
df['education'].value_counts().plot(kind="barh", ax=axes[1,0])
axes[1,0].set_title("Distribution of Education")
axes[1,0].set_xlabel("Frequency")

# --- Hours per week ---
df['hours-per-week'].hist(bins=30, ax=axes[1,1])
axes[1,1].set_title("Distribution of Hours Worked per Week")
axes[1,1].set_xlabel("Hours")
axes[1,1].set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# --- LIST OF CATEGORICAL VARIABLES TO REVIEW ---
categorical_vars = ["workclass", "marital-status", "occupation", "relationship", "native-country"]

for col in categorical_vars:
    print(f"\n--- {col.upper()} ---")
    print("Unique values:")
    print(df[col].unique()) 
    
    print("\nFrequency (top 10):")
    print(df[col].value_counts(dropna=False).head(10)) 
    
    print("-"*50)

fig, axes = plt.subplots(3, 2, figsize=(14, 12))
axes = axes.flatten()

for i, col in enumerate(categorical_vars):
    df[col].value_counts(normalize=True).head(10).plot(kind="bar", ax=axes[i])
    axes[i].set_title(f"Distribution of {col} (Top 10)")
    axes[i].set_ylabel("Proportion")

fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()

--- WORKCLASS ---
Unique values:
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked' 'nan']

Frequency (top 10):
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
NaN                  1836
Self-emp-inc         1695
Federal-gov          1432
nan                   963
Without-pay            21
Never-worked           10
Name: count, dtype: int64
--------------------------------------------------

--- MARITAL-STATUS ---
Unique values:
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

Frequency (top 10):
marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64
--------------------------------------------------

--- OCCUPATION ---
Unique values:
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' nan
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv' 'nan']

Frequency (top 10):
occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
NaN                  1843
Name: count, dtype: int64
--------------------------------------------------

--- RELATIONSHIP ---
Unique values:

['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']

Frequency (top 10):
relationship
Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: count, dtype: int64
--------------------------------------------------

--- NATIVE-COUNTRY ---
Unique values:
['United-States' 'Other' nan 'Mexico' 'Germany' 'Philippines' 'nan']

Frequency (top 10):
native-country
United-States    43832
Other             2701
Mexico             951
NaN                583
Philippines        295
nan                274
Germany            206
Name: count, dtype: int64
--------------------------------------------------

df

X = df.drop("income", axis=1)
y = df["income"]

# Categorical and numerical variables
categorical = X.select_dtypes(include=["object"]).columns
numeric = X.select_dtypes(exclude=["object"]).columns

# --- PREPROCESSING ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ]
)

# --- PIPELINE ---
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500))
])

# --- SPLIT ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- TRAIN ---
model.fit(X_train, y_train)

# --- EVALUATION ---
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# --- EXPORT ---
joblib.dump(model, "income_model.pkl")
print("✅ Model trained and saved")

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      7414
        >50K       0.74      0.59      0.66      2355

    accuracy                           0.85      9769
   macro avg       0.81      0.76      0.78      9769
weighted avg       0.84      0.85      0.85      9769

✅ Model trained and saved

POS_LABEL = ">50K"

def metrics_by_group(X, y_true, y_pred, sensitive_attribute):
    results = {}
    for group in X[sensitive_attribute].dropna().unique():
        idx = (X[sensitive_attribute] == group).values
        if idx.sum() == 0:
            continue
        acc = accuracy_score(y_true[idx], y_pred[idx])
        prec = precision_score(y_true[idx], y_pred[idx], pos_label=POS_LABEL, zero_division=0)
        rec = recall_score(y_true[idx], y_pred[idx], pos_label=POS_LABEL, zero_division=0)
        f1 = f1_score(y_true[idx], y_pred[idx], pos_label=POS_LABEL, zero_division=0)
        results[group] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}
    return pd.DataFrame(results).T.sort_index()

def rates_and_fairness(X, y_true, y_pred, sensitive_attribute):
    df_tmp = X[[sensitive_attribute]].copy()
    df_tmp["y_true"] = y_true.values
    df_tmp["y_pred"] = y_pred

    rows = []
    for group, dfg in df_tmp.groupby(sensitive_attribute):
        # Selection Rate
        sr = np.mean(dfg["y_pred"] == POS_LABEL)

        # TPR (Recall on positives)
        pos_mask = dfg["y_true"] == POS_LABEL
        tpr = np.mean(dfg.loc[pos_mask, "y_pred"] == POS_LABEL) if pos_mask.any() else np.nan

        # FPR (False Positive Rate)
        neg_mask = dfg["y_true"] != POS_LABEL
        fpr = np.mean(dfg.loc[neg_mask, "y_pred"] == POS_LABEL) if neg_mask.any() else np.nan

        rows.append({"group": group, "selection_rate": sr, "TPR": tpr, "FPR": fpr})

    out = pd.DataFrame(rows).set_index("group").sort_index()

    # Disparate impact vs group with highest selection_rate
    ref_group = out["selection_rate"].idxmax()
    ref_val = out.loc[ref_group, "selection_rate"]
    out["disparate_impact_vs_max"] = out["selection_rate"] / (ref_val if ref_val > 0 else np.nan)

    # Equal Opportunity Difference (range of TPR)
    equal_opp_diff = out["TPR"].max() - out["TPR"].min()

    return out, ref_group, equal_opp_diff

def plot_rates(df_rates, sensitive_attribute, folder="fairness_plots"):
    Path(folder).mkdir(parents=True, exist_ok=True)
    for col in ["selection_rate", "TPR", "FPR"]:
        if col not in df_rates.columns:
            continue
        plt.figure(figsize=(6, 4)) 
        df_rates[col].plot(kind="bar", color="pink", edgecolor="black")
        plt.title(f"{col} by {sensitive_attribute}")
        plt.xlabel(sensitive_attribute)
        plt.ylabel(col)
        plt.tight_layout()
        plt.xticks(rotation=45)
        path = f"{folder}/{col}_by_{sensitive_attribute}.png"
        plt.savefig(path, dpi=150)
        plt.show()

model = joblib.load("income_model.pkl")
y_pred = model.predict(X_test)

# METRICS by gender
attribute = "sex"
metrics_sex = metrics_by_group(X_test, y_test, y_pred, attribute)
print("Metrics by sex:")
display(metrics_sex)

rates_sex, ref_grp_sex, eq_opp_diff_sex = rates_and_fairness(X_test, y_test, y_pred, attribute)
print("\nRates and fairness by sex:")
display(rates_sex)
print(f"Reference group (highest selection_rate): {ref_grp_sex}")
print(f"Equal Opportunity Difference (TPR range): {eq_opp_diff_sex:.4f}")
plot_rates(rates_sex, attribute)

# METRICS by race
attribute = "race"
metrics_race = metrics_by_group(X_test, y_test, y_pred, attribute)
print("\nMetrics by race:")
display(metrics_race)

rates_race, ref_grp_race, eq_opp_diff_race = rates_and_fairness(X_test, y_test, y_pred, attribute)
print("\nRates and fairness by race:")
display(rates_race)
print(f"Reference group (highest selection_rate): {ref_grp_race}")
print(f"Equal Opportunity Difference (TPR range): {eq_opp_diff_race:.4f}")
plot_rates(rates_race, attribute)

Metrics by sex:

Rates and fairness by sex:

Reference group (highest selection_rate): Male
Equal Opportunity Difference (TPR range): 0.1001

Metrics by race:

Rates and fairness by race:

Reference group (highest selection_rate): Asian-Pac-Islander
Equal Opportunity Difference (TPR range): 0.4128

# ==== SHAP ====
import shap, numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy import sparse
from sklearn.pipeline import Pipeline

# 1) Preparation
prep = model.named_steps["preprocessor"]
clf  = model.named_steps["classifier"]

def feat_names(prep, orig_cols):
    names = []
    for _, trf, cols in prep.transformers_:
        if trf == "drop":
            continue
        if isinstance(trf, Pipeline):
            trf = trf[-1]
        if hasattr(trf, "get_feature_names_out"):
            names += trf.get_feature_names_out(cols).tolist()
        else:
            names += list(cols)
    return names

Xtr = prep.transform(X_train)
Xte = prep.transform(X_test)
feature_names = feat_names(prep, X_train.columns)

def to_dense(X):
    return X.toarray() if sparse.issparse(X) else np.asarray(X)

# Small background
rng = np.random.default_rng(42)
n = Xtr.shape[0]
row_bg = rng.choice(n, size=min(200, n), replace=False)
bg_dense  = to_dense(Xtr[row_bg])
Xte_dense = to_dense(Xte)

# Linear explainer with masker
masker = shap.maskers.Independent(bg_dense)
explainer = shap.Explainer(clf, masker, algorithm="linear")
shap_expl = explainer(Xte_dense)

shap_expl.feature_names = feature_names
shap_expl.data = Xte_dense

proba   = model.predict_proba(X_test)[:, 1]
y_pred  = model.predict(X_test)
y_true  = y_test.values
errs    = np.where(y_pred != y_true)[0]
i_local = int(errs[0]) if len(errs) else int(np.argmin(np.abs(proba - 0.5)))

# ======= SHAP =======
import numpy as np, pandas as pd, matplotlib.pyplot as plt, textwrap
from scipy.special import expit

plt.rcParams.update({"figure.dpi": 170, "font.size": 10})
def wrap_labels(labels, width=22):  
    return ["\n".join(textwrap.wrap(str(s), width=width)) for s in labels]

def clean_name(s: str):
    s = s.replace("native-country_", "country:")
    s = s.replace("marital-status_", "marital:")
    s = s.replace("education-num", "edu_num")
    s = s.replace("capital-gain", "cap_gain")
    s = s.replace("capital-loss", "cap_loss")
    s = s.replace("hours-per-week", "hrs_week")
    return s

# ---------- base data for ordering/selecting ----------
vals = shap_expl.values
base_logit = float(np.mean(shap_expl.base_values)) if shap_expl.base_values is not None else 0.0
base_prob  = float(expit(base_logit))

mean_abs    = np.abs(vals).mean(axis=0)
mean_signed = vals.mean(axis=0)
order_abs   = np.argsort(-mean_abs)

TOPN = 15
idx_top = order_abs[:TOPN]
feat_top_names = [clean_name(feature_names[i]) for i in idx_top]

# Signed bar
df_signed = pd.DataFrame({
    "feature": feat_top_names,
    "mean_SHAP": mean_signed[idx_top],
    "mean_abs_SHAP": mean_abs[idx_top]
}).sort_values("mean_SHAP")

plt.figure(figsize=(10, 6))
colors = df_signed["mean_SHAP"].apply(lambda x: "#1f77b4" if x < 0 else "#d62728")
plt.barh(wrap_labels(df_signed["feature"]), df_signed["mean_SHAP"], color=colors)
plt.axvline(0, color="k", lw=1)
plt.title(f"01. Mean SHAP with sign (Top {TOPN}) – log-odds")
plt.xlabel("Mean SHAP (negative ↓  / positive ↑)")
plt.tight_layout()
plt.savefig(FIG_DIR / "01_shap_bar_signed.png", bbox_inches="tight")
plt.close()

# Beeswarm 
plt.figure(figsize=(10, 6))
shap.plots.beeswarm(shap_expl[:, idx_top], max_display=TOPN, show=False)
plt.title(f"02. SHAP Beeswarm – distribution per individual (Top {TOPN})")
plt.tight_layout()
plt.savefig(FIG_DIR / "02_shap_beeswarm.png", bbox_inches="tight")
plt.close()

# ===================== 02. Trajectories =====================
# Most informative observations by total impact
impact = np.abs(vals).sum(axis=1)
row_idx = np.argsort(-impact)[:min(400, vals.shape[0])]
top12 = idx_top[:12]
plt.figure(figsize=(11, 5))
shap.decision_plot(
    base_logit,
    vals[row_idx][:, top12],
    feature_names=wrap_labels([clean_name(feature_names[i]) for i in top12], 18),
    ignore_warnings=True,
    show=False
)
plt.title("03. Decision plot – accumulation of contributions (subsample)")
plt.tight_layout()
plt.savefig(FIG_DIR / "03_shap_decision_plot.png", bbox_inches="tight")
plt.close()

# ============== 03. Local explanation ==========
x_local_raw = X_test.iloc[i_local]        
x_local_tr  = Xte_dense[i_local]          
fx_logit    = base_logit + float(vals[i_local].sum())
fx_prob     = float(expit(fx_logit))
pred_prob   = float(proba[i_local])
pred_label  = y_pred[i_local]
true_label  = y_test.iloc[i_local]

# Waterfall
plt.figure(figsize=(10, 5))
shap.plots.waterfall(shap_expl[i_local], show=False)
plt.title(f"04. Waterfall – record idx={i_local} (pred={pred_label}, true={true_label})")
plt.tight_layout()
plt.savefig(FIG_DIR / "04_shap_waterfall_local.png", bbox_inches="tight")
plt.close()

# Table of top contributions for the record
K = 10
order_local = np.argsort(-np.abs(vals[i_local]))[:K]
df_local = pd.DataFrame({
    "feature": [feature_names[j] for j in order_local],
    "transformed_value": [float(x_local_tr[j]) for j in order_local],
    "shap": [float(vals[i_local, j]) for j in order_local],
    "direction": ["↑" if vals[i_local, j] > 0 else "↓" for j in order_local]
})
df_local.to_csv(DL_DIR / "local_top_contributions.csv", index=False)

with open(DL_DIR / "local_summary.txt", "w", encoding="utf-8") as f:
    f.write(
        "Explained local record\n"
        f"- index in X_test: {i_local}\n"
        f"- true label: {true_label}\n"
        f"- prediction: {pred_label}  |  prob={pred_prob:.3%}\n"
        f"- base_value (log-odds): {base_logit:.3f}  |  base prob={base_prob:.3%}\n"
        f"- f(x) (log-odds): {fx_logit:.3f}  |  prob(sigmoid): {fx_prob:.3%}\n"
        f"- Top-{K} contributions -> see CSV: local_top_contributions.csv\n"
    )

# ===================== 04. Marginal effect =====================
for f in ("age", "hours-per-week", "education-num"):
    if f in feature_names:
        j = feature_names.index(f)
        plt.figure(figsize=(8, 5))
        shap.plots.scatter(shap_expl[:, j], show=False)
        plt.title(f"05. Dependence – {f}")
        plt.xlabel(f"Transformed value of {f}")
        plt.ylabel("SHAP (log-odds)")
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"05_dep_{f}.png", bbox_inches="tight")
        plt.close()

# ==================== 05. Fairness (by sex and by race) =====================
def abs_by_group(expl, feat_names, groups):
    df = pd.DataFrame(np.abs(expl.values), columns=feat_names)
    df["__g__"] = groups.values
    return df.groupby("__g__")[feat_names].mean().T

def margin_by_group(expl, groups):
    margins = expl.values.sum(axis=1)
    return pd.DataFrame({"g": groups.values, "margin": margins}).groupby("g")["margin"].agg(["mean","median","count"])

for s in ("sex", "race"):
    if s not in X_test.columns: 
        continue
    g = X_test[s]
    tab_abs = abs_by_group(shap_expl, feature_names, g)
    tab_abs.to_csv(DL_DIR / f"{s}_abs_by_feature.csv")
    # Top-15 by mean across groups
    mean_across = tab_abs.mean(axis=1).sort_values(ascending=False).head(15)
    plt.figure(figsize=(10, 6))
    plt.barh(wrap_labels([clean_name(i) for i in mean_across.index]), mean_across.values)
    plt.title(f"06. Mean |SHAP| by {s} – Top 15 features")
    plt.xlabel("Mean |SHAP|")
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"06_abs_{s}_top15.png", bbox_inches="tight")
    plt.close()

    tab_m = margin_by_group(shap_expl, g)
    tab_m.to_csv(DL_DIR / f"{s}_margin_summary.csv")
    plt.figure(figsize=(7, 4))
    plt.barh(tab_m.index.astype(str), tab_m["mean"].values)
    plt.title(f"07. Mean SHAP margin by {s} (higher ↑ → stronger push to >50K)")
    plt.xlabel("Sum of SHAP (log-odds)")
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"07_margin_{s}.png", bbox_inches="tight")
    plt.close()

# ===================== 06. Interactive ===========================
from pathlib import Path
Path(DL_DIR).mkdir(parents=True, exist_ok=True)

# Local force plot of the same record as waterfall
shap.save_html(
    str(DL_DIR / "08_force_local.html"),
    shap.plots.force(shap_expl[i_local], feature_names=feature_names, features=Xte_dense[i_local])
)
# Global force plot
take = min(500, vals.shape[0])
idx_rows = np.argsort(-impact)[:take]
shap.save_html(
    str(DL_DIR / "09_force_global.html"),
    shap.plots.force(shap_expl[idx_rows], feature_names=feature_names, features=Xte_dense[idx_rows])
)

print("✅ Presentation generated.")
print("   Suggested order of figures in PDF:")
print("   01_shap_bar_signed.png  → importance + direction (global)")
print("   02_shap_beeswarm.png    → distribution per individual (global)")
print("   03_shap_decision_plot.png")
print("   04_shap_waterfall_local.png  + DL/local_summary.txt + local_top_contributions.csv")
print("   05_dep_*.png")
print("   06_abs_*_top15.png  & 07_margin_* .png")
print("   Interactives: 08_force_local.html, 09_force_global.html")

✅ Presentation generated.
   Suggested order of figures in PDF:
   01_shap_bar_signed.png  → importance + direction (global)
   02_shap_beeswarm.png    → distribution per individual (global)
   03_shap_decision_plot.png
   04_shap_waterfall_local.png  + DL/local_summary.txt + local_top_contributions.csv
   05_dep_*.png
   06_abs_*_top15.png  & 07_margin_* .png
   Interactives: 08_force_local.html, 09_force_global.html

# ===========================================
# Model Comparator (baseline vs debiased)
# ===========================================
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def _overall_metrics(y_true, y_pred, pos_label=">50K"):
    """Global metrics summarized in a dict."""
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_pos": precision_score(y_true, y_pred, pos_label=pos_label, zero_division=0),
        "recall_pos": recall_score(y_true, y_pred, pos_label=pos_label, zero_division=0),
        "f1_pos": f1_score(y_true, y_pred, pos_label=pos_label, zero_division=0),
    }

def _bar_side_by_side(ax, left_vals, right_vals, labels, title, ylabel, legend=("Baseline","Debiased")):
    """Side-by-side bar chart for quick comparison."""
    x = np.arange(len(labels))
    w = 0.38
    ax.bar(x - w/2, left_vals, width=w, label=legend[0])
    ax.bar(x + w/2, right_vals, width=w, label=legend[1])
    ax.set_xticks(x, labels, rotation=45, ha="right")
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

def compare_models(
    baseline_model,
    debiased_model,
    X_test,
    y_test,
    sensitive_attributes=("sex","race"),
    pos_label=">50K",
    fig_dir=FIG_DIR,
    dl_dir=DL_DIR,
    tag_a="baseline",
    tag_b="debiased"
):
    """
    Compare baseline_model vs debiased_model in:
      - global metrics
      - group metrics (accuracy/precision/recall/f1)
      - fairness rates (selection_rate, TPR, FPR, disparate impact, equal opportunity diff)
    """
    cmp_name = f"compare_{tag_a}_vs_{tag_b}"
    out_fig = Path(fig_dir) / cmp_name
    out_dl  = Path(dl_dir) / cmp_name
    out_fig.mkdir(parents=True, exist_ok=True)
    out_dl.mkdir(parents=True, exist_ok=True)

    # ------------------ Predictions ------------------
    y_pred_a = baseline_model.predict(X_test)
    y_pred_b = debiased_model.predict(X_test)

    # ------------------ Global metrics ------------------
    glob_a = _overall_metrics(y_test, y_pred_a, pos_label)
    glob_b = _overall_metrics(y_test, y_pred_b, pos_label)
    df_global = pd.DataFrame([glob_a, glob_b], index=[tag_a, tag_b])
    df_global.to_csv(out_dl / "global_metrics.csv")
    print("== Global metrics ==")
    display(df_global)

    # Confusion matrix
    for tag, yhat in [(tag_a, y_pred_a), (tag_b, y_pred_b)]:
        cm = confusion_matrix(y_test, yhat, labels=[pos_label, "<=50K"])
        df_cm = pd.DataFrame(
            cm,
            index=[f"true_{pos_label}", "true_<=50K"],
            columns=[f"pred_{pos_label}", "pred_<=50K"]
        )
        df_cm.to_csv(out_dl / f"confusion_matrix_{tag}.csv")
        plt.figure(figsize=(4.5, 4))
        plt.imshow(df_cm, cmap="Blues")
        plt.title(f"Confusion Matrix – {tag}")
        plt.xticks(range(2), df_cm.columns, rotation=15)
        plt.yticks(range(2), df_cm.index)
        for (i,j), v in np.ndenumerate(cm):
            plt.text(j, i, int(v), ha="center", va="center")
        plt.tight_layout()
        plt.savefig(out_fig / f"confusion_matrix_{tag}.png", dpi=150)
        plt.close()

    # ------------------ By sensitive attribute ------------------
    for attr in sensitive_attributes:
        if attr not in X_test.columns:
            print(f"(skip) '{attr}' not found in X_test.columns")
            continue

        # Group metrics
        m_a = metrics_by_group(X_test, y_test, y_pred_a, attr)
        m_b = metrics_by_group(X_test, y_test, y_pred_b, attr)
        m_a.to_csv(out_dl / f"{attr}_metrics_{tag_a}.csv")
        m_b.to_csv(out_dl / f"{attr}_metrics_{tag_b}.csv")

        print(f"\n== {attr.upper()} – metrics by group ==")
        print(f"[{tag_a}]"); display(m_a)
        print(f"[{tag_b}]"); display(m_b)

        # Fairness rates
        r_a, ref_a, eq_a = rates_and_fairness(X_test, y_test, y_pred_a, attr)
        r_b, ref_b, eq_b = rates_and_fairness(X_test, y_test, y_pred_b, attr)
        r_a.to_csv(out_dl / f"{attr}_rates_{tag_a}.csv")
        r_b.to_csv(out_dl / f"{attr}_rates_{tag_b}.csv")

        print(f"Rates & fairness – {attr} [{tag_a}] ref={ref_a} | EOD={eq_a:.4f}")
        display(r_a)
        print(f"Rates & fairness – {attr} [{tag_b}] ref={ref_b} | EOD={eq_b:.4f}")
        display(r_b)

        # --------- Side-by-side bar plots: selection_rate, TPR, FPR ---------
        common_index = sorted(set(r_a.index).intersection(set(r_b.index)))
        if len(common_index) == 0:
            continue

        def _vals(df, col):
            return df.loc[common_index, col].values if col in df.columns else np.array([np.nan]*len(common_index))

        labels = [str(x) for x in common_index]
        for col, nice in [("selection_rate","Selection Rate"),
                          ("TPR","TPR (Recall on positives)"),
                          ("FPR","FPR (False Positive Rate)")]:
            fig, ax = plt.subplots(figsize=(8,5))
            _bar_side_by_side(
                ax,
                _vals(r_a, col),
                _vals(r_b, col),
                labels,
                title=f"{nice} by {attr}",
                ylabel=col,
                legend=(tag_a, tag_b)
            )
            plt.tight_layout()
            plt.savefig(out_fig / f"{attr}_{col}_compare.png", dpi=150)
            plt.close()

        # --------- Fairness summary table ---------
        sum_a = r_a[["selection_rate","TPR","FPR","disparate_impact_vs_max"]].copy()
        sum_b = r_b[["selection_rate","TPR","FPR","disparate_impact_vs_max"]].copy()
        sum_a.columns = [f"{c}_{tag_a}" for c in sum_a.columns]
        sum_b.columns = [f"{c}_{tag_b}" for c in sum_b.columns]
        df_sum = pd.concat([sum_a, sum_b], axis=1).loc[common_index]
        df_sum["equal_opp_diff_"+tag_a] = eq_a
        df_sum["equal_opp_diff_"+tag_b] = eq_b
        df_sum.to_csv(out_dl / f"{attr}_fairness_summary_compare.csv")

    print("\n✅ Comparison generated.")
    print(f"Figures in: {out_fig}")
    print(f"Tables/CSVs in: {out_dl}")
    return {
        "global_metrics": df_global,
        "paths": {"fig_dir": str(out_fig), "dl_dir": str(out_dl)}
    }

# =========================================================
# Bias mitigation via Reweighing (by sensitive attribute) + retrain model
# =========================================================
import numpy as np
import pandas as pd
from sklearn.base import clone

POS_LABEL = ">50K"

def compute_reweighing_weights(X: pd.DataFrame, y: pd.Series, sensitive_attr: str, pos_label: str = POS_LABEL):
    """
    Implements Reweighing (Kamiran & Calders).
    Weight for each (A=a, Y=y) is:  w(a,y) = P(A=a)*P(Y=y) / P(A=a, Y=y)
    Returns a numpy array of sample weights aligned with X.index / y.index.
    """
    df = pd.DataFrame({sensitive_attr: X[sensitive_attr].values, "y": y.values})
    # probabilities
    pA  = df[sensitive_attr].value_counts(normalize=True)              # P(A=a)
    pY  = df["y"].value_counts(normalize=True)                         # P(Y=y)
    pAY = df.value_counts([sensitive_attr, "y"], normalize=True)       # P(A=a, Y=y)

    # safe lookup
    groups = df[sensitive_attr].dropna().unique()
    labels = df["y"].dropna().unique()
    eps = 1e-9

    weights = []
    for a, yv in zip(df[sensitive_attr].values, df["y"].values):
        pa  = pA.get(a, 0.0)
        py  = pY.get(yv, 0.0)
        pay = pAY.get((a, yv), 0.0)
        w = (pa * py) / max(pay, eps) if pa > 0 and py > 0 else 1.0
        weights.append(w)

    return np.asarray(weights, dtype=float)

def train_reweighed_logreg(
    base_pipeline,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    sensitive_attr: str = "sex",
    pos_label: str = POS_LABEL,
    save_path: str = "income_model_debiased_reweigh_sex.pkl"
):
    # 1) compute sample weights using train
    sample_w = compute_reweighing_weights(X_train, y_train, sensitive_attr, pos_label)

    # 2) clone and fit
    debiased_model = clone(base_pipeline)
    # Pipeline forwards sample_weight to the last step if argument is named 'classifier__sample_weight'
    debiased_model.fit(X_train, y_train, classifier__sample_weight=sample_w)

    try:
        joblib.dump(debiased_model, save_path)
        print(f"✅ Debiased model (reweigh by '{sensitive_attr}') saved to: {save_path}")
    except Exception as e:
        print("Note: model not saved due to environment error:", e)

    # sanity check
    print(f"Reweighing performed on sensitive attribute: '{sensitive_attr}'.")
    return debiased_model

# =========================================================
# Train debiased model (reweighing by race) and compare
# =========================================================
model_debiased_race = train_reweighed_logreg(
    base_pipeline=model,
    X_train=X_train,
    y_train=y_train,
    sensitive_attr="race",
    save_path="income_model_debiased_reweigh_race.pkl"
)

results_race = compare_models(
    baseline_model=model,
    debiased_model=model_debiased_race,
    X_test=X_test,
    y_test=y_test,
    sensitive_attributes=("race",), 
    pos_label=POS_LABEL,
    tag_a="baseline",
    tag_b="debiased_reweigh_race"
)

✅ Debiased model (reweigh by 'race') saved to: income_model_debiased_reweigh_race.pkl
Reweighing performed on sensitive attribute: 'race'.

== Global metrics ==

== RACE – metrics by group ==
[baseline]

[debiased_reweigh_race]

Rates & fairness – race [baseline] ref=Asian-Pac-Islander | EOD=0.4128

Rates & fairness – race [debiased_reweigh_race] ref=Asian-Pac-Islander | EOD=0.2500

✅ Comparison generated.
Figures in: docs/assets/figures/compare_baseline_vs_debiased_reweigh_race
Tables/CSVs in: docs/assets/downloads/compare_baseline_vs_debiased_reweigh_race

# ======================
# 1) Baseline vs Reweighing por SEX
# ======================
model_debiased_sex = train_reweighed_logreg(
    base_pipeline=model,
    X_train=X_train,
    y_train=y_train,
    sensitive_attr="sex",
    save_path="income_model_debiased_reweigh_sex.pkl"
)

results_sex = compare_models(
    baseline_model=model,
    debiased_model=model_debiased_sex,
    X_test=X_test,
    y_test=y_test,
    sensitive_attributes=("sex",),
    pos_label=POS_LABEL,
    tag_a="baseline",
    tag_b="debiased_reweigh_sex"
)

✅ Debiased model (reweigh by 'sex') saved to: income_model_debiased_reweigh_sex.pkl
Reweighing performed on sensitive attribute: 'sex'.

== Global metrics ==

== SEX – metrics by group ==
[baseline]

[debiased_reweigh_sex]

Rates & fairness – sex [baseline] ref=Male | EOD=0.1001

Rates & fairness – sex [debiased_reweigh_sex] ref=Male | EOD=0.1711

✅ Comparison generated.
Figures in: docs/assets/figures/compare_baseline_vs_debiased_reweigh_sex
Tables/CSVs in: docs/assets/downloads/compare_baseline_vs_debiased_reweigh_sex

	accuracy	precision	recall	f1
Female	0.92824	0.785408	0.501370	0.612040
Male	0.81380	0.738433	0.601508	0.662974

	selection_rate	TPR	FPR	disparate_impact_vs_max
group
Female	0.072069	0.501370	0.017434	0.290589
Male	0.248011	0.601508	0.093269	1.000000

	accuracy	precision	recall	f1
Amer-Indian-Eskimo	0.927083	0.600000	0.375000	0.461538
Asian-Pac-Islander	0.833333	0.703704	0.662791	0.682635
Black	0.904311	0.702703	0.429752	0.533333
Other	0.880597	0.500000	0.250000	0.333333
White	0.845268	0.749112	0.593809	0.662480

	selection_rate	TPR	FPR	disparate_impact_vs_max
group
Amer-Indian-Eskimo	0.052083	0.375000	0.022727	0.204475
Asian-Pac-Islander	0.254717	0.662791	0.103448	1.000000
Black	0.077813	0.429752	0.026506	0.305487
Other	0.059701	0.250000	0.033898	0.234384
White	0.202711	0.593809	0.068332	0.795828

	accuracy	precision_pos	recall_pos	f1_pos
baseline	0.851674	0.744337	0.585987	0.655738
debiased_reweigh_race	0.849422	0.740479	0.577919	0.649177

Site sections

Universidad del Valle de Guatemala - UVG

Faculty of Engineering - Computer Science

Exploratory Analysis¶

Exploratory Analysis and Data Cleaning¶

Identification of Possible Biases¶

Baseline Model Development¶

Evaluation of Bias in the Model¶

SHAP¶

Bias Mitigation Proposal: Reweighing by Race¶

Why Reweighing?¶

Expected Effect¶

Why Also Evaluate Sex?¶

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	income
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	0	40	Cuba	<=50K
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
48837	39	Private	215419	Bachelors	13	Divorced	Prof-specialty	Not-in-family	White	Female	0	0	36	United-States	<=50K.
48838	64	NaN	321403	HS-grad	9	Widowed	NaN	Other-relative	Black	Male	0	0	40	United-States	<=50K.
48839	38	Private	374983	Bachelors	13	Married-civ-spouse	Prof-specialty	Husband	White	Male	0	0	50	United-States	<=50K.
48840	44	Private	83891	Bachelors	13	Divorced	Adm-clerical	Own-child	Asian-Pac-Islander	Male	5455	0	40	United-States	<=50K.
48841	35	Self-emp-inc	182148	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	60	United-States	>50K.

	accuracy	precision	recall	f1
Amer-Indian-Eskimo	0.895833	0.400000	0.500000	0.444444
Asian-Pac-Islander	0.830189	0.700000	0.651163	0.674699
Black	0.898002	0.593750	0.628099	0.610442
Other	0.925373	0.666667	0.750000	0.705882
White	0.843469	0.756673	0.571764	0.651349

	selection_rate	TPR	FPR	disparate_impact_vs_max
group
Amer-Indian-Eskimo	0.104167	0.500000	0.068182	0.414062
Asian-Pac-Islander	0.251572	0.651163	0.103448	1.000000
Black	0.134595	0.628099	0.062651	0.535016
Other	0.134328	0.750000	0.050847	0.533955
White	0.193235	0.571764	0.063175	0.768109

	accuracy	precision	recall	f1
Female	0.918961	0.629073	0.687671	0.657068
Male	0.808905	0.781749	0.516583	0.622088

	selection_rate	TPR	FPR	disparate_impact_vs_max
group
Female	0.123415	0.687671	0.051604	0.613414
Male	0.201193	0.516583	0.063132	1.000000