# iBrokeTheCode's picture
# chore: Save LightGBM model
# c742ac4
import marimo

# Version of marimo recorded in this notebook file.
__generated_with = "0.14.16"
# Application object; each cell below registers itself through @app.cell.
app = marimo.App()
@app.cell
def _():
    # Expose the marimo API as `mo` to the cells that request it.
    import marimo as mo
    return (mo,)
@app.cell
def _(mo):
    # Notebook title, wrapped in a centering container.
    _title = mo.md("# Home Credit Default Risk Prediction")
    mo.center(_title)
    return
@app.cell
def _():
    # Shared dependencies for the rest of the notebook.
    #
    # Only names that other cells actually consume are imported. The model
    # training snippets in section 3 are rendered as static markdown and are
    # never executed, so the scikit-learn / LightGBM imports that used to live
    # here were dead code that only slowed down notebook start-up.
    import pandas as pd

    from src.plots import (
        plot_credit_amounts,
        plot_education_levels,
        plot_family_status,
        plot_income_type,
        plot_occupation,
        plot_target_distribution,
    )
    from src.preprocessing import preprocess_data_pipeline
    from src.utils import get_dataset, get_features_target, get_train_test_sets

    return (
        get_dataset,
        get_features_target,
        get_train_test_sets,
        pd,
        plot_credit_amounts,
        plot_education_levels,
        plot_family_status,
        plot_income_type,
        plot_occupation,
        plot_target_distribution,
        preprocess_data_pipeline,
    )
@app.cell
def _(get_dataset, get_features_target):
    # Fetch the dataset through the project helper, then split it into the
    # feature frame `X` and the target `y` used by the downstream cells.
    df = get_dataset()
    _features_and_target = get_features_target(df)
    X, y = _features_and_target
    return X, df, y
@app.cell
def _(mo):
    # Section 1 heading.
    _heading = "## 1. Exploratory Data Analysis"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Info callout pointing readers to the Jupyter notebook version.
    # The emoji were previously mis-encoded (UTF-8 bytes shown as Greek/Latin
    # glyphs); they are restored to the intended characters here.
    mo.callout(
        kind="info",
        value=mo.md(
            """💡 **Want a step-by-step walkthrough instead?**
    Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
        ),
    )
    return
@app.cell
def _(mo):
    # Subsection 1.1 heading.
    _heading = "### 1.1 Dataset Information"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Label for the dataset-shape summary that follows.
    _label = "**a. Shape of the train and test datasets**"
    mo.md(_label)
    return
@app.cell
def _(X_test, X_train, df):
    # Row counts of the two splits and the column count of the full frame,
    # displayed as a tuple of three formatted strings.
    _n_train, _n_test = X_train.shape[0], X_test.shape[0]
    _n_columns = df.shape[1]
    (
        "Train dataset samples: {}".format(_n_train),
        "Test dataset samples: {}".format(_n_test),
        "Number of columns: {}".format(_n_columns),
    )
    return
@app.cell
def _(mo):
    # Label for the feature listing.
    _label = "**b. Dataset features**"
    mo.md(_label)
    return
@app.cell
def _(X):
    # Display the column index of the feature frame.
    _feature_names = X.columns
    _feature_names
    return
@app.cell
def _(mo):
    # Label for the sample preview.
    _label = "**c. Sample from dataset**"
    mo.md(_label)
    return
@app.cell
def _(X):
    # Preview the first five rows, transposed so each feature is a row.
    _preview = X.head(5).T
    # After the transpose the column labels are the original row labels;
    # cast them to str to avoid the integer-name warning.
    _preview.columns = list(map(str, _preview.columns))
    # Stringify every value to avoid numeric conversion issues in the viewer.
    _preview.astype(str)
    return
@app.cell
def _(mo):
    # Label for the target distribution output.
    _label = "**d. Target variable Distribution**"
    mo.md(_label)
    return
@app.cell
def _(df, plot_target_distribution):
    # The helper returns a (table, plot) pair: show the table here and hand
    # the plot to the next cell.
    _table, target_plot = plot_target_distribution(df=df)
    _table
    return (target_plot,)
@app.cell
def _(target_plot):
    # Render the target distribution plot produced by the previous cell.
    _figure = target_plot
    _figure
    return
@app.cell
def _(mo):
    # Label for the dtype summary.
    _label = "**e. Number of columns of each data type**"
    mo.md(_label)
    return
@app.cell
def _(X):
    # Count how many columns share each dtype, most frequent dtype first.
    _dtype_counts = X.dtypes.value_counts()
    _dtype_counts.sort_values(ascending=False)
    return
@app.cell
def _(X):
    # Number of distinct values in each object-typed column, largest first.
    _object_columns = X.select_dtypes(include=["object"])
    _unique_counts = _object_columns.nunique()
    _unique_counts.sort_values(ascending=False)
    return
@app.cell
def _(mo):
    # Label for the missing-data table.
    _label = "**f. Missing data**"
    mo.md(_label)
    return
@app.cell
def _(X, pd):
    # Per-column missing-value count (descending) together with the share of
    # rows it represents, rounded to two decimals.
    _n_rows = X.shape[0]
    _missing = X.isna().sum().sort_values(ascending=False)
    pd.DataFrame(
        data={
            "Count": _missing,
            "percentage": (_missing / _n_rows * 100).round(2),
        }
    )
    return
@app.cell
def _(mo):
    # Subsection 1.2 heading.
    _heading = "### 1.2 Distribution of Variables"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Pointer to the plotting source code.
    _note = r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."""
    mo.md(_note)
    return
@app.cell
def _(mo):
    # Label for the credit amounts plot.
    _label = "**a. Credit Amounts**"
    mo.md(_label)
    return
@app.cell
def _(X, plot_credit_amounts):
    # Display whatever the credit-amounts helper returns for the feature frame.
    _credit_view = plot_credit_amounts(df=X)
    _credit_view
    return
@app.cell
def _(mo):
    # Label for the education level output.
    _label = "**b. Education Level of Credit Applicants**"
    mo.md(_label)
    return
@app.cell
def _(X, plot_education_levels):
    # Show the table half of the (table, plot) pair; the plot is rendered by
    # the next cell.
    _table, education_plot = plot_education_levels(df=X)
    _table
    return (education_plot,)
@app.cell
def _(education_plot):
    # Render the education level plot.
    _figure = education_plot
    _figure
    return
@app.cell
def _(mo):
    # Label for the occupation output (heading spelling corrected from
    # "Ocupation").
    mo.md("""**c. Occupation of Credit Applicants**""")
    return
@app.cell
def _(X, plot_occupation):
    # Show the table half of the (table, plot) pair; the plot is rendered by
    # the next cell.
    _table, occupation_plot = plot_occupation(df=X)
    _table
    return (occupation_plot,)
@app.cell
def _(occupation_plot):
    # Render the occupation plot.
    _figure = occupation_plot
    _figure
    return
@app.cell
def _(mo):
    # Label for the family status output.
    _label = "**d. Family Status of Applicants**"
    mo.md(_label)
    return
@app.cell
def _(X, plot_family_status):
    # Show the table half of the (table, plot) pair; the plot is rendered by
    # the next cell.
    _table, family_status_plot = plot_family_status(df=X)
    _table
    return (family_status_plot,)
@app.cell
def _(family_status_plot):
    # Render the family status plot.
    _figure = family_status_plot
    _figure
    return
@app.cell
def _(mo):
    # Label for the income type plot.
    _label = "**e. Income Type of Applicants by Target Variable**"
    mo.md(_label)
    return
@app.cell
def _(df, plot_income_type):
    # Uses the full frame `df` rather than `X`, unlike the other plot cells.
    _income_view = plot_income_type(df=df)
    _income_view
    return
@app.cell
def _(mo):
    # Section 2 heading.
    _heading = "## 2. Preprocessing"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Label for the train/test split step.
    _label = "**a. Separate Train and Test Datasets**"
    mo.md(_label)
    return
@app.cell
def _(X, get_train_test_sets, y):
    # Split features and target with the project helper and display the shape
    # of each of the four resulting pieces. Only the feature splits are
    # consumed by other cells.
    X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
    _pieces = (X_train, y_train, X_test, y_test)
    tuple(_piece.shape for _piece in _pieces)
    return X_test, X_train
@app.cell
def _(mo):
    # Label for the preprocessing step.
    _label = "**b. Preprocess Data**"
    mo.md(_label)
    return
@app.cell
def _(mo):
    # Summary of the preprocessing steps plus a link to their implementation.
    # Blank lines separate the intro, the list and the closing paragraph so the
    # markdown renders as a real list, and the two encoding rules are nested
    # under the bullet they belong to.
    mo.md(
        r"""
    This preprocessing performs:

    - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
    - Encode string categorical features (`dtype object`).
        - If the feature has 2 categories, Binary Encoding is applied.
        - One Hot Encoding for more than 2 categories.
    - Impute values for all columns with missing data (using median as imputing value).
    - Feature scaling with Min-Max scaler

    Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
    """
    )
    return
@app.cell
def _(X_test, X_train, preprocess_data_pipeline):
    # Run the project preprocessing pipeline on both splits and display the
    # shapes of the two results.
    _processed = preprocess_data_pipeline(train_df=X_train, test_df=X_test)
    train_data, test_data = _processed
    (train_data.shape, test_data.shape)
    return
@app.cell
def _(mo):
    # Section 3 heading.
    _heading = "## 3. Training Models"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Intro sentence for the training section (grammar corrected:
    # "At this points" / "features sets").
    mo.md(
        r"""At this point, we will work with `train_data` and `test_data` as feature sets; also `y_train` and `y_test` as target sets."""
    )
    return
@app.cell
def _(mo):
    # Subsection 3.1 heading.
    _heading = r"### 3.1 Logistic Regression"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Info callout explaining the `C` hyperparameter. The arrows were
    # mis-encoded and are restored to "→"; a blank line before the bullets
    # makes them render as a list.
    mo.callout(
        mo.md("""
    In Logistic Regression, C is the inverse of regularization strength:

    - **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
    - **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
    """),
        kind="info",
    )
    return
@app.cell
def _(mo):
    # Static (non-executed) listing of the Logistic Regression training code.
    # Fixes: mis-encoded emoji restored, "predicton" typo corrected, and the
    # snippet's indentation / paragraph breaks restored so the fenced block
    # renders as readable Python.
    mo.md(
        r"""
    We trained our Logistic Regression model using the following code:

    ```py
    # 📌 Logistic Regression
    log_reg = LogisticRegression(C=0.0001)
    log_reg.fit(train_data, y_train)

    # Train data prediction (class 1)
    lr_train_pred = log_reg.predict_proba(train_data)[:, 1]

    # Test data prediction (class 1)
    lr_test_pred = log_reg.predict_proba(test_data)[:, 1]

    # Get the ROC AUC Score on train and test datasets
    log_reg_scores = {
        "train_score": roc_auc_score(y_train, lr_train_pred),
        "test_score": roc_auc_score(y_test, lr_test_pred),
    }
    log_reg_scores
    ```

    📈 The ROC AUC scores obtained:
    """
    )
    return
@app.cell
def _():
    # Hard-coded ROC AUC scores displayed for the Logistic Regression model.
    dict(
        train_score=0.6868418961663535,
        test_score=0.6854973003347028,
    )
    return
@app.cell
def _(mo):
    # Subsection 3.2 heading.
    _heading = r"### 3.2 Random Forest Classifier"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Static (non-executed) listing of the Random Forest training code.
    # Mis-encoded emoji are restored and the snippet's indentation and
    # paragraph breaks are put back so the fenced block renders correctly.
    mo.md(
        r"""
    We trained our Random Forest Classifier model using the following code:

    ```py
    # 📌 Random Forest Classifier
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    rf.fit(train_data, y_train)

    rf_train_pred = rf.predict_proba(train_data)[:, 1]
    rf_test_pred = rf.predict_proba(test_data)[:, 1]

    rf_scores = {
        "train_score": roc_auc_score(y_train, rf_train_pred),
        "test_score": roc_auc_score(y_test, rf_test_pred),
    }
    rf_scores
    ```

    📈 The ROC AUC scores obtained:
    """
    )
    return
@app.cell
def _():
    # Hard-coded ROC AUC scores displayed for the base Random Forest model.
    dict(train_score=1.0, test_score=0.7066811557903828)
    return
@app.cell
def _(mo):
    # Subsection 3.3 heading. The stray period after "3.3" is dropped so the
    # numbering matches 3.1, 3.2 and 3.4, and "Cross Validations" is corrected
    # to "Cross-Validation".
    mo.md(r"""### 3.3 Randomized Search with Cross-Validation""")
    return
@app.cell
def _(mo):
    # Static (non-executed) listing of the RandomizedSearchCV training code.
    # Mis-encoded emoji are restored and the snippet's indentation and
    # paragraph breaks are put back so the fenced block renders correctly.
    mo.md(
        r"""
    We trained the Randomized Search CV using the following code:

    ```py
    # 📌 RandomizedSearchCV
    param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}

    rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
    rscv = RandomizedSearchCV(
        estimator=rf_optimized,
        param_distributions=param_dist,
        n_iter=5,
        scoring="roc_auc",
        cv=3,
        random_state=42,
        n_jobs=-1,
    )
    rscv.fit(train_data, y_train)

    rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
    rfo_test_pred = rscv.predict_proba(test_data)[:, 1]

    rfo_scores = {
        "train_score": roc_auc_score(y_train, rfo_train_pred),
        "test_score": roc_auc_score(y_test, rfo_test_pred),
    }
    rfo_scores
    ```

    📈 The ROC AUC scores obtained:
    """
    )
    return
@app.cell
def _():
    # Hard-coded ROC AUC scores displayed for the tuned Random Forest model.
    dict(
        train_score=0.8196620915431655,
        test_score=0.7308385425476998,
    )
    return
@app.cell
def _(mo):
    # Caption for the best search result. The medal emoji was mis-encoded and
    # is restored; a space separates it from the text like the other captions.
    mo.md(r"""🥇 The best results:""")
    return
@app.cell
def _():
    # Hard-coded summary of the randomized search outcome: chosen parameters,
    # best score, and the textual repr of the best estimator.
    dict(
        best_params_=dict(n_estimators=100, max_depth=10),
        best_score_=0.7296259755147781,
        best_estimator_="RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
    )
    return
@app.cell
def _(mo):
    # Subsection 3.4 heading.
    _heading = r"### 3.4 LightGBM"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Static (non-executed) listing of the LightGBM pipeline training code.
    # Fixes: mis-encoded emoji restored, "mutiple" typo corrected, and the
    # snippet's indentation / paragraph breaks restored so the fenced block
    # renders as readable Python.
    mo.md(
        r"""
    We trained our LightGBM Classifier model using the following code:

    ```py
    # 📌 LightGBM
    import warnings

    warnings.filterwarnings(
        "ignore", message="X does not have valid feature names"
    )

    # 📌 Get numerical and categorical variables (binary and multiple)
    num_cols = X_train.select_dtypes(include="number").columns.to_list()
    cat_cols = X_train.select_dtypes(include="object").columns.to_list()

    binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
    multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]

    # 📌 [1] Create the pipelines for different data types
    numerical_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    binary_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ordinal", OrdinalEncoder()),
            ("scaler", MinMaxScaler()),
        ]
    )

    multi_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            (
                "onehot",
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ),
            ("scaler", MinMaxScaler()),
        ]
    )

    # 📌 [2] Create the preprocessor using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("binary", binary_pipeline, binary_cols),
            ("multi", multi_pipeline, multi_cols),
            ("numerical", numerical_pipeline, num_cols),
        ],
        remainder="passthrough",
    )

    # 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
    lgbm = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        random_state=42,
        class_weight="balanced",
        n_jobs=-1,
    )

    lgbm_pipeline = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
    )

    # 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
    # The pipeline takes care of all the preprocessing internally.
    lgbm_pipeline.fit(X_train, y_train)

    lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
    lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]

    lgbm_scores = {
        "train_score": roc_auc_score(y_train, lgbm_train_pred),
        "test_score": roc_auc_score(y_test, lgbm_test_pred),
    }
    lgbm_scores
    ```

    📈 The ROC AUC scores obtained:
    """
    )
    return
@app.cell
def _():
    # Hard-coded ROC AUC scores displayed for the LightGBM model.
    dict(
        train_score=0.8523466410959462,
        test_score=0.7514895868142193,
    )
    return
@app.cell
def _(mo):
    # Section 4 heading.
    _heading = r"## 4. Model Performance Analysis"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # One stat card per model, laid out as a 2x2 grid (two rows of two cards).
    # Each card's value shows the rounded train score followed by the test
    # score. The emoji, em dashes and en dashes in the values and captions
    # were mis-encoded and are restored to the intended characters here.
    lg_stat = mo.stat(
        label="Logistic Regression",
        bordered=True,
        value="🏋️ 0.687 🔎 0.685",
        caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
        direction="decrease",
    )
    rfc_stat = mo.stat(
        label="Random Forest Classifier",
        bordered=True,
        value="🏋️ 1.0 🔎 0.707",
        caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
        direction="decrease",
    )
    rfo_stat = mo.stat(
        label="Random Forest with Randomized Search",
        bordered=True,
        value="🏋️ 0.820 🔎 0.731",
        caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and a strong performance.",
        direction="increase",
    )
    lgbm_stat = mo.stat(
        label="LightGBM",
        bordered=True,
        value="🏋️ 0.852 🔎 0.751",
        caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
        direction="increase",
    )

    mo.vstack(
        items=[
            mo.hstack(items=[lg_stat, rfc_stat], widths="equal", gap=1),
            mo.hstack(items=[rfo_stat, lgbm_stat], widths="equal", gap=1),
        ],
        gap=1,
        heights="equal",
        align="center",
        justify="center",
    )
    return
@app.cell
def _(mo):
    # Section 5 heading.
    _heading = r"## 5. Model Selection"
    mo.md(_heading)
    return
@app.cell
def _(mo):
    # Final comparison table and the reasoning behind the model choice.
    # Blank lines separate the intro paragraph, the table and the bullet list;
    # without them the markdown table is not recognized as its own block.
    mo.md(
        r"""
    Based on a comparison of all the models, the final model selection is clear.

    | Model | Train Score (AUC ROC) | Test Score (AUC ROC) |
    | :--- | :---: | :---: |
    | Logistic Regression | 0.687 | 0.685 |
    | Random Forest Classifier | 1.000 | 0.707 |
    | Randomized Search (Tuned RF) | 0.820 | 0.731 |
    | **LightGBM** | 0.852 | **0.751** |

    * The **Logistic Regression** model performed poorly due to underfitting.
    * The base **Random Forest** model, while better, suffered from severe overfitting.
    * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
    * However, the **LightGBM** model ultimately demonstrated the best performance, achieving the highest **ROC AUC test score of 0.751**. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data.
    """
    )
    return
@app.cell
def _(mo):
    # Closing success callout announcing the selected model. The medal emoji
    # was mis-encoded and is restored here.
    mo.callout(
        kind="success",
        value="🥇 Therefore, we will select the LightGBM model as our final choice for deployment.",
    )
    return
# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()