# Marimo notebook app: Home Credit Default Risk Prediction.
#
# The notebook loads the dataset through helpers in ``src``, renders an
# exploratory analysis, runs the preprocessing pipeline, and then presents
# model-training code and its results as markdown. The training snippets in
# section 3 are displayed only (they live inside ``mo.md`` strings); the
# score dictionaries next to them are literal values.

import marimo

__generated_with = "0.14.16"
app = marimo.App()


@app.cell
def _():
    import marimo as mo

    return (mo,)


@app.cell
def _(mo):
    mo.center(mo.md("# Home Credit Default Risk Prediction"))
    return


@app.cell
def _():
    # Only pandas, the plot helpers and the data helpers are returned to
    # other cells; the scikit-learn / LightGBM names are imported here but
    # are not consumed by any executable cell in this file.
    import pandas as pd

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
    from lightgbm import LGBMClassifier

    from src.plots import (
        plot_target_distribution,
        plot_credit_amounts,
        plot_education_levels,
        plot_occupation,
        plot_family_status,
        plot_income_type,
    )
    from src.utils import get_dataset, get_features_target, get_train_test_sets
    from src.preprocessing import preprocess_data_pipeline

    return (
        get_dataset,
        get_features_target,
        get_train_test_sets,
        pd,
        plot_credit_amounts,
        plot_education_levels,
        plot_family_status,
        plot_income_type,
        plot_occupation,
        plot_target_distribution,
        preprocess_data_pipeline,
    )


@app.cell
def _(get_dataset, get_features_target):
    # `df` is the full dataset; `X` / `y` are the features and target split
    # out of it by the project helper.
    df = get_dataset()
    X, y = get_features_target(df)
    return X, df, y


@app.cell
def _(mo):
    mo.md("""## 1. Exploratory Data Analysis""")
    return


@app.cell
def _(mo):
    mo.callout(
        kind="info",
        value=mo.md(
            """💡 **Want a step-by-step walkthrough instead?** Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
        ),
    )
    return


@app.cell
def _(mo):
    mo.md("""### 1.1 Dataset Information""")
    return


@app.cell
def _(mo):
    mo.md("""**a. Shape of the train and test datasets**""")
    return


@app.cell
def _(X_test, X_train, df):
    train_samples = f"Train dataset samples: {X_train.shape[0]}"
    test_samples = f"Test dataset samples: {X_test.shape[0]}"
    columns_number = f"Number of columns: {df.shape[1]}"
    train_samples, test_samples, columns_number
    return


@app.cell
def _(mo):
    mo.md("""**b. Dataset features**""")
    return


@app.cell
def _(X):
    X.columns
    return


@app.cell
def _(mo):
    mo.md("""**c. Sample from dataset**""")
    return


@app.cell
def _(X):
    # Transpose so each feature is a row and each of the 5 samples a column.
    sample = X.head(5).T
    sample.columns = [str(col) for col in sample.columns]  # fix integer name warning
    sample = sample.astype(str)  # avoid numeric conversion issues in viewer
    sample
    return


@app.cell
def _(mo):
    mo.md("""**d. Target variable Distribution**""")
    return


@app.cell
def _(df, plot_target_distribution):
    # The table is shown here; the plot is handed to the next cell.
    target_table, target_plot = plot_target_distribution(df=df)
    target_table
    return (target_plot,)


@app.cell
def _(target_plot):
    target_plot
    return


@app.cell
def _(mo):
    mo.md("""**e. Number of columns of each data type**""")
    return


@app.cell
def _(X):
    X.dtypes.value_counts().sort_values(ascending=False)
    return


@app.cell
def _(X):
    # Number of distinct values per object-typed column, largest first.
    categorical_cols = (
        X.select_dtypes(include=["object"]).nunique().sort_values(ascending=False)
    )
    categorical_cols
    return


@app.cell
def _(mo):
    mo.md("""**f. Missing data**""")
    return


@app.cell
def _(X, pd):
    # Per-column count of missing values and its share of all rows (in %).
    missing_count = X.isna().sum().sort_values(ascending=False)
    missing_percentage = (missing_count / X.shape[0] * 100).round(2)
    missing_data = pd.DataFrame(
        data={"Count": missing_count, "percentage": missing_percentage}
    )
    missing_data
    return


@app.cell
def _(mo):
    mo.md("""### 1.2 Distribution of Variables""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."""
    )
    return


@app.cell
def _(mo):
    mo.md("""**a. Credit Amounts**""")
    return


@app.cell
def _(X, plot_credit_amounts):
    plot_credit_amounts(df=X)
    return


@app.cell
def _(mo):
    mo.md("""**b. Education Level of Credit Applicants**""")
    return


@app.cell
def _(X, plot_education_levels):
    education_table, education_plot = plot_education_levels(df=X)
    education_table
    return (education_plot,)


@app.cell
def _(education_plot):
    education_plot
    return


@app.cell
def _(mo):
    mo.md("""**c. Occupation of Credit Applicants**""")
    return


@app.cell
def _(X, plot_occupation):
    occupation_table, occupation_plot = plot_occupation(df=X)
    occupation_table
    return (occupation_plot,)


@app.cell
def _(occupation_plot):
    occupation_plot
    return


@app.cell
def _(mo):
    mo.md("""**d. Family Status of Applicants**""")
    return


@app.cell
def _(X, plot_family_status):
    family_status_table, family_status_plot = plot_family_status(df=X)
    family_status_table
    return (family_status_plot,)


@app.cell
def _(family_status_plot):
    family_status_plot
    return


@app.cell
def _(mo):
    mo.md("""**e. Income Type of Applicants by Target Variable**""")
    return


@app.cell
def _(df, plot_income_type):
    # Uses `df` rather than `X`, i.e. the frame before the target was split off.
    plot_income_type(df=df)
    return


@app.cell
def _(mo):
    mo.md("""## 2. Preprocessing""")
    return


@app.cell
def _(mo):
    mo.md("""**a. Separate Train and Test Datasets**""")
    return


@app.cell
def _(X, get_train_test_sets, y):
    # Only the feature splits are returned; the target splits are used for
    # the shape summary below and nowhere else in this file.
    X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
    X_train.shape, y_train.shape, X_test.shape, y_test.shape
    return X_test, X_train


@app.cell
def _(mo):
    mo.md("""**b. Preprocess Data**""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        This preprocessing performs:

        - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
        - Encode string categorical features (`dtype object`).
            - If the feature has 2 categories, Binary Encoding is applied.
            - One Hot Encoding for more than 2 categories.
        - Impute values for all columns with missing data (using median as imputing value).
        - Feature scaling with Min-Max scaler

        Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
        """
    )
    return


@app.cell
def _(X_test, X_train, preprocess_data_pipeline):
    train_data, test_data = preprocess_data_pipeline(
        train_df=X_train, test_df=X_test
    )
    train_data.shape, test_data.shape
    return


@app.cell
def _(mo):
    mo.md("""## 3. Training Models""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""At this point, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets."""
    )
    return


@app.cell
def _(mo):
    mo.md(r"""### 3.1 Logistic Regression""")
    return


@app.cell
def _(mo):
    mo.callout(
        mo.md("""
        In Logistic Regression, C is the inverse of regularization strength:

        - **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
        - **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
        """),
        kind="info",
    )
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        We trained our Logistic Regression model using the following code:

        ```py
        # 📌 Logistic Regression
        log_reg = LogisticRegression(C=0.0001)
        log_reg.fit(train_data, y_train)

        # Train data prediction (class 1)
        lr_train_pred = log_reg.predict_proba(train_data)[:, 1]

        # Test data prediction (class 1)
        lr_test_pred = log_reg.predict_proba(test_data)[:, 1]

        # Get the ROC AUC Score on train and test datasets
        log_reg_scores = {
            "train_score": roc_auc_score(y_train, lr_train_pred),
            "test_score": roc_auc_score(y_test, lr_test_pred),
        }
        log_reg_scores
        ```

        📈 The ROC AUC scores obtained:
        """
    )
    return


@app.cell
def _():
    # Literal values: the training snippet above is displayed, not executed.
    lr_scores = {
        "train_score": 0.6868418961663535,
        "test_score": 0.6854973003347028,
    }
    lr_scores
    return


@app.cell
def _(mo):
    mo.md(r"""### 3.2 Random Forest Classifier""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        We trained our Random Forest Classifier model using the following code:

        ```py
        # 📌 Random Forest Classifier
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        rf.fit(train_data, y_train)

        rf_train_pred = rf.predict_proba(train_data)[:, 1]
        rf_test_pred = rf.predict_proba(test_data)[:, 1]

        rf_scores = {
            "train_score": roc_auc_score(y_train, rf_train_pred),
            "test_score": roc_auc_score(y_test, rf_test_pred),
        }
        rf_scores
        ```

        📈 The ROC AUC scores obtained:
        """
    )
    return


@app.cell
def _():
    # Literal values: the training snippet above is displayed, not executed.
    rf_scores = {"train_score": 1.0, "test_score": 0.7066811557903828}
    rf_scores
    return


@app.cell
def _(mo):
    mo.md(r"""### 3.3 Randomized Search with Cross Validation""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        We trained the Randomized Search CV using the following code:

        ```py
        # 📌 RandomizedSearchCV
        param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}

        rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
        rscv = RandomizedSearchCV(
            estimator=rf_optimized,
            param_distributions=param_dist,
            n_iter=5,
            scoring="roc_auc",
            cv=3,
            random_state=42,
            n_jobs=-1,
        )

        rscv.fit(train_data, y_train)

        rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
        rfo_test_pred = rscv.predict_proba(test_data)[:, 1]

        rfo_scores = {
            "train_score": roc_auc_score(y_train, rfo_train_pred),
            "test_score": roc_auc_score(y_test, rfo_test_pred),
        }
        rfo_scores
        ```

        📈 The ROC AUC scores obtained:
        """
    )
    return


@app.cell
def _():
    # Literal values: the training snippet above is displayed, not executed.
    rfo_scores = {
        "train_score": 0.8196620915431655,
        "test_score": 0.7308385425476998,
    }
    rfo_scores
    return


@app.cell
def _(mo):
    mo.md(r"""🥇The best results:""")
    return


@app.cell
def _():
    # Literal summary of the search result; the estimator is kept as its
    # textual repr, not as a fitted object.
    optimized_results = {
        "best_params_": {"n_estimators": 100, "max_depth": 10},
        "best_score_": 0.7296259755147781,
        "best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
    }
    optimized_results
    return


@app.cell
def _(mo):
    mo.md(r"""### 3.4 LightGBM""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        We trained our LightGBM Classifier model using the following code:

        ```py
        # 📌 LightGBM
        import warnings

        warnings.filterwarnings(
            "ignore", message="X does not have valid feature names"
        )

        # 📌 Get numerical and categorical variables (binary and multiple)
        num_cols = X_train.select_dtypes(include="number").columns.to_list()
        cat_cols = X_train.select_dtypes(include="object").columns.to_list()

        binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
        multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]

        # 📌 [1] Create the pipelines for different data types
        numerical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", MinMaxScaler()),
            ]
        )

        binary_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ordinal", OrdinalEncoder()),
                ("scaler", MinMaxScaler()),
            ]
        )

        multi_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                (
                    "onehot",
                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                ),
                ("scaler", MinMaxScaler()),
            ]
        )

        # 📌 [2] Create the preprocessor using ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ("binary", binary_pipeline, binary_cols),
                ("multi", multi_pipeline, multi_cols),
                ("numerical", numerical_pipeline, num_cols),
            ],
            remainder="passthrough",
        )

        # 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
        lgbm = LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            random_state=42,
            class_weight="balanced",
            n_jobs=-1,
        )

        lgbm_pipeline = Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
        )

        # 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
        # The pipeline takes care of all the preprocessing internally.
        lgbm_pipeline.fit(X_train, y_train)

        lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
        lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]

        lgbm_scores = {
            "train_score": roc_auc_score(y_train, lgbm_train_pred),
            "test_score": roc_auc_score(y_test, lgbm_test_pred),
        }
        lgbm_scores
        ```

        📈 The ROC AUC scores obtained:
        """
    )
    return


@app.cell
def _():
    # Literal values: the training snippet above is displayed, not executed.
    lgbm_scores = {
        "train_score": 0.8523466410959462,
        "test_score": 0.7514895868142193,
    }
    lgbm_scores
    return


@app.cell
def _(mo):
    mo.md(r"""## 4. Model Performance Analysis""")
    return


@app.cell
def _(mo):
    # One stat card per model; each value shows the rounded train score (🏋️)
    # followed by the rounded test score (🔎). Cards are laid out 2 x 2.
    lg_stat = mo.stat(
        label="Logistic Regression",
        bordered=True,
        value="🏋️ 0.687 🔎 0.685",
        caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
        direction="decrease",
    )

    rfc_stat = mo.stat(
        label="Random Forest Classifier",
        bordered=True,
        value="🏋️ 1.0 🔎 0.707",
        caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
        direction="decrease",
    )

    rfo_stat = mo.stat(
        label="Random Forest with Randomized Search",
        bordered=True,
        value="🏋️ 0.820 🔎 0.731",
        caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and a strong performance.",
        direction="increase",
    )

    lgbm_stat = mo.stat(
        label="LightGBM",
        bordered=True,
        value="🏋️ 0.852 🔎 0.751",
        caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
        direction="increase",
    )

    mo.vstack(
        items=[
            mo.hstack(items=[lg_stat, rfc_stat], widths="equal", gap=1),
            mo.hstack(items=[rfo_stat, lgbm_stat], widths="equal", gap=1),
        ],
        gap=1,
        heights="equal",
        align="center",
        justify="center",
    )
    return


@app.cell
def _(mo):
    mo.md(r"""## 5. Model Selection""")
    return


@app.cell
def _(mo):
    mo.md(
        r"""
        Based on a comparison of all the models, the final model selection is clear.

        | Model | Train Score (AUC ROC) | Test Score (AUC ROC) |
        | :--- | :---: | :---: |
        | Logistic Regression | 0.687 | 0.685 |
        | Random Forest Classifier | 1.000 | 0.707 |
        | Randomized Search (Tuned RF) | 0.820 | 0.731 |
        | **LightGBM** | 0.852 | **0.751** |

        * The **Logistic Regression** model performed poorly due to underfitting.
        * The base **Random Forest** model, while better, suffered from severe overfitting.
        * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
        * However, the **LightGBM** model ultimately demonstrated the best performance, achieving the highest **ROC AUC test score of 0.751**. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data.
        """
    )
    return


@app.cell
def _(mo):
    mo.callout(
        kind="success",
        value="🥇 Therefore, we will select the LightGBM model as our final choice for deployment.",
    )
    return


if __name__ == "__main__":
    app.run()