import marimo __generated_with = "0.14.16" app = marimo.App() @app.cell def _(): import joblib import warnings import marimo as mo import pandas as pd warnings.filterwarnings( "ignore", message="X does not have valid feature names" ) return joblib, mo, pd @app.cell def _(mo): mo.center(mo.md("# ๐Ÿฆ Home Credit Default Risk Prediction")) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(joblib, mo): # ๐Ÿ“Œ [1] Load the saved model pipeline with mo.redirect_stdout(): loaded_pipeline = joblib.load("./model/lgbm_model.joblib") return (loaded_pipeline,) @app.cell def _(): # ๐Ÿ“Œ [2] Define the default values for all other features default_values = { "SK_ID_CURR": 277659.5, "CNT_CHILDREN": 0.0, "AMT_INCOME_TOTAL": 147150.0, "AMT_CREDIT": 512997.75, "AMT_ANNUITY": 24885.0, "AMT_GOODS_PRICE": 450000.0, "REGION_POPULATION_RELATIVE": 0.01885, "DAYS_BIRTH": -15743.5, "DAYS_EMPLOYED": -1219.0, "DAYS_REGISTRATION": -4492.0, "DAYS_ID_PUBLISH": -3254.0, "OWN_CAR_AGE": 9.0, "FLAG_MOBIL": 1.0, "FLAG_EMP_PHONE": 1.0, "FLAG_WORK_PHONE": 0.0, "FLAG_CONT_MOBILE": 1.0, "FLAG_PHONE": 0.0, "FLAG_EMAIL": 0.0, "CNT_FAM_MEMBERS": 2.0, "REGION_RATING_CLIENT": 2.0, "REGION_RATING_CLIENT_W_CITY": 2.0, "HOUR_APPR_PROCESS_START": 12.0, "REG_REGION_NOT_LIVE_REGION": 0.0, "REG_REGION_NOT_WORK_REGION": 0.0, "LIVE_REGION_NOT_WORK_REGION": 0.0, "REG_CITY_NOT_LIVE_CITY": 0.0, "REG_CITY_NOT_WORK_CITY": 0.0, "LIVE_CITY_NOT_WORK_CITY": 0.0, "EXT_SOURCE_1": 0.5068839442599388, "EXT_SOURCE_2": 0.5662837032261614, "EXT_SOURCE_3": 0.5370699579791587, "APARTMENTS_AVG": 0.0876, "BASEMENTAREA_AVG": 0.0764, "YEARS_BEGINEXPLUATATION_AVG": 0.9816, "YEARS_BUILD_AVG": 0.7552, "COMMONAREA_AVG": 0.0211, "ELEVATORS_AVG": 0.0, "ENTRANCES_AVG": 0.1379, "FLOORSMAX_AVG": 0.1667, "FLOORSMIN_AVG": 0.2083, "LANDAREA_AVG": 0.0483, "LIVINGAPARTMENTS_AVG": 0.0756, "LIVINGAREA_AVG": 0.0746, "NONLIVINGAPARTMENTS_AVG": 0.0, "NONLIVINGAREA_AVG": 0.0035, "APARTMENTS_MODE": 0.084, "BASEMENTAREA_MODE": 0.0748, "YEARS_BEGINEXPLUATATION_MODE": 0.9816, "YEARS_BUILD_MODE": 0.7648, "COMMONAREA_MODE": 0.0191, "ELEVATORS_MODE": 0.0, "ENTRANCES_MODE": 0.1379, "FLOORSMAX_MODE": 0.1667, "FLOORSMIN_MODE": 0.2083, "LANDAREA_MODE": 0.0459, "LIVINGAPARTMENTS_MODE": 0.0771, "LIVINGAREA_MODE": 0.0731, "NONLIVINGAPARTMENTS_MODE": 0.0, "NONLIVINGAREA_MODE": 0.0011, "APARTMENTS_MEDI": 0.0864, "BASEMENTAREA_MEDI": 0.0761, "YEARS_BEGINEXPLUATATION_MEDI": 0.9816, "YEARS_BUILD_MEDI": 0.7585, "COMMONAREA_MEDI": 0.0209, "ELEVATORS_MEDI": 0.0, "ENTRANCES_MEDI": 0.1379, "FLOORSMAX_MEDI": 0.1667, "FLOORSMIN_MEDI": 0.2083, "LANDAREA_MEDI": 0.0488, "LIVINGAPARTMENTS_MEDI": 0.0765, "LIVINGAREA_MEDI": 0.0749, "NONLIVINGAPARTMENTS_MEDI": 0.0, "NONLIVINGAREA_MEDI": 0.003, "TOTALAREA_MODE": 0.0687, "OBS_30_CNT_SOCIAL_CIRCLE": 0.0, "DEF_30_CNT_SOCIAL_CIRCLE": 0.0, "OBS_60_CNT_SOCIAL_CIRCLE": 0.0, "DEF_60_CNT_SOCIAL_CIRCLE": 0.0, "DAYS_LAST_PHONE_CHANGE": -755.0, "FLAG_DOCUMENT_2": 0.0, "FLAG_DOCUMENT_3": 1.0, "FLAG_DOCUMENT_4": 0.0, "FLAG_DOCUMENT_5": 0.0, "FLAG_DOCUMENT_6": 0.0, "FLAG_DOCUMENT_7": 0.0, "FLAG_DOCUMENT_8": 0.0, "FLAG_DOCUMENT_9": 0.0, "FLAG_DOCUMENT_10": 0.0, "FLAG_DOCUMENT_11": 0.0, "FLAG_DOCUMENT_12": 0.0, "FLAG_DOCUMENT_13": 0.0, "FLAG_DOCUMENT_14": 0.0, "FLAG_DOCUMENT_15": 0.0, "FLAG_DOCUMENT_16": 0.0, "FLAG_DOCUMENT_17": 0.0, "FLAG_DOCUMENT_18": 0.0, "FLAG_DOCUMENT_19": 0.0, "FLAG_DOCUMENT_20": 0.0, "FLAG_DOCUMENT_21": 0.0, "AMT_REQ_CREDIT_BUREAU_HOUR": 0.0, "AMT_REQ_CREDIT_BUREAU_DAY": 0.0, "AMT_REQ_CREDIT_BUREAU_WEEK": 0.0, "AMT_REQ_CREDIT_BUREAU_MON": 0.0, "AMT_REQ_CREDIT_BUREAU_QRT": 0.0, "AMT_REQ_CREDIT_BUREAU_YEAR": 1.0, "NAME_CONTRACT_TYPE": "Cash loans", "CODE_GENDER": "F", "FLAG_OWN_CAR": "N", "FLAG_OWN_REALTY": "Y", "NAME_TYPE_SUITE": "Unaccompanied", "NAME_INCOME_TYPE": "Working", "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Married", "NAME_HOUSING_TYPE": "House / apartment", "OCCUPATION_TYPE": "Laborers", "WEEKDAY_APPR_PROCESS_START": "TUESDAY", "ORGANIZATION_TYPE": "Business Entity Type 3", "FONDKAPREMONT_MODE": "reg oper account", "HOUSETYPE_MODE": "block of flats", "WALLSMATERIAL_MODE": "Panel", "EMERGENCYSTATE_MODE": "No", } return (default_values,) @app.cell def _(mo): # ๐Ÿ“Œ [3] Create widgets for the top 10 features EXT_SOURCE_3 = mo.ui.slider( start=0.00, stop=0.90, step=0.01, value=0.5, label="EXT_SOURCE_3", ) EXT_SOURCE_2 = mo.ui.slider( start=0.00, stop=0.86, step=0.01, value=0.5, label="EXT_SOURCE_2", ) DAYS_BIRTH = mo.ui.slider( start=-25229, stop=-7673, value=-15743, label="DAYS_BIRTH", ) EXT_SOURCE_1 = mo.ui.slider( start=0.01, stop=0.97, step=0.01, value=0.5, label="EXT_SOURCE_1", ) AMT_ANNUITY = mo.ui.slider( start=1980, stop=258025, step=100, value=24885, label="AMT_ANNUITY", ) AMT_CREDIT = mo.ui.slider( start=45000, stop=4050000, step=50000, value=512997, label="AMT_CREDIT", ) DAYS_EMPLOYED = mo.ui.slider( start=-17583, stop=365243, value=-1219, label="DAYS_EMPLOYED", ) DAYS_ID_PUBLISH = mo.ui.slider( start=-7197, stop=0, value=-3254, label="DAYS_ID_PUBLISH", ) DAYS_REGISTRATION = mo.ui.slider( start=-24672, stop=0, value=-4492, label="DAYS_REGISTRATION", ) SK_ID_CURR = mo.ui.slider( start=100003, stop=456253, step=100, value=277659, label="SK_ID_CURR", ) features_widgets = { "EXT_SOURCE_3": EXT_SOURCE_3, "EXT_SOURCE_2": EXT_SOURCE_2, "DAYS_BIRTH": DAYS_BIRTH, "EXT_SOURCE_1": EXT_SOURCE_1, "AMT_ANNUITY": AMT_ANNUITY, "AMT_CREDIT": AMT_CREDIT, "DAYS_EMPLOYED": DAYS_EMPLOYED, "DAYS_ID_PUBLISH": DAYS_ID_PUBLISH, "DAYS_REGISTRATION": DAYS_REGISTRATION, "SK_ID_CURR": SK_ID_CURR, } return (features_widgets,) @app.cell def _(features_widgets, mo): # ๐Ÿ“Œ [4] Create the form with the sliders sliders_form = ( mo.md(""" ###Fill in the Client Profile to see the prediction {EXT_SOURCE_3} {EXT_SOURCE_2} {DAYS_BIRTH} {EXT_SOURCE_1} {AMT_ANNUITY} {AMT_CREDIT} {DAYS_EMPLOYED} {DAYS_ID_PUBLISH} {DAYS_REGISTRATION} {SK_ID_CURR} """) .batch(**features_widgets) # Pass the dict unpacked .form(show_clear_button=True, bordered=True) ) return (sliders_form,) @app.cell def _(default_values, loaded_pipeline, mo, pd, sliders_form): # ๐Ÿ“Œ [5] Get prediction from model probability = None # Process form submission if sliders_form.value is not None: # Copy default values prediction_data = default_values.copy() # Update with sliders' submitted values prediction_data.update(sliders_form.value) # Create a DataFrame predict_df = pd.DataFrame([prediction_data]) # Predict probability probability = loaded_pipeline.predict_proba(predict_df)[:, 1][0] else: mo.md("Fill in the form and click **Submit** to get a prediction.") return (probability,) @app.cell def _(probability): # ๐Ÿ“Œ [6] Display prediction results prob_percent = 70.12 risk = "High Risk" direction = "decrease" if probability is not None: prob_percent = round(probability * 100, 2) # Define risk category if probability < 0.34: risk = "Low Risk" direction = "increase" elif probability < 0.67: risk = "Medium Risk" direction = None else: risk = "High Risk" direction = "decrease" return direction, prob_percent, risk @app.cell def _(direction, mo, prob_percent, risk): interpretation_text = f"""This means there is a {prob_percent}% chance the client will default on their loan. Risk level is categorized as {risk}, which can help guide loan approval decisions. """ result_stat = mo.stat( label="โš–๏ธ Probability of Payment Difficulties", bordered=True, value=f"{prob_percent}%", caption=risk, direction=direction, ) interpretation_stat = mo.stat( label="๐Ÿ’ก Interpretation", bordered=True, value="", caption=interpretation_text, ) return interpretation_stat, result_stat @app.cell def _(mo): mo.md("""## ๐Ÿ”ฎ Credit Risk Predictor โ€” Try It Yourself!""") return @app.cell def _(mo): mo.Html("

") return @app.cell def _(interpretation_stat, mo, result_stat): mo.vstack( items=[ mo.hstack( items=[result_stat, interpretation_stat], widths="equal", gap=1 ), ], gap=1, heights="equal", ) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(sliders_form): sliders_form return @app.cell def _(mo): mo.md( r""" _(*) Predictions are based on the top 10 most important features. Remaining features are assigned default values (median for numeric, mode for categorical)._ """ ) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(mo): mo.md(r"""## ๐Ÿš€ Model Selection""") return @app.cell def _(mo): mo.Html("

") return @app.cell def _(mo): lg_stat = mo.stat( label="Logistic Regression", bordered=True, value="๐Ÿ’ช๐Ÿป 68.7% ๐Ÿ“ 68.5%", caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting โ€” the model is too simple to capture complex patterns.", direction="decrease", ) rfc_stat = mo.stat( label="Random Forest Classifier", bordered=True, value="๐Ÿ’ช๐Ÿป 100% ๐Ÿ“ 70.7%", caption="Perfect training AUC indicates severe overfitting โ€” the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.", direction="decrease", ) rfo_stat = mo.stat( label="Random Forest with Randomized Search", bordered=True, value="๐Ÿ’ช๐Ÿป 82% ๐Ÿ“ 73.1%", caption="Hyperparameter tuning greatly reduced overfitting. The smaller trainโ€“test gap and improved test AUC show better generalization and a strong performance.", direction="increase", ) lgbm_stat = mo.stat( label="LightGBM", bordered=True, value="๐Ÿ’ช๐Ÿป 85.2% ๐Ÿ“ 75.1%", caption="Best overall performance. Small trainโ€“test gap and highest test AUC indicate a well-balanced model with strong generalization.", direction="increase", ) mo.vstack( items=[ mo.hstack(items=[lg_stat, rfc_stat], widths="equal", gap=1), mo.hstack(items=[rfo_stat, lgbm_stat], widths="equal", gap=1), ], gap=1, heights="equal", align="center", justify="center", ) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(mo): mo.md( r"""Based on a comparison of all the models _(using AUC ROC metric)_, the final model selection is clear:""" ) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(mo): mo.center( mo.md(r""" | Model | ๐Ÿ’ช๐Ÿป Train Score | ๐Ÿ“ Test Score | | :--- | :---: | :---: | | Logistic Regression | 0.687 | 0.685 | | Random Forest Classifier | 1.000 | 0.707 | | Randomized Search (Tuned RF) | 0.820 | 0.731 | | **LightGBM** | **0.852** | **0.751** | """) ) return @app.cell def _(mo): mo.Html("
") return @app.cell def _(mo): mo.md( r""" * The **Logistic Regression** model performed poorly due to underfitting. * The base **Random Forest** model, while better, suffered from severe overfitting. * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`. * However, the **LightGBM** model ultimately demonstrated the best performance, achieving the highest **ROC AUC test score of 0.751**. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data. """ ) return @app.cell def _(mo): mo.callout( kind="info", value=mo.md( """๐Ÿ’ก **Want to explore the process in detail?** See the full ๐Ÿ‘‰ [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb) ๐Ÿ‘ˆ๏ธ for an end-to-end walkthrough, including Exploratory Data Analysis, preprocessing, model training, evaluation, model selection, and saving the final model.""" ), ) return @app.cell def _(mo): mo.Html("


") return @app.cell def _(mo): mo.center( mo.md( "**Connect with me:** ๐Ÿ’ผ [Linkedin](https://www.linkedin.com/in/alex-turpo/) โ€ข ๐Ÿฑ [GitHub](https://github.com/iBrokeTheCode) โ€ข ๐Ÿค— [Hugging Face](https://huggingface.co/iBrokeTheCode)" ) ) return if __name__ == "__main__": app.run()