Commit e39e2ca
Parent(s): c9351e4

chore: Add LightGBM model
README.md CHANGED

@@ -11,3 +11,28 @@ short_description: ML Classification models applied to Home Credit Risk dataset

Check out marimo at <https://github.com/marimo-team/marimo>
Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
+
+## 3. Technology Stack
+
+This project was built using the following technologies and libraries:
+
+**Dashboard & Hosting:**
+
+- [Marimo](https://github.com/marimo-team/marimo): A Python library for building interactive dashboards.
+- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-config-reference): Used for hosting and sharing the interactive dashboard.
+
+**Data Analysis & Visualization:**
+
+- [Pandas](https://pandas.pydata.org/): For data manipulation and analysis.
+- [Matplotlib](https://matplotlib.org/): For creating static visualizations.
+- [Seaborn](https://seaborn.pydata.org/): For creating statistical graphics.
+
+**Modeling & Training:**
+
+- [Scikit-Learn](https://scikit-learn.org/stable/): For machine learning tasks such as preprocessing, feature engineering, and model training.
+- [LightGBM](https://lightgbm.readthedocs.io/en/stable/): A gradient boosting framework that uses tree-based learning algorithms.
+
+**Development Tools:**
+
+- [Ruff](https://github.com/charliermarsh/ruff): A fast Python linter and code formatter.
+- [uv](https://github.com/astral-sh/uv): A fast Python package installer and resolver.
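Since Marimo anchors the whole stack, here is a minimal sketch of what a marimo app file looks like (the cell contents are illustrative placeholders, not this project's code); the same `@app.cell` / `app.run()` structure appears in the app.py diff below:

```py
# Minimal marimo app sketch; cell contents are placeholders, not this
# project's code. A marimo notebook is a plain Python file of decorated cells.
import marimo

app = marimo.App()


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md("""Hello from a marimo cell""")
    return


if __name__ == "__main__":
    app.run()
```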
app.py CHANGED
@@ -22,11 +22,18 @@ def _():
    import pandas as pd
    import seaborn as sns

+   from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
-   from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

+   from sklearn.pipeline import Pipeline
+   from sklearn.compose import ColumnTransformer
+   from sklearn.impute import SimpleImputer
+   from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
+
+   from lightgbm import LGBMClassifier
+
    from src.plots import (
        plot_target_distribution,
        plot_credit_amounts,
@@ -181,6 +188,14 @@ def _(mo):
    return


+@app.cell
+def _(mo):
+    mo.md(
+        r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](./src/plots.py)."""
+    )
+    return
+
+
@app.cell
def _(mo):
    mo.md("""**a. Credit Amounts**""")
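The new cell above points readers at `src/plots.py`, which is not part of this commit. As a purely hypothetical illustration of what such a helper might look like (the function body, the `TARGET` column name, and all styling are assumptions):

```py
# Purely hypothetical sketch of a src/plots.py helper; the TARGET column
# name and all styling choices are assumptions, not code from this commit.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_target_distribution(df: pd.DataFrame):
    # Count plot of the binary TARGET column of the Home Credit dataset.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x="TARGET", ax=ax)
    ax.set_title("Target distribution")
    return fig
```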
@@ -299,6 +314,8 @@ def _(mo):
        - One Hot Encoding for more than 2 categories.
        - Impute values for all columns with missing data (using median as imputing value).
        - Feature scaling with Min-Max scaler.
+
+       Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](./src/preprocessing.py).
        """
    )
    return
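The preprocessing bullets above (one-hot encoding for multi-category columns, median imputation, Min-Max scaling) map directly onto scikit-learn primitives. A condensed sketch with placeholder column names follows; the same pattern appears in full inside the LightGBM hunk further down, and `src/preprocessing.py` may differ in detail:

```py
# Condensed sketch of the preprocessing steps listed above; column names
# are placeholders and src/preprocessing.py may differ in detail.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

numeric_cols = ["AMT_INCOME_TOTAL", "AMT_CREDIT"]  # placeholder columns
multi_cat_cols = ["NAME_INCOME_TYPE"]  # categoricals with >2 categories

preprocess = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    # Median imputation for columns with missing data,
                    # then Min-Max scaling to [0, 1].
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", MinMaxScaler()),
                ]
            ),
            numeric_cols,
        ),
        # One Hot Encoding for columns with more than 2 categories.
        ("multi", OneHotEncoder(handle_unknown="ignore"), multi_cat_cols),
    ]
)
# preprocess.fit_transform(X_train) would yield the model-ready matrix.
```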
@@ -441,6 +458,7 @@ def _(mo):
        We trained the Randomized Search CV using the following code:

        ```py
+       # RandomizedSearchCV
        param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}

        rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
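The hunk cuts off right after the estimator is defined. For context, a sketch of how such a search is typically completed; the `n_iter`, `scoring`, and `cv` values here are assumptions, not values from this commit:

```py
# Sketch of completing the search above; n_iter, scoring, and cv are
# assumed values, not taken from this commit.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_search = RandomizedSearchCV(
    rf_optimized,
    param_distributions=param_dist,
    n_iter=5,           # samples 5 of the 9 possible combinations
    scoring="roc_auc",  # matches the ROC AUC metric used in the app
    cv=3,
    random_state=42,
)
# rf_search.fit(X_train, y_train) then exposes rf_search.best_estimator_
# and rf_search.best_score_.
```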
@@ -499,5 +517,113 @@ def _():
    return


+@app.cell
+def _(mo):
+    mo.md(r"""### 3.4 LightGBM""")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+        We trained our LightGBM Classifier model using the following code:
+
+        ```py
+        # LightGBM
+        import warnings
+
+        warnings.filterwarnings(
+            "ignore", message="X does not have valid feature names"
+        )
+
+        # Get numerical and categorical variables (binary and multiple)
+        num_cols = X_train.select_dtypes(include="number").columns.to_list()
+        cat_cols = X_train.select_dtypes(include="object").columns.to_list()
+
+        binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
+        multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
+
+        # [1] Create the pipelines for different data types
+        numerical_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="median")),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        binary_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                ("ordinal", OrdinalEncoder()),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        multi_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                (
+                    "onehot",
+                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                ),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        # [2] Create the preprocessor using ColumnTransformer
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ("binary", binary_pipeline, binary_cols),
+                ("multi", multi_pipeline, multi_cols),
+                ("numerical", numerical_pipeline, num_cols),
+            ],
+            remainder="passthrough",
+        )
+
+        # [3] Create the final Pipeline that combines the preprocessor and the model
+        lgbm = LGBMClassifier(
+            n_estimators=500,
+            learning_rate=0.05,
+            max_depth=-1,
+            random_state=42,
+            class_weight="balanced",
+            n_jobs=-1,
+        )
+
+        lgbm_pipeline = Pipeline(
+            steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
+        )
+
+        # [4] Fit the final Pipeline on the ORIGINAL, unprocessed data.
+        # The pipeline takes care of all the preprocessing internally.
+        lgbm_pipeline.fit(X_train, y_train)
+
+        lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
+        lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
+
+        lgbm_scores = {
+            "train_score": roc_auc_score(y_train, lgbm_train_pred),
+            "test_score": roc_auc_score(y_test, lgbm_test_pred),
+        }
+        lgbm_scores
+        ```
+
+        The ROC AUC scores obtained:
+        """
+    )
+    return
+
+
+@app.cell
+def _():
+    lgbm_scores = {
+        "train_score": 0.8523466410959462,
+        "test_score": 0.7514895868142193,
+    }
+    lgbm_scores
+    return
+
+
if __name__ == "__main__":
    app.run()
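Because `lgbm_pipeline` bundles the preprocessor with the classifier, the fitted object can score raw, unprocessed rows directly. A short usage sketch; persisting with joblib is an assumption here, not something this commit does:

```py
# Usage sketch for the fitted pipeline; joblib persistence is an assumption,
# not something this commit does.
import joblib

# The pipeline applies imputation, encoding, and scaling internally,
# so it accepts raw DataFrame rows as-is.
joblib.dump(lgbm_pipeline, "lgbm_pipeline.joblib")

loaded = joblib.load("lgbm_pipeline.joblib")
default_proba = loaded.predict_proba(X_test.head())[:, 1]  # P(default) per row
print(default_proba)
```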