Commit e39e2ca
Parent(s): c9351e4

chore: Add LightGBM model
README.md CHANGED

@@ -11,3 +11,28 @@ short_description: ML Classification models applied to Home Credit Risk dataset

Check out marimo at <https://github.com/marimo-team/marimo>
Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
+
+## 3. Technology Stack
+
+This project was built using the following technologies and libraries:
+
+**Dashboard & Hosting:**
+
+- [Marimo](https://github.com/marimo-team/marimo): A Python library for building interactive dashboards.
+- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-config-reference): Used for hosting and sharing the interactive dashboard.
+
+**Data Analysis & Visualization:**
+
+- [Pandas](https://pandas.pydata.org/): For data manipulation and analysis.
+- [Matplotlib](https://matplotlib.org/): For creating static visualizations.
+- [Seaborn](https://seaborn.pydata.org/): For creating statistical graphics.
+
+**Modeling & Training:**
+
+- [Scikit-Learn](https://scikit-learn.org/stable/): For machine learning tasks such as preprocessing, feature engineering, and model training.
+- [LightGBM](https://lightgbm.readthedocs.io/en/stable/): A gradient boosting framework that uses tree-based learning algorithms.
+
+**Development Tools:**
+
+- [Ruff](https://github.com/charliermarsh/ruff): A fast Python linter and code formatter.
+- [uv](https://github.com/astral-sh/uv): A fast Python package installer and resolver.
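Since Marimo anchors the whole stack, here is a minimal sketch of what a marimo app file looks like (the cell contents are illustrative placeholders, not this project's code); the same `@app.cell` / `app.run()` structure appears in the app.py diff below:

```py
# Minimal marimo app sketch; cell contents are placeholders, not this
# project's code. A marimo notebook is a plain Python file of decorated cells.
import marimo

app = marimo.App()


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md("""Hello from a marimo cell""")
    return


if __name__ == "__main__":
    app.run()
```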
app.py CHANGED
@@ -22,11 +22,18 @@ def _():
    import pandas as pd
    import seaborn as sns

+   from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
-   from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

+   from sklearn.pipeline import Pipeline
+   from sklearn.compose import ColumnTransformer
+   from sklearn.impute import SimpleImputer
+   from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
+
+   from lightgbm import LGBMClassifier
+
    from src.plots import (
        plot_target_distribution,
        plot_credit_amounts,
@@ -181,6 +188,14 @@ def _(mo):
    return


+@app.cell
+def _(mo):
+    mo.md(
+        r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](./src/plots.py)."""
+    )
+    return
+
+
@app.cell
def _(mo):
    mo.md("""**a. Credit Amounts**""")
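The new cell above points readers at `src/plots.py`, which is not part of this commit. As a purely hypothetical illustration of what such a helper might look like (the function body, the `TARGET` column name, and all styling are assumptions):

```py
# Purely hypothetical sketch of a src/plots.py helper; the TARGET column
# name and all styling choices are assumptions, not code from this commit.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_target_distribution(df: pd.DataFrame):
    # Count plot of the binary TARGET column of the Home Credit dataset.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x="TARGET", ax=ax)
    ax.set_title("Target distribution")
    return fig
```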
@@ -299,6 +314,8 @@ def _(mo):
        - One Hot Encoding for more than 2 categories.
        - Impute values for all columns with missing data (using median as imputing value).
        - Feature scaling with Min-Max scaler.
+
+       Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](./src/preprocessing.py).
        """
    )
    return
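The preprocessing bullets above (one-hot encoding for multi-category columns, median imputation, Min-Max scaling) map directly onto scikit-learn primitives. A condensed sketch with placeholder column names follows; the same pattern appears in full inside the LightGBM hunk further down, and `src/preprocessing.py` may differ in detail:

```py
# Condensed sketch of the preprocessing steps listed above; column names
# are placeholders and src/preprocessing.py may differ in detail.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

numeric_cols = ["AMT_INCOME_TOTAL", "AMT_CREDIT"]  # placeholder columns
multi_cat_cols = ["NAME_INCOME_TYPE"]  # categoricals with >2 categories

preprocess = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    # Median imputation for columns with missing data,
                    # then Min-Max scaling to [0, 1].
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", MinMaxScaler()),
                ]
            ),
            numeric_cols,
        ),
        # One Hot Encoding for columns with more than 2 categories.
        ("multi", OneHotEncoder(handle_unknown="ignore"), multi_cat_cols),
    ]
)
# preprocess.fit_transform(X_train) would yield the model-ready matrix.
```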
@@ -441,6 +458,7 @@ def _(mo):
        We trained the Randomized Search CV using the following code:

        ```py
+       # RandomizedSearchCV
        param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}

        rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
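The hunk cuts off right after the estimator is defined. For context, a sketch of how such a search is typically completed; the `n_iter`, `scoring`, and `cv` values here are assumptions, not values from this commit:

```py
# Sketch of completing the search above; n_iter, scoring, and cv are
# assumed values, not taken from this commit.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_search = RandomizedSearchCV(
    rf_optimized,
    param_distributions=param_dist,
    n_iter=5,           # samples 5 of the 9 possible combinations
    scoring="roc_auc",  # matches the ROC AUC metric used in the app
    cv=3,
    random_state=42,
)
# rf_search.fit(X_train, y_train) then exposes rf_search.best_estimator_
# and rf_search.best_score_.
```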
@@ -499,5 +517,113 @@ def _():
    return


+@app.cell
+def _(mo):
+    mo.md(r"""### 3.4 LightGBM""")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+        We trained our LightGBM Classifier model using the following code:
+
+        ```py
+        # LightGBM
+        import warnings
+
+        warnings.filterwarnings(
+            "ignore", message="X does not have valid feature names"
+        )
+
+        # Get numerical and categorical variables (binary and multiple)
+        num_cols = X_train.select_dtypes(include="number").columns.to_list()
+        cat_cols = X_train.select_dtypes(include="object").columns.to_list()
+
+        binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
+        multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
+
+        # [1] Create the pipelines for different data types
+        numerical_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="median")),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        binary_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                ("ordinal", OrdinalEncoder()),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        multi_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                (
+                    "onehot",
+                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                ),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        # [2] Create the preprocessor using ColumnTransformer
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ("binary", binary_pipeline, binary_cols),
+                ("multi", multi_pipeline, multi_cols),
+                ("numerical", numerical_pipeline, num_cols),
+            ],
+            remainder="passthrough",
+        )
+
+        # [3] Create the final Pipeline that combines the preprocessor and the model
+        lgbm = LGBMClassifier(
+            n_estimators=500,
+            learning_rate=0.05,
+            max_depth=-1,
+            random_state=42,
+            class_weight="balanced",
+            n_jobs=-1,
+        )
+
+        lgbm_pipeline = Pipeline(
+            steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
+        )
+
+        # [4] Fit the final Pipeline on the ORIGINAL, unprocessed data.
+        # The pipeline takes care of all the preprocessing internally.
+        lgbm_pipeline.fit(X_train, y_train)
+
+        lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
+        lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
+
+        lgbm_scores = {
+            "train_score": roc_auc_score(y_train, lgbm_train_pred),
+            "test_score": roc_auc_score(y_test, lgbm_test_pred),
+        }
+        lgbm_scores
+        ```
+
+        The ROC AUC scores obtained:
+        """
+    )
+    return
+
+
+@app.cell
+def _():
+    lgbm_scores = {
+        "train_score": 0.8523466410959462,
+        "test_score": 0.7514895868142193,
+    }
+    lgbm_scores
+    return
+
+
if __name__ == "__main__":
    app.run()
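Because `lgbm_pipeline` bundles the preprocessor with the classifier, the fitted object can score raw, unprocessed rows directly. A short usage sketch; persisting with joblib is an assumption here, not something this commit does:

```py
# Usage sketch for the fitted pipeline; joblib persistence is an assumption,
# not something this commit does.
import joblib

# The pipeline applies imputation, encoding, and scaling internally,
# so it accepts raw DataFrame rows as-is.
joblib.dump(lgbm_pipeline, "lgbm_pipeline.joblib")

loaded = joblib.load("lgbm_pipeline.joblib")
default_proba = loaded.predict_proba(X_test.head())[:, 1]  # P(default) per row
print(default_proba)
```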