iBrokeTheCode committed on
Commit
e39e2ca
·
1 Parent(s): c9351e4

chore: Add LightGBM model

Files changed (2)
  1. README.md +25 -0
  2. app.py +127 -1
README.md CHANGED
@@ -11,3 +11,28 @@ short_description: ML Classification models applied to Home Credit Risk dataset
 
 Check out marimo at <https://github.com/marimo-team/marimo>
 Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
+
+## 3. Technology Stack
+
+This project was built using the following technologies and libraries:
+
+**Dashboard & Hosting:**
+
+- [Marimo](https://github.com/marimo-team/marimo): A Python library for building interactive dashboards.
+- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-config-reference): Used for hosting and sharing the interactive dashboard.
+
+**Data Analysis & Visualization:**
+
+- [Pandas](https://pandas.pydata.org/): For data manipulation and analysis.
+- [Matplotlib](https://matplotlib.org/): For creating static visualizations.
+- [Seaborn](https://seaborn.pydata.org/): For creating statistical graphics.
+
+**Modeling & Training:**
+
+- [Scikit-Learn](https://scikit-learn.org/stable/): For machine learning tasks such as preprocessing, feature engineering, and model training.
+- [LightGBM](https://lightgbm.readthedocs.io/en/stable/): A gradient boosting framework that uses tree-based learning algorithms.
+
+**Development Tools:**
+
+- [Ruff](https://github.com/charliermarsh/ruff): A fast Python linter and code formatter.
+- [uv](https://github.com/astral-sh/uv): A fast Python package installer and resolver.
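
The stack above centers on marimo's cell-based app model, which the app.py diff below uses throughout. As a minimal, illustrative sketch of that structure (the cell contents here are invented for illustration, not taken from this commit):

```py
import marimo

app = marimo.App()


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    # Each cell is a function; marimo tracks the names it defines and consumes.
    mo.md(r"""## Home Credit Risk dashboard""")
    return


if __name__ == "__main__":
    app.run()
```

Each `@app.cell` function declares its dependencies through its parameters, which is why the cells in the diff below take `mo` as an argument.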
app.py CHANGED
@@ -22,11 +22,18 @@ def _():
     import pandas as pd
     import seaborn as sns
 
+    from sklearn.ensemble import RandomForestClassifier
     from sklearn.linear_model import LogisticRegression
     from sklearn.metrics import roc_auc_score
-    from sklearn.ensemble import RandomForestClassifier
     from sklearn.model_selection import RandomizedSearchCV
 
+    from sklearn.pipeline import Pipeline
+    from sklearn.compose import ColumnTransformer
+    from sklearn.impute import SimpleImputer
+    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
+
+    from lightgbm import LGBMClassifier
+
     from src.plots import (
         plot_target_distribution,
         plot_credit_amounts,
@@ -181,6 +188,14 @@ def _(mo):
     return
 
 
+@app.cell
+def _(mo):
+    mo.md(
+        r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](./src/plots.py)."""
+    )
+    return
+
+
 @app.cell
 def _(mo):
     mo.md("""**a. Credit Amounts**""")
@@ -299,6 +314,8 @@ def _(mo):
         - One Hot Encoding for more than 2 categories.
         - Impute values for all columns with missing data (using median as imputing value).
         - Feature scaling with Min-Max scaler
+
+        Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](./src/preprocessing.py).
         """
     )
     return
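
The hunk above lists the preprocessing recipe (one-hot encoding for multi-category columns, median imputation, Min-Max scaling). The commit's LightGBM cell further down wires these same transforms into a full Pipeline; as a compact, standalone sketch of the individual steps (toy data and variable names are illustrative only, not from the repo):

```py
# Compact sketch of the listed steps on a toy frame.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

df = pd.DataFrame(
    {"amount": [100.0, None, 250.0], "type": ["cash", "card", "loan"]}
)

# One Hot Encoding for a column with more than 2 categories
type_ohe = OneHotEncoder(sparse_output=False).fit_transform(df[["type"]])

# Median imputation, then Min-Max scaling, for numeric columns
amount = SimpleImputer(strategy="median").fit_transform(df[["amount"]])
amount_scaled = MinMaxScaler().fit_transform(amount)
```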
@@ -441,6 +458,7 @@ def _(mo):
         We trained the Randomized Search CV using the following code:
 
         ```py
+        # 📌 RandomizedSearchCV
         param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
 
         rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
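
The diff context cuts this snippet off right after the estimator is defined. For completeness, a hedged sketch of how a RandomizedSearchCV run over that grid is typically finished (`n_iter`, `cv`, and the synthetic data are assumptions, not shown in the commit; `scoring="roc_auc"` matches the metric used elsewhere in app.py):

```py
# Hypothetical completion of the truncated snippet above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=500, random_state=42)  # stand-in data

param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)

search = RandomizedSearchCV(
    rf_optimized,
    param_distributions=param_dist,
    n_iter=5,           # assumption: the sampling budget is not in the diff
    scoring="roc_auc",  # consistent with the ROC AUC scores reported below
    cv=3,               # assumption
    random_state=42,
    n_jobs=-1,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```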
@@ -499,5 +517,113 @@ def _():
     return
 
 
+@app.cell
+def _(mo):
+    mo.md(r"""### 3.4 LightGBM""")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+        We trained our LightGBM Classifier model using the following code:
+
+        ```py
+        # 📌 LightGBM
+        import warnings
+
+        warnings.filterwarnings(
+            "ignore", message="X does not have valid feature names"
+        )
+
+        # 📌 Get numerical and categorical variables (binary and multiple)
+        num_cols = X_train.select_dtypes(include="number").columns.to_list()
+        cat_cols = X_train.select_dtypes(include="object").columns.to_list()
+
+        binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
+        multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
+
+        # 📌 [1] Create the pipelines for different data types
+        numerical_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="median")),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        binary_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                ("ordinal", OrdinalEncoder()),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        multi_pipeline = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                (
+                    "onehot",
+                    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                ),
+                ("scaler", MinMaxScaler()),
+            ]
+        )
+
+        # 📌 [2] Create the preprocessor using ColumnTransformer
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ("binary", binary_pipeline, binary_cols),
+                ("multi", multi_pipeline, multi_cols),
+                ("numerical", numerical_pipeline, num_cols),
+            ],
+            remainder="passthrough",
+        )
+
+        # 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
+        lgbm = LGBMClassifier(
+            n_estimators=500,
+            learning_rate=0.05,
+            max_depth=-1,
+            random_state=42,
+            class_weight="balanced",
+            n_jobs=-1,
+        )
+
+        lgbm_pipeline = Pipeline(
+            steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
+        )
+
+        # 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
+        # The pipeline takes care of all the preprocessing internally.
+        lgbm_pipeline.fit(X_train, y_train)
+
+        lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
+        lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
+
+        lgbm_scores = {
+            "train_score": roc_auc_score(y_train, lgbm_train_pred),
+            "test_score": roc_auc_score(y_test, lgbm_test_pred),
+        }
+        lgbm_scores
+        ```
+
+        📈 The ROC AUC scores obtained:
+        """
+    )
+    return
+
+
+@app.cell
+def _():
+    lgbm_scores = {
+        "train_score": 0.8523466410959462,
+        "test_score": 0.7514895868142193,
+    }
+    lgbm_scores
+    return
+
+
 if __name__ == "__main__":
     app.run()
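
Once `lgbm_pipeline` is fitted, a common follow-up (not part of this commit) is persisting it so the Space can serve predictions without retraining. A minimal sketch, assuming joblib is available and reusing the names from the code above; the file path is illustrative:

```py
# Sketch: persist and reload the fitted pipeline (not part of this commit).
# Assumes `lgbm_pipeline` and `X_test` exist as in the diff above.
import joblib

joblib.dump(lgbm_pipeline, "lgbm_pipeline.joblib")  # illustrative path

# Later, e.g. at app startup: reload and score applications without refitting
restored = joblib.load("lgbm_pipeline.joblib")
probabilities = restored.predict_proba(X_test)[:, 1]
```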