iBrokeTheCode committed on
Commit d234096 · Parent: c742ac4

feat: Add model prediction app

Files changed (2):
  1. app.py +353 -537
  2. tutorial_app.ipynb +60 -1
app.py CHANGED
@@ -6,637 +6,420 @@ app = marimo.App()
 
 @app.cell
 def _():
-    import marimo as mo
-    return (mo,)
-
-
-@app.cell
-def _(mo):
-    mo.center(mo.md("# Home Credit Default Risk Prediction"))
-    return
-
 
-@app.cell
-def _():
     import pandas as pd
 
-    from sklearn.ensemble import RandomForestClassifier
-    from sklearn.linear_model import LogisticRegression
-    from sklearn.metrics import roc_auc_score
-    from sklearn.model_selection import RandomizedSearchCV
-
-    from sklearn.pipeline import Pipeline
-    from sklearn.compose import ColumnTransformer
-    from sklearn.impute import SimpleImputer
-    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
-
-    from lightgbm import LGBMClassifier
-
-    from src.plots import (
-        plot_target_distribution,
-        plot_credit_amounts,
-        plot_education_levels,
-        plot_occupation,
-        plot_family_status,
-        plot_income_type,
-    )
-    from src.utils import get_dataset, get_features_target, get_train_test_sets
-    from src.preprocessing import preprocess_data_pipeline
-    return (
-        get_dataset,
-        get_features_target,
-        get_train_test_sets,
-        pd,
-        plot_credit_amounts,
-        plot_education_levels,
-        plot_family_status,
-        plot_income_type,
-        plot_occupation,
-        plot_target_distribution,
-        preprocess_data_pipeline,
-    )
-
-
-@app.cell
-def _(get_dataset, get_features_target):
-    df = get_dataset()
-    X, y = get_features_target(df)
-    return X, df, y
-
-
-@app.cell
-def _(mo):
-    mo.md("""## 1. Exploratory Data Analysis""")
-    return
-
-
-@app.cell
-def _(mo):
-    mo.callout(
-        kind="info",
-        value=mo.md(
-            """💡 **Want a step-by-step walkthrough instead?**
-            Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
-        ),
     )
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""### 1.1 Dataset Information""")
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""**a. Shape of the train and test datasets**""")
-    return
-
-
-@app.cell
-def _(X_test, X_train, df):
-    train_samples = "Train dataset samples: {}".format(X_train.shape[0])
-    test_samples = "Test dataset samples: {}".format(X_test.shape[0])
-    columns_number = "Number of columns: {}".format(df.shape[1])
-
-    train_samples, test_samples, columns_number
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""**b. Dataset features**""")
-    return
-
-
-@app.cell
-def _(X):
-    X.columns
-    return
 
 
 @app.cell
 def _(mo):
-    mo.md("""**c. Sample from dataset**""")
-    return
-
-
-@app.cell
-def _(X):
-    sample = X.head(5).T
-    sample.columns = [
-        str(col) for col in sample.columns
-    ]  # fix integer name warning
-    sample = sample.astype(str)  # avoid numeric conversion issues in viewer
-    sample
     return
 
 
 @app.cell
 def _(mo):
-    mo.md("""**d. Target Variable Distribution**""")
     return
 
 
 @app.cell
-def _(df, plot_target_distribution):
-    target_table, target_plot = plot_target_distribution(df=df)
-    target_table
-    return (target_plot,)
 
 
 @app.cell
-def _(target_plot):
-    target_plot
-    return
 
 
 @app.cell
 def _(mo):
-    mo.md("""**e. Number of columns of each data type**""")
-    return
-
-
-@app.cell
-def _(X):
-    X.dtypes.value_counts().sort_values(ascending=False)
-    return
-
-
-@app.cell
-def _(X):
-    categorical_cols = (
-        X.select_dtypes(include=["object"]).nunique().sort_values(ascending=False)
     )
-    categorical_cols
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""**f. Missing data**""")
-    return
-
 
-@app.cell
-def _(X, pd):
-    missing_count = X.isna().sum().sort_values(ascending=False)
-    missing_percentage = (missing_count / X.shape[0] * 100).round(2)
-
-    missing_data = pd.DataFrame(
-        data={"Count": missing_count, "Percentage": missing_percentage}
     )
-    missing_data
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""### 1.2 Distribution of Variables""")
-    return
 
-
-@app.cell
-def _(mo):
-    mo.md(
-        r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."""
     )
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md("""**a. Credit Amounts**""")
-    return
-
-
-@app.cell
-def _(X, plot_credit_amounts):
-    plot_credit_amounts(df=X)
-    return
 
 
-@app.cell
-def _(mo):
-    mo.md("""**b. Education Level of Credit Applicants**""")
-    return
 
 
-@app.cell
-def _(X, plot_education_levels):
-    education_table, education_plot = plot_education_levels(df=X)
-    education_table
-    return (education_plot,)
 
 
-@app.cell
-def _(education_plot):
-    education_plot
-    return
 
 
-@app.cell
-def _(mo):
-    mo.md("""**c. Occupation of Credit Applicants**""")
-    return
 
 
 @app.cell
-def _(X, plot_occupation):
-    occupation_table, occupation_plot = plot_occupation(df=X)
-    occupation_table
-    return (occupation_plot,)
 
 
 @app.cell
-def _(occupation_plot):
-    occupation_plot
     return
 
 
 @app.cell
-def _(mo):
-    mo.md("""**d. Family Status of Applicants**""")
-    return
 
 
-@app.cell
-def _(X, plot_family_status):
-    family_status_table, family_status_plot = plot_family_status(df=X)
-    family_status_table
-    return (family_status_plot,)
 
 
-@app.cell
-def _(family_status_plot):
-    family_status_plot
-    return
 
 
 @app.cell
-def _(mo):
-    mo.md("""**e. Income Type of Applicants by Target Variable**""")
-    return
 
 
-@app.cell
-def _(df, plot_income_type):
-    plot_income_type(df=df)
-    return
 
 
 @app.cell
 def _(mo):
-    mo.md("""## 2. Preprocessing""")
     return
 
 
 @app.cell
 def _(mo):
-    mo.md("""**a. Separate Train and Test Datasets**""")
     return
 
 
-@app.cell
-def _(X, get_train_test_sets, y):
-    X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
-    X_train.shape, y_train.shape, X_test.shape, y_test.shape
-    return X_test, X_train
-
-
 @app.cell
 def _(mo):
-    mo.md("""**b. Preprocess Data**""")
     return
 
 
 @app.cell
-def _(mo):
-    mo.md(
-        r"""
-    This preprocessing performs:
-
-    - Correction of outliers/anomalous values in numerical columns (the `DAYS_EMPLOYED` column).
-    - Encoding of string categorical features (`dtype object`):
-        - If the feature has 2 categories, Binary Encoding is applied.
-        - One-Hot Encoding is applied for more than 2 categories.
-    - Imputation for all columns with missing data (using the median as the imputing value).
-    - Feature scaling with the Min-Max scaler.
-
-    Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
     """
-    )
-    return
 
-
-@app.cell
-def _(X_test, X_train, preprocess_data_pipeline):
-    train_data, test_data = preprocess_data_pipeline(
-        train_df=X_train, test_df=X_test
     )
-    train_data.shape, test_data.shape
-    return
-
 
-@app.cell
-def _(mo):
-    mo.md("""## 3. Training Models""")
-    return
 
 
 @app.cell
-def _(mo):
-    mo.md(
-        r"""At this point, we will work with `train_data` and `test_data` as feature sets, and `y_train` and `y_test` as target sets."""
     )
     return
 
 
 @app.cell
 def _(mo):
-    mo.md(r"""### 3.1 Logistic Regression""")
     return
 
 
 @app.cell
 def _(mo):
     mo.callout(
-        mo.md("""
-    In Logistic Regression, C is the inverse of the regularization strength:
-
-    - **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
-    - **Large C** → Weaker regularization → Model fits the training data more closely, but may overfit.
-    """),
         kind="info",
     )
     return
 
 
 @app.cell
 def _(mo):
-    mo.md(
-        r"""
-    We trained our Logistic Regression model using the following code:
-
-    ```py
-    # 📌 Logistic Regression
-    log_reg = LogisticRegression(C=0.0001)
-    log_reg.fit(train_data, y_train)
-
-    # Train data prediction (class 1)
-    lr_train_pred = log_reg.predict_proba(train_data)[:, 1]
-
-    # Test data prediction (class 1)
-    lr_test_pred = log_reg.predict_proba(test_data)[:, 1]
-
-    # Get the ROC AUC score on the train and test datasets
-    log_reg_scores = {
-        "train_score": roc_auc_score(y_train, lr_train_pred),
-        "test_score": roc_auc_score(y_test, lr_test_pred),
-    }
-    log_reg_scores
-    ```
-
-    📈 The ROC AUC scores obtained:
-    """
-    )
-    return
-
-
-@app.cell
-def _():
-    lr_scores = {
-        "train_score": 0.6868418961663535,
-        "test_score": 0.6854973003347028,
-    }
-    lr_scores
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""### 3.2 Random Forest Classifier""")
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(
-        r"""
-    We trained our Random Forest Classifier model using the following code:
-
-    ```py
-    # 📌 Random Forest Classifier
-    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
-    rf.fit(train_data, y_train)
-
-    rf_train_pred = rf.predict_proba(train_data)[:, 1]
-    rf_test_pred = rf.predict_proba(test_data)[:, 1]
-
-    rf_scores = {
-        "train_score": roc_auc_score(y_train, rf_train_pred),
-        "test_score": roc_auc_score(y_test, rf_test_pred),
-    }
-    rf_scores
-    ```
-
-    📈 The ROC AUC scores obtained:
-    """
-    )
-    return
-
-
-@app.cell
-def _():
-    rf_scores = {"train_score": 1.0, "test_score": 0.7066811557903828}
-    rf_scores
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""### 3.3 Randomized Search with Cross-Validation""")
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(
-        r"""
-    We trained the Randomized Search CV using the following code:
-
-    ```py
-    # 📌 RandomizedSearchCV
-    param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
-
-    rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
-    rscv = RandomizedSearchCV(
-        estimator=rf_optimized,
-        param_distributions=param_dist,
-        n_iter=5,
-        scoring="roc_auc",
-        cv=3,
-        random_state=42,
-        n_jobs=-1,
-    )
-
-    rscv.fit(train_data, y_train)
-
-    rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
-    rfo_test_pred = rscv.predict_proba(test_data)[:, 1]
-
-    rfo_scores = {
-        "train_score": roc_auc_score(y_train, rfo_train_pred),
-        "test_score": roc_auc_score(y_test, rfo_test_pred),
-    }
-    rfo_scores
-    ```
-
-    📈 The ROC AUC scores obtained:
-    """
-    )
-    return
-
-
-@app.cell
-def _():
-    rfo_scores = {
-        "train_score": 0.8196620915431655,
-        "test_score": 0.7308385425476998,
-    }
-    rfo_scores
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""🥇 The best results:""")
-    return
-
-
-@app.cell
-def _():
-    optimized_results = {
-        "best_params_": {"n_estimators": 100, "max_depth": 10},
-        "best_score_": 0.7296259755147781,
-        "best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
-    }
-    optimized_results
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""### 3.4 LightGBM""")
     return
 
 
 @app.cell
 def _(mo):
-    mo.md(
-        r"""
-    We trained our LightGBM Classifier model using the following code:
-
-    ```py
-    # 📌 LightGBM
-    import warnings
-
-    warnings.filterwarnings(
-        "ignore", message="X does not have valid feature names"
-    )
-
-    # 📌 Get numerical and categorical variables (binary and multiple)
-    num_cols = X_train.select_dtypes(include="number").columns.to_list()
-    cat_cols = X_train.select_dtypes(include="object").columns.to_list()
-
-    binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
-    multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
-
-    # 📌 [1] Create the pipelines for different data types
-    numerical_pipeline = Pipeline(
-        steps=[
-            ("imputer", SimpleImputer(strategy="median")),
-            ("scaler", MinMaxScaler()),
-        ]
-    )
-
-    binary_pipeline = Pipeline(
-        steps=[
-            ("imputer", SimpleImputer(strategy="most_frequent")),
-            ("ordinal", OrdinalEncoder()),
-            ("scaler", MinMaxScaler()),
-        ]
-    )
-
-    multi_pipeline = Pipeline(
-        steps=[
-            ("imputer", SimpleImputer(strategy="most_frequent")),
-            (
-                "onehot",
-                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
-            ),
-            ("scaler", MinMaxScaler()),
-        ]
-    )
-
-    # 📌 [2] Create the preprocessor using ColumnTransformer
-    preprocessor = ColumnTransformer(
-        transformers=[
-            ("binary", binary_pipeline, binary_cols),
-            ("multi", multi_pipeline, multi_cols),
-            ("numerical", numerical_pipeline, num_cols),
-        ],
-        remainder="passthrough",
-    )
-
-    # 📌 [3] Create the final Pipeline that combines the preprocessor and the model
-    lgbm = LGBMClassifier(
-        n_estimators=500,
-        learning_rate=0.05,
-        max_depth=-1,
-        random_state=42,
-        class_weight="balanced",
-        n_jobs=-1,
-    )
-
-    lgbm_pipeline = Pipeline(
-        steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
-    )
-
-    # 📌 [4] Fit the final Pipeline on the ORIGINAL, unprocessed data.
-    # The pipeline takes care of all the preprocessing internally.
-    lgbm_pipeline.fit(X_train, y_train)
-
-    lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
-    lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
-
-    lgbm_scores = {
-        "train_score": roc_auc_score(y_train, lgbm_train_pred),
-        "test_score": roc_auc_score(y_test, lgbm_test_pred),
-    }
-    lgbm_scores
-    ```
-
-    📈 The ROC AUC scores obtained:
-    """
-    )
-    return
-
-
-@app.cell
-def _():
-    lgbm_scores = {
-        "train_score": 0.8523466410959462,
-        "test_score": 0.7514895868142193,
-    }
-    lgbm_scores
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""## 4. Model Performance Analysis""")
     return
 
@@ -645,7 +428,7 @@ def _(mo):
     lg_stat = mo.stat(
         label="Logistic Regression",
         bordered=True,
-        value="🏋️ 0.687 🔎 0.685",
         caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
         direction="decrease",
     )
@@ -653,7 +436,7 @@ def _(mo):
     rfc_stat = mo.stat(
         label="Random Forest Classifier",
         bordered=True,
-        value="🏋️ 1.0 🔎 0.707",
         caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
         direction="decrease",
     )
@@ -661,7 +444,7 @@ def _(mo):
     rfo_stat = mo.stat(
         label="Random Forest with Randomized Search",
         bordered=True,
-        value="🏋️ 0.820 🔎 0.731",
         caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and strong performance.",
         direction="increase",
     )
@@ -669,7 +452,7 @@ def _(mo):
     lgbm_stat = mo.stat(
         label="LightGBM",
         bordered=True,
-        value="🏋️ 0.852 🔎 0.751",
         caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
         direction="increase",
     )
@@ -689,23 +472,49 @@ def _(mo):
 
 @app.cell
 def _(mo):
-    mo.md(r"""## 5. Model Selection""")
     return
 
 
 @app.cell
 def _(mo):
     mo.md(
-        r"""
-    Based on a comparison of all the models, the final model selection is clear.
 
-    | Model | Train Score (ROC AUC) | Test Score (ROC AUC) |
     | :--- | :---: | :---: |
     | Logistic Regression | 0.687 | 0.685 |
     | Random Forest Classifier | 1.000 | 0.707 |
     | Randomized Search (Tuned RF) | 0.820 | 0.731 |
     | **LightGBM** | 0.852 | **0.751** |
 
     * The **Logistic Regression** model performed poorly due to underfitting.
     * The base **Random Forest** model, while better, suffered from severe overfitting.
     * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
@@ -717,9 +526,16 @@ def _(mo):
 
 @app.cell
 def _(mo):
-    mo.callout(
-        kind="success",
-        value="🥇 Therefore, we will select the LightGBM model as our final choice for deployment.",
     )
     return
 
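The hunks above delete the in-app training code, while the rewritten app (below) instead loads a pre-fitted pipeline from `./model/lgbm_model.joblib`. A minimal sketch of the persistence step that connects the two, assuming the fitted `lgbm_pipeline` from section 3.4 (the path and file name are taken from the loading cell below):

```py
import joblib

# Assumes `lgbm_pipeline` is the preprocessor + LGBMClassifier pipeline
# from section 3.4, already fit on (X_train, y_train).
joblib.dump(lgbm_pipeline, "./model/lgbm_model.joblib")

# The app later restores it with the exact inverse call:
loaded_pipeline = joblib.load("./model/lgbm_model.joblib")
```

Because `joblib.dump`/`joblib.load` preserve the whole fitted pipeline, the app needs neither the training data nor the preprocessing code at inference time.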
 
 
 @app.cell
 def _():
+    import joblib
+    import warnings
 
+    import marimo as mo
     import pandas as pd
 
+    warnings.filterwarnings(
+        "ignore", message="X does not have valid feature names"
     )
+    return joblib, mo, pd
 
 
 @app.cell
 def _(mo):
+    mo.center(mo.md("# 🏦 Home Credit Default Risk Prediction"))
     return
 
 
 @app.cell
 def _(mo):
+    mo.Html("<br><hr><br>")
     return
 
 
 @app.cell
+def _(joblib, mo):
+    # 📌 [1] Load the saved model pipeline
+    with mo.redirect_stdout():
+        loaded_pipeline = joblib.load("./model/lgbm_model.joblib")
+    return (loaded_pipeline,)
 
 
 @app.cell
+def _():
+    # 📌 [2] Define the default values for all other features
+    default_values = {
+        "SK_ID_CURR": 277659.5,
+        "CNT_CHILDREN": 0.0,
+        "AMT_INCOME_TOTAL": 147150.0,
+        "AMT_CREDIT": 512997.75,
+        "AMT_ANNUITY": 24885.0,
+        "AMT_GOODS_PRICE": 450000.0,
+        "REGION_POPULATION_RELATIVE": 0.01885,
+        "DAYS_BIRTH": -15743.5,
+        "DAYS_EMPLOYED": -1219.0,
+        "DAYS_REGISTRATION": -4492.0,
+        "DAYS_ID_PUBLISH": -3254.0,
+        "OWN_CAR_AGE": 9.0,
+        "FLAG_MOBIL": 1.0,
+        "FLAG_EMP_PHONE": 1.0,
+        "FLAG_WORK_PHONE": 0.0,
+        "FLAG_CONT_MOBILE": 1.0,
+        "FLAG_PHONE": 0.0,
+        "FLAG_EMAIL": 0.0,
+        "CNT_FAM_MEMBERS": 2.0,
+        "REGION_RATING_CLIENT": 2.0,
+        "REGION_RATING_CLIENT_W_CITY": 2.0,
+        "HOUR_APPR_PROCESS_START": 12.0,
+        "REG_REGION_NOT_LIVE_REGION": 0.0,
+        "REG_REGION_NOT_WORK_REGION": 0.0,
+        "LIVE_REGION_NOT_WORK_REGION": 0.0,
+        "REG_CITY_NOT_LIVE_CITY": 0.0,
+        "REG_CITY_NOT_WORK_CITY": 0.0,
+        "LIVE_CITY_NOT_WORK_CITY": 0.0,
+        "EXT_SOURCE_1": 0.5068839442599388,
+        "EXT_SOURCE_2": 0.5662837032261614,
+        "EXT_SOURCE_3": 0.5370699579791587,
+        "APARTMENTS_AVG": 0.0876,
+        "BASEMENTAREA_AVG": 0.0764,
+        "YEARS_BEGINEXPLUATATION_AVG": 0.9816,
+        "YEARS_BUILD_AVG": 0.7552,
+        "COMMONAREA_AVG": 0.0211,
+        "ELEVATORS_AVG": 0.0,
+        "ENTRANCES_AVG": 0.1379,
+        "FLOORSMAX_AVG": 0.1667,
+        "FLOORSMIN_AVG": 0.2083,
+        "LANDAREA_AVG": 0.0483,
+        "LIVINGAPARTMENTS_AVG": 0.0756,
+        "LIVINGAREA_AVG": 0.0746,
+        "NONLIVINGAPARTMENTS_AVG": 0.0,
+        "NONLIVINGAREA_AVG": 0.0035,
+        "APARTMENTS_MODE": 0.084,
+        "BASEMENTAREA_MODE": 0.0748,
+        "YEARS_BEGINEXPLUATATION_MODE": 0.9816,
+        "YEARS_BUILD_MODE": 0.7648,
+        "COMMONAREA_MODE": 0.0191,
+        "ELEVATORS_MODE": 0.0,
+        "ENTRANCES_MODE": 0.1379,
+        "FLOORSMAX_MODE": 0.1667,
+        "FLOORSMIN_MODE": 0.2083,
+        "LANDAREA_MODE": 0.0459,
+        "LIVINGAPARTMENTS_MODE": 0.0771,
+        "LIVINGAREA_MODE": 0.0731,
+        "NONLIVINGAPARTMENTS_MODE": 0.0,
+        "NONLIVINGAREA_MODE": 0.0011,
+        "APARTMENTS_MEDI": 0.0864,
+        "BASEMENTAREA_MEDI": 0.0761,
+        "YEARS_BEGINEXPLUATATION_MEDI": 0.9816,
+        "YEARS_BUILD_MEDI": 0.7585,
+        "COMMONAREA_MEDI": 0.0209,
+        "ELEVATORS_MEDI": 0.0,
+        "ENTRANCES_MEDI": 0.1379,
+        "FLOORSMAX_MEDI": 0.1667,
+        "FLOORSMIN_MEDI": 0.2083,
+        "LANDAREA_MEDI": 0.0488,
+        "LIVINGAPARTMENTS_MEDI": 0.0765,
+        "LIVINGAREA_MEDI": 0.0749,
+        "NONLIVINGAPARTMENTS_MEDI": 0.0,
+        "NONLIVINGAREA_MEDI": 0.003,
+        "TOTALAREA_MODE": 0.0687,
+        "OBS_30_CNT_SOCIAL_CIRCLE": 0.0,
+        "DEF_30_CNT_SOCIAL_CIRCLE": 0.0,
+        "OBS_60_CNT_SOCIAL_CIRCLE": 0.0,
+        "DEF_60_CNT_SOCIAL_CIRCLE": 0.0,
+        "DAYS_LAST_PHONE_CHANGE": -755.0,
+        "FLAG_DOCUMENT_2": 0.0,
+        "FLAG_DOCUMENT_3": 1.0,
+        "FLAG_DOCUMENT_4": 0.0,
+        "FLAG_DOCUMENT_5": 0.0,
+        "FLAG_DOCUMENT_6": 0.0,
+        "FLAG_DOCUMENT_7": 0.0,
+        "FLAG_DOCUMENT_8": 0.0,
+        "FLAG_DOCUMENT_9": 0.0,
+        "FLAG_DOCUMENT_10": 0.0,
+        "FLAG_DOCUMENT_11": 0.0,
+        "FLAG_DOCUMENT_12": 0.0,
+        "FLAG_DOCUMENT_13": 0.0,
+        "FLAG_DOCUMENT_14": 0.0,
+        "FLAG_DOCUMENT_15": 0.0,
+        "FLAG_DOCUMENT_16": 0.0,
+        "FLAG_DOCUMENT_17": 0.0,
+        "FLAG_DOCUMENT_18": 0.0,
+        "FLAG_DOCUMENT_19": 0.0,
+        "FLAG_DOCUMENT_20": 0.0,
+        "FLAG_DOCUMENT_21": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_HOUR": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_DAY": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_WEEK": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_MON": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_QRT": 0.0,
+        "AMT_REQ_CREDIT_BUREAU_YEAR": 1.0,
+        "NAME_CONTRACT_TYPE": "Cash loans",
+        "CODE_GENDER": "F",
+        "FLAG_OWN_CAR": "N",
+        "FLAG_OWN_REALTY": "Y",
+        "NAME_TYPE_SUITE": "Unaccompanied",
+        "NAME_INCOME_TYPE": "Working",
+        "NAME_EDUCATION_TYPE": "Secondary / secondary special",
+        "NAME_FAMILY_STATUS": "Married",
+        "NAME_HOUSING_TYPE": "House / apartment",
+        "OCCUPATION_TYPE": "Laborers",
+        "WEEKDAY_APPR_PROCESS_START": "TUESDAY",
+        "ORGANIZATION_TYPE": "Business Entity Type 3",
+        "FONDKAPREMONT_MODE": "reg oper account",
+        "HOUSETYPE_MODE": "block of flats",
+        "WALLSMATERIAL_MODE": "Panel",
+        "EMERGENCYSTATE_MODE": "No",
+    }
+    return (default_values,)
 
 
 @app.cell
 def _(mo):
+    # 📌 [3] Create widgets for the top 10 features
+    EXT_SOURCE_3 = mo.ui.slider(
+        start=0.00,
+        stop=0.90,
+        step=0.01,
+        value=0.5,
+        label="EXT_SOURCE_3",
     )
 
+    EXT_SOURCE_2 = mo.ui.slider(
+        start=0.00,
+        stop=0.86,
+        step=0.01,
+        value=0.5,
+        label="EXT_SOURCE_2",
     )
 
+    DAYS_BIRTH = mo.ui.slider(
+        start=-25229,
+        stop=-7673,
+        value=-15743,
+        label="DAYS_BIRTH",
     )
 
+    EXT_SOURCE_1 = mo.ui.slider(
+        start=0.01,
+        stop=0.97,
+        step=0.01,
+        value=0.5,
+        label="EXT_SOURCE_1",
+    )
 
+    AMT_ANNUITY = mo.ui.slider(
+        start=1980,
+        stop=258025,
+        step=100,
+        value=24885,
+        label="AMT_ANNUITY",
+    )
 
+    AMT_CREDIT = mo.ui.slider(
+        start=45000,
+        stop=4050000,
+        step=50000,
+        value=512997,
+        label="AMT_CREDIT",
+    )
 
+    DAYS_EMPLOYED = mo.ui.slider(
+        start=-17583,
+        stop=365243,
+        value=-1219,
+        label="DAYS_EMPLOYED",
+    )
 
+    DAYS_ID_PUBLISH = mo.ui.slider(
+        start=-7197,
+        stop=0,
+        value=-3254,
+        label="DAYS_ID_PUBLISH",
+    )
 
+    DAYS_REGISTRATION = mo.ui.slider(
+        start=-24672,
+        stop=0,
+        value=-4492,
+        label="DAYS_REGISTRATION",
+    )
 
+    SK_ID_CURR = mo.ui.slider(
+        start=100003,
+        stop=456253,
+        step=100,
+        value=277659,
+        label="SK_ID_CURR",
+    )
 
+    features_widgets = {
+        "EXT_SOURCE_3": EXT_SOURCE_3,
+        "EXT_SOURCE_2": EXT_SOURCE_2,
+        "DAYS_BIRTH": DAYS_BIRTH,
+        "EXT_SOURCE_1": EXT_SOURCE_1,
+        "AMT_ANNUITY": AMT_ANNUITY,
+        "AMT_CREDIT": AMT_CREDIT,
+        "DAYS_EMPLOYED": DAYS_EMPLOYED,
+        "DAYS_ID_PUBLISH": DAYS_ID_PUBLISH,
+        "DAYS_REGISTRATION": DAYS_REGISTRATION,
+        "SK_ID_CURR": SK_ID_CURR,
+    }
+    return (features_widgets,)
 
 
 @app.cell
+def _(features_widgets, mo):
+    # 📌 [4] Create the form with the sliders
+    sliders_form = (
+        mo.md("""
+    ### Enter Client Information
+
+    {EXT_SOURCE_3}
+    {EXT_SOURCE_2}
+    {DAYS_BIRTH}
+    {EXT_SOURCE_1}
+    {AMT_ANNUITY}
+    {AMT_CREDIT}
+    {DAYS_EMPLOYED}
+    {DAYS_ID_PUBLISH}
+    {DAYS_REGISTRATION}
+    {SK_ID_CURR}
+    """)
+        .batch(**features_widgets)  # Pass the dict unpacked
+        .form(show_clear_button=True, bordered=True)
+    )
+    return (sliders_form,)
 
 
 @app.cell
+def _(sliders_form):
+    # 📌 [5] Display the form
+    sliders_form
     return
 
 
 @app.cell
+def _(default_values, loaded_pipeline, mo, pd, sliders_form):
+    # 📌 [6] Get the prediction from the model
+    probability = None
 
+    # Process the form submission
+    if sliders_form.value is not None:
+        # Copy the default values
+        prediction_data = default_values.copy()
 
+        # Update them with the sliders' submitted values
+        prediction_data.update(sliders_form.value)
 
+        # Create a DataFrame
+        predict_df = pd.DataFrame([prediction_data])
 
+        # Predict the probability
+        probability = loaded_pipeline.predict_proba(predict_df)[:, 1][0]
+    else:
+        mo.md("Fill in the form and click **Submit** to get a prediction.")
+    return (probability,)
 
 
 @app.cell
+def _(probability):
+    # 📌 [7] Display the prediction results
+    prob_percent = 70.12
+    risk = "High Risk"
+    direction = "decrease"
 
+    if probability is not None:
+        prob_percent = round(probability * 100, 2)
 
+        # Define the risk category
+        if probability < 0.34:
+            risk = "Low Risk"
+            direction = "increase"
+        elif probability < 0.67:
+            risk = "Medium Risk"
+            direction = None
+        else:
+            risk = "High Risk"
+            direction = "decrease"
+    return direction, prob_percent, risk
 
 
 @app.cell
 def _(mo):
+    mo.Html("<br>")
     return
 
 
 @app.cell
 def _(mo):
+    mo.md("## 🔮 Credit Risk Prediction")
     return
 
 
 @app.cell
 def _(mo):
+    mo.Html("<hr><br>")
     return
 
 
 @app.cell
+def _(direction, mo, prob_percent, risk):
+    interpretation_text = f"""This means there is a {prob_percent}% chance the client will **default on their loan**.
+    The risk level is categorized as **{risk}**, which can help guide loan approval decisions.
     """
 
+    result_stat = mo.stat(
+        label="🎲 Probability of Payment Difficulties",
+        bordered=True,
+        value=f"{prob_percent}%",
+        caption=risk,
+        direction=direction,
     )
 
+    interpretation_stat = mo.stat(
+        label="💡 Interpretation",
+        bordered=True,
+        value="",
+        caption=interpretation_text,
+    )
+    return interpretation_stat, result_stat
 
 
 @app.cell
+def _(interpretation_stat, mo, result_stat):
+    mo.vstack(
+        items=[
+            mo.hstack(
+                items=[result_stat, interpretation_stat], widths="equal", gap=1
+            ),
+        ],
+        gap=1,
+        heights="equal",
     )
     return
 
 
 @app.cell
 def _(mo):
+    mo.Html("<br><hr>")
     return
 
 
 @app.cell
 def _(mo):
     mo.callout(
         kind="info",
+        value=mo.md(
+            """💡 **Want a step-by-step walkthrough instead?**
+            Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
+        ),
     )
     return
 
 
 @app.cell
 def _(mo):
+    mo.md(r"""## 🚀 Model Selection""")
     return
 
 
 @app.cell
 def _(mo):
+    mo.Html("<hr><br>")
     return
 
 
     lg_stat = mo.stat(
         label="Logistic Regression",
         bordered=True,
+        value="💪🏻 0.687 📝 0.685",
         caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
         direction="decrease",
     )
 
     rfc_stat = mo.stat(
         label="Random Forest Classifier",
         bordered=True,
+        value="💪🏻 1.0 📝 0.707",
         caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
         direction="decrease",
     )
 
     rfo_stat = mo.stat(
         label="Random Forest with Randomized Search",
         bordered=True,
+        value="💪🏻 0.820 📝 0.731",
         caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and strong performance.",
         direction="increase",
     )
 
     lgbm_stat = mo.stat(
         label="LightGBM",
         bordered=True,
+        value="💪🏻 0.852 📝 0.751",
         caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
         direction="increase",
     )
 
 
 @app.cell
 def _(mo):
+    mo.Html("<br>")
     return
 
 
 @app.cell
 def _(mo):
     mo.md(
+        r"""Based on a comparison of all the models _(using the ROC AUC metric)_, the final model selection is clear."""
+    )
+    return
+
+
+@app.cell
+def _(mo):
+    mo.Html("<br>")
+    return
 
+
+@app.cell
+def _(mo):
+    mo.center(
+        mo.md(r"""
+    | Model | 💪🏻 Train Score | 📝 Test Score |
     | :--- | :---: | :---: |
     | Logistic Regression | 0.687 | 0.685 |
     | Random Forest Classifier | 1.000 | 0.707 |
     | Randomized Search (Tuned RF) | 0.820 | 0.731 |
     | **LightGBM** | 0.852 | **0.751** |
+    """)
+    )
+    return
+
+
+@app.cell
+def _(mo):
+    mo.Html("<br>")
+    return
 
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
     * The **Logistic Regression** model performed poorly due to underfitting.
     * The base **Random Forest** model, while better, suffered from severe overfitting.
     * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
 
 
 @app.cell
 def _(mo):
+    mo.Html("<br><hr><br>")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.center(
+        mo.md(
+            "**Connect with me:** 💼 [LinkedIn](https://www.linkedin.com/in/alex-turpo/) • 🐱 [GitHub](https://github.com/iBrokeTheCode) • 🤗 [Hugging Face](https://huggingface.co/iBrokeTheCode)"
+        )
     )
     return
 
tutorial_app.ipynb CHANGED
@@ -899,7 +899,7 @@
     "- Impute values for all columns with missing data (using median as imputing value).\n",
     "- Feature scaling with Min-Max scaler\n",
     "\n",
-    "> Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](<[./src/preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py)>).\n"
    ]
   },
   {
@@ -1980,6 +1980,65 @@
     "\n",
     "default_values\n"
    ]
   }
  ],
  "metadata": {
 
     "- Impute values for all columns with missing data (using median as imputing value).\n",
     "- Feature scaling with Min-Max scaler\n",
     "\n",
+    "> Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).\n"
    ]
   },
   {

     "\n",
     "default_values\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c744b94",
+   "metadata": {},
+   "source": [
+    "**Calculate the minimum and maximum values for each feature in the dataset**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "5ddefb61",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'EXT_SOURCE_3': (np.float64(0.0005272652387098),\n",
+       "  np.float64(0.8960095494948396)),\n",
+       " 'EXT_SOURCE_2': (np.float64(5.002108762101576e-06),\n",
+       "  np.float64(0.8549996664047012)),\n",
+       " 'DAYS_BIRTH': (np.int64(-25229), np.int64(-7673)),\n",
+       " 'EXT_SOURCE_1': (np.float64(0.0145681324124455),\n",
+       "  np.float64(0.962692770561306)),\n",
+       " 'AMT_ANNUITY': (np.float64(1980.0), np.float64(258025.5)),\n",
+       " 'AMT_CREDIT': (np.float64(45000.0), np.float64(4050000.0)),\n",
+       " 'DAYS_EMPLOYED': (np.int64(-17583), np.int64(365243)),\n",
+       " 'DAYS_ID_PUBLISH': (np.int64(-7197), np.int64(0)),\n",
+       " 'DAYS_REGISTRATION': (np.float64(-24672.0), np.float64(0.0)),\n",
+       " 'SK_ID_CURR': (np.int64(100003), np.int64(456253))}"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "min_max_values = {\n",
+    "    \"EXT_SOURCE_3\": (X_train[\"EXT_SOURCE_3\"].min(), X_train[\"EXT_SOURCE_3\"].max()),\n",
+    "    \"EXT_SOURCE_2\": (X_train[\"EXT_SOURCE_2\"].min(), X_train[\"EXT_SOURCE_2\"].max()),\n",
+    "    \"DAYS_BIRTH\": (X_train[\"DAYS_BIRTH\"].min(), X_train[\"DAYS_BIRTH\"].max()),\n",
+    "    \"EXT_SOURCE_1\": (X_train[\"EXT_SOURCE_1\"].min(), X_train[\"EXT_SOURCE_1\"].max()),\n",
+    "    \"AMT_ANNUITY\": (X_train[\"AMT_ANNUITY\"].min(), X_train[\"AMT_ANNUITY\"].max()),\n",
+    "    \"AMT_CREDIT\": (X_train[\"AMT_CREDIT\"].min(), X_train[\"AMT_CREDIT\"].max()),\n",
+    "    \"DAYS_EMPLOYED\": (X_train[\"DAYS_EMPLOYED\"].min(), X_train[\"DAYS_EMPLOYED\"].max()),\n",
+    "    \"DAYS_ID_PUBLISH\": (\n",
+    "        X_train[\"DAYS_ID_PUBLISH\"].min(),\n",
+    "        X_train[\"DAYS_ID_PUBLISH\"].max(),\n",
+    "    ),\n",
+    "    \"DAYS_REGISTRATION\": (\n",
+    "        X_train[\"DAYS_REGISTRATION\"].min(),\n",
+    "        X_train[\"DAYS_REGISTRATION\"].max(),\n",
+    "    ),\n",
+    "    \"SK_ID_CURR\": (X_train[\"SK_ID_CURR\"].min(), X_train[\"SK_ID_CURR\"].max()),\n",
+    "}\n",
+    "min_max_values"
+   ]
   }
  ],
  "metadata": {