iBrokeTheCode committed on
Commit
2a3fc10
·
1 Parent(s): dc5046f

chore: Improve notebook presentation

Browse files
Files changed (2) hide show
  1. app.py +48 -41
  2. src/preprocessing.py +2 -2
app.py CHANGED
@@ -39,8 +39,6 @@ def _():
39
  from src.utils import get_dataset, get_features_target, get_train_test_sets
40
  from src.preprocessing import preprocess_data_pipeline
41
  return (
42
- LogisticRegression,
43
- RandomForestClassifier,
44
  get_dataset,
45
  get_features_target,
46
  get_train_test_sets,
@@ -52,7 +50,6 @@ def _():
52
  plot_occupation,
53
  plot_target_distribution,
54
  preprocess_data_pipeline,
55
- roc_auc_score,
56
  )
57
 
58
 
@@ -278,7 +275,7 @@ def _(mo):
278
  def _(X, get_train_test_sets, y):
279
  X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
280
  X_train.shape, y_train.shape, X_test.shape, y_test.shape
281
- return X_test, X_train, y_test, y_train
282
 
283
 
284
  @app.cell
@@ -310,7 +307,7 @@ def _(X_test, X_train, preprocess_data_pipeline):
310
  train_df=X_train, test_df=X_test
311
  )
312
  train_data.shape, test_data.shape
313
- return test_data, train_data
314
 
315
 
316
  @app.cell
@@ -321,9 +318,7 @@ def _(mo):
321
 
322
  @app.cell
323
  def _(mo):
324
- mo.md(
325
- r"""At this points, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets."""
326
- )
327
  return
328
 
329
 
@@ -348,14 +343,12 @@ def _(mo):
348
 
349
 
350
  @app.cell
351
- def _(
352
- LogisticRegression,
353
- roc_auc_score,
354
- test_data,
355
- train_data,
356
- y_test,
357
- y_train,
358
- ):
359
  # 📌 Logistic Regression
360
  log_reg = LogisticRegression(C=0.0001)
361
  log_reg.fit(train_data, y_train)
@@ -372,6 +365,21 @@ def _(
372
  "test_score": roc_auc_score(y_test, lr_test_pred),
373
  }
374
  log_reg_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  return
376
 
377
 
@@ -382,14 +390,12 @@ def _(mo):
382
 
383
 
384
  @app.cell
385
- def _(
386
- RandomForestClassifier,
387
- roc_auc_score,
388
- test_data,
389
- train_data,
390
- y_test,
391
- y_train,
392
- ):
393
  # 📌 Random Forest Classifier
394
  rf = RandomForestClassifier(random_state=42, n_jobs=-1)
395
  rf.fit(train_data, y_train)
@@ -402,6 +408,18 @@ def _(
402
  "test_score": roc_auc_score(y_test, rf_test_pred),
403
  }
404
  rf_scores
 
 
 
 
 
 
 
 
 
 
 
 
405
  return
406
 
407
 
@@ -415,7 +433,7 @@ def _(mo):
415
  def _(mo):
416
  mo.md(
417
  r"""
418
- We use this code snippet to use `RandomizedSearchCV`:
419
 
420
  ```py
421
  param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
@@ -442,17 +460,13 @@ def _(mo):
442
  }
443
  rfo_scores
444
  ```
 
 
445
  """
446
  )
447
  return
448
 
449
 
450
- @app.cell
451
- def _(mo):
452
- mo.md(r"""📈 The obtained scores are:""")
453
- return
454
-
455
-
456
  @app.cell
457
  def _():
458
  rfo_scores = {
@@ -465,27 +479,20 @@ def _():
465
 
466
  @app.cell
467
  def _(mo):
468
- mo.md(r"""🥇The best results are:""")
469
  return
470
 
471
 
472
  @app.cell
473
- def _(RandomForestClassifier):
474
  optimized_results = {
475
  "best_params_": {"n_estimators": 100, "max_depth": 10},
476
  "best_score_": 0.7296259755147781,
477
- "best_estimator_": RandomForestClassifier(
478
- max_depth=10, n_jobs=-1, random_state=42
479
- ),
480
  }
481
  optimized_results
482
  return
483
 
484
 
485
- @app.cell
486
- def _():
487
- return
488
-
489
-
490
  if __name__ == "__main__":
491
  app.run()
 
39
  from src.utils import get_dataset, get_features_target, get_train_test_sets
40
  from src.preprocessing import preprocess_data_pipeline
41
  return (
 
 
42
  get_dataset,
43
  get_features_target,
44
  get_train_test_sets,
 
50
  plot_occupation,
51
  plot_target_distribution,
52
  preprocess_data_pipeline,
 
53
  )
54
 
55
 
 
275
  def _(X, get_train_test_sets, y):
276
  X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
277
  X_train.shape, y_train.shape, X_test.shape, y_test.shape
278
+ return X_test, X_train
279
 
280
 
281
  @app.cell
 
307
  train_df=X_train, test_df=X_test
308
  )
309
  train_data.shape, test_data.shape
310
+ return
311
 
312
 
313
  @app.cell
 
318
 
319
  @app.cell
320
  def _(mo):
321
+ mo.md(r"""At this points, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets.""")
 
 
322
  return
323
 
324
 
 
343
 
344
 
345
  @app.cell
346
+ def _(mo):
347
+ mo.md(
348
+ r"""
349
+ We trained our Logistic Regression model using the following code:
350
+
351
+ ```py
 
 
352
  # 📌 Logistic Regression
353
  log_reg = LogisticRegression(C=0.0001)
354
  log_reg.fit(train_data, y_train)
 
365
  "test_score": roc_auc_score(y_test, lr_test_pred),
366
  }
367
  log_reg_scores
368
+ ```
369
+
370
+ 📈 The ROC AUC scores obtained:
371
+ """
372
+ )
373
+ return
374
+
375
+
376
+ @app.cell
377
+ def _():
378
+ lr_scores = {
379
+ "train_score": 0.6868418961663535,
380
+ "test_score": 0.6854973003347028,
381
+ }
382
+ lr_scores
383
  return
384
 
385
 
 
390
 
391
 
392
  @app.cell
393
+ def _(mo):
394
+ mo.md(
395
+ r"""
396
+ We trained our Random Forest Classifier model using the following code:
397
+
398
+ ```py
 
 
399
  # 📌 Random Forest Classifier
400
  rf = RandomForestClassifier(random_state=42, n_jobs=-1)
401
  rf.fit(train_data, y_train)
 
408
  "test_score": roc_auc_score(y_test, rf_test_pred),
409
  }
410
  rf_scores
411
+ ```
412
+
413
+ 📈 The ROC AUC scores obtained:
414
+ """
415
+ )
416
+ return
417
+
418
+
419
+ @app.cell
420
+ def _():
421
+ rf_scores = {"train_score": 1.0, "test_score": 0.7092889612208869}
422
+ rf_scores
423
  return
424
 
425
 
 
433
  def _(mo):
434
  mo.md(
435
  r"""
436
+ We trained the Randomized Search CV using the following code:
437
 
438
  ```py
439
  param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
 
460
  }
461
  rfo_scores
462
  ```
463
+
464
+ 📈 The ROC AUC scores obtained:
465
  """
466
  )
467
  return
468
 
469
 
 
 
 
 
 
 
470
  @app.cell
471
  def _():
472
  rfo_scores = {
 
479
 
480
  @app.cell
481
  def _(mo):
482
+ mo.md(r"""🥇The best results:""")
483
  return
484
 
485
 
486
  @app.cell
487
+ def _():
488
  optimized_results = {
489
  "best_params_": {"n_estimators": 100, "max_depth": 10},
490
  "best_score_": 0.7296259755147781,
491
+ "best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
 
 
492
  }
493
  optimized_results
494
  return
495
 
496
 
 
 
 
 
 
497
  if __name__ == "__main__":
498
  app.run()
src/preprocessing.py CHANGED
@@ -143,13 +143,13 @@ def preprocess_data_pipeline(
143
  ]
144
  )
145
 
146
- #
147
  preprocessor = ColumnTransformer(
148
  transformers=[
149
  # Tuple format: ('name', transformer, list_of_columns)
150
- ("numerical", numerical_pipeline, numerical_cols),
151
  ("binary", binary_pipeline, binary_cols),
152
  ("multi", multi_pipeline, multi_cols),
 
153
  ],
154
  remainder="passthrough",
155
  )
 
143
  ]
144
  )
145
 
146
+ # Create a ColumnTransformer object with the defined pipelines and transformers
147
  preprocessor = ColumnTransformer(
148
  transformers=[
149
  # Tuple format: ('name', transformer, list_of_columns)
 
150
  ("binary", binary_pipeline, binary_cols),
151
  ("multi", multi_pipeline, multi_cols),
152
+ ("numerical", numerical_pipeline, numerical_cols),
153
  ],
154
  remainder="passthrough",
155
  )