iBrokeTheCode committed on
Commit
dc5046f
·
1 Parent(s): ac8c468

chore: Use RandomizedSearchCV to get the best hyper parameters

Browse files
Files changed (1) hide show
  1. app.py +127 -4
app.py CHANGED
@@ -24,6 +24,8 @@ def _():
24
 
25
  from sklearn.linear_model import LogisticRegression
26
  from sklearn.metrics import roc_auc_score
 
 
27
 
28
  from src.plots import (
29
  plot_target_distribution,
@@ -38,6 +40,7 @@ def _():
38
  from src.preprocessing import preprocess_data_pipeline
39
  return (
40
  LogisticRegression,
 
41
  get_dataset,
42
  get_features_target,
43
  get_train_test_sets,
@@ -316,6 +319,14 @@ def _(mo):
316
  return
317
 
318
 
 
 
 
 
 
 
 
 
319
  @app.cell
320
  def _(mo):
321
  mo.md(r"""### 3.1 Logistic Regression""")
@@ -350,19 +361,131 @@ def _(
350
  log_reg.fit(train_data, y_train)
351
 
352
  # Train data prediction (class 1)
353
- log_reg_train = log_reg.predict_proba(train_data)[:, 1]
354
 
355
  # Test data prediction (class 1)
356
- log_reg_test = log_reg.predict_proba(test_data)[:, 1]
357
 
358
  # Get the ROC AUC Score on train and test datasets
359
  log_reg_scores = {
360
- "train_score": roc_auc_score(y_train, log_reg_train),
361
- "test_score": roc_auc_score(y_test, log_reg_test),
362
  }
363
  log_reg_scores
364
  return
365
 
366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  if __name__ == "__main__":
368
  app.run()
 
24
 
25
  from sklearn.linear_model import LogisticRegression
26
  from sklearn.metrics import roc_auc_score
27
+ from sklearn.ensemble import RandomForestClassifier
28
+ from sklearn.model_selection import RandomizedSearchCV
29
 
30
  from src.plots import (
31
  plot_target_distribution,
 
40
  from src.preprocessing import preprocess_data_pipeline
41
  return (
42
  LogisticRegression,
43
+ RandomForestClassifier,
44
  get_dataset,
45
  get_features_target,
46
  get_train_test_sets,
 
319
  return
320
 
321
 
322
+ @app.cell
323
+ def _(mo):
324
+ mo.md(
325
+ r"""At this point, we will work with `train_data` and `test_data` as feature sets, and with `y_train` and `y_test` as target sets."""
326
+ )
327
+ return
328
+
329
+
330
  @app.cell
331
  def _(mo):
332
  mo.md(r"""### 3.1 Logistic Regression""")
 
361
  log_reg.fit(train_data, y_train)
362
 
363
  # Train data prediction (class 1)
364
+ lr_train_pred = log_reg.predict_proba(train_data)[:, 1]
365
 
366
  # Test data prediction (class 1)
367
+ lr_test_pred = log_reg.predict_proba(test_data)[:, 1]
368
 
369
  # Get the ROC AUC Score on train and test datasets
370
  log_reg_scores = {
371
+ "train_score": roc_auc_score(y_train, lr_train_pred),
372
+ "test_score": roc_auc_score(y_test, lr_test_pred),
373
  }
374
  log_reg_scores
375
  return
376
 
377
 
378
+ @app.cell
379
+ def _(mo):
380
+ mo.md(r"""### 3.2 Random Forest Classifier""")
381
+ return
382
+
383
+
384
+ @app.cell
385
+ def _(
386
+ RandomForestClassifier,
387
+ roc_auc_score,
388
+ test_data,
389
+ train_data,
390
+ y_test,
391
+ y_train,
392
+ ):
393
+ # 📌 Random Forest Classifier
394
+ rf = RandomForestClassifier(random_state=42, n_jobs=-1)
395
+ rf.fit(train_data, y_train)
396
+
397
+ rf_train_pred = rf.predict_proba(train_data)[:, 1]
398
+ rf_test_pred = rf.predict_proba(test_data)[:, 1]
399
+
400
+ rf_scores = {
401
+ "train_score": roc_auc_score(y_train, rf_train_pred),
402
+ "test_score": roc_auc_score(y_test, rf_test_pred),
403
+ }
404
+ rf_scores
405
+ return
406
+
407
+
408
+ @app.cell
409
+ def _(mo):
410
+ mo.md(r"""### 3.3 Randomized Search with Cross-Validation""")
411
+ return
412
+
413
+
414
+ @app.cell
415
+ def _(mo):
416
+ mo.md(
417
+ r"""
418
+ We use this code snippet to use `RandomizedSearchCV`:
419
+
420
+ ```py
421
+ param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
422
+
423
+ rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
424
+ rscv = RandomizedSearchCV(
425
+ estimator=rf_optimized,
426
+ param_distributions=param_dist,
427
+ n_iter=5,
428
+ scoring="roc_auc",
429
+ cv=3,
430
+ random_state=42,
431
+ n_jobs=-1,
432
+ )
433
+
434
+ rscv.fit(train_data, y_train)
435
+
436
+ rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
437
+ rfo_test_pred = rscv.predict_proba(test_data)[:, 1]
438
+
439
+ rfo_scores = {
440
+ "train_score": roc_auc_score(y_train, rfo_train_pred),
441
+ "test_score": roc_auc_score(y_test, rfo_test_pred),
442
+ }
443
+ rfo_scores
444
+ ```
445
+ """
446
+ )
447
+ return
448
+
449
+
450
+ @app.cell
451
+ def _(mo):
452
+ mo.md(r"""📈 The obtained scores are:""")
453
+ return
454
+
455
+
456
+ @app.cell
457
+ def _():
458
+ rfo_scores = {
459
+ "train_score": 0.820563139010308,
460
+ "test_score": 0.7304320776838898,
461
+ }
462
+ rfo_scores
463
+ return
464
+
465
+
466
+ @app.cell
467
+ def _(mo):
468
+ mo.md(r"""🥇 The best results are:""")
469
+ return
470
+
471
+
472
+ @app.cell
473
+ def _(RandomForestClassifier):
474
+ optimized_results = {
475
+ "best_params_": {"n_estimators": 100, "max_depth": 10},
476
+ "best_score_": 0.7296259755147781,
477
+ "best_estimator_": RandomForestClassifier(
478
+ max_depth=10, n_jobs=-1, random_state=42
479
+ ),
480
+ }
481
+ optimized_results
482
+ return
483
+
484
+
485
+ @app.cell
486
+ def _():
487
+ return
488
+
489
+
490
  if __name__ == "__main__":
491
  app.run()