Commit
·
dc5046f
1
Parent(s):
ac8c468
chore: Use RandomizedSearchCV to get the best hyper parameters
Browse files
app.py
CHANGED
|
@@ -24,6 +24,8 @@ def _():
|
|
| 24 |
|
| 25 |
from sklearn.linear_model import LogisticRegression
|
| 26 |
from sklearn.metrics import roc_auc_score
|
|
|
|
|
|
|
| 27 |
|
| 28 |
from src.plots import (
|
| 29 |
plot_target_distribution,
|
|
@@ -38,6 +40,7 @@ def _():
|
|
| 38 |
from src.preprocessing import preprocess_data_pipeline
|
| 39 |
return (
|
| 40 |
LogisticRegression,
|
|
|
|
| 41 |
get_dataset,
|
| 42 |
get_features_target,
|
| 43 |
get_train_test_sets,
|
|
@@ -316,6 +319,14 @@ def _(mo):
|
|
| 316 |
return
|
| 317 |
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
@app.cell
|
| 320 |
def _(mo):
|
| 321 |
mo.md(r"""### 3.1 Logistic Regression""")
|
|
@@ -350,19 +361,131 @@ def _(
|
|
| 350 |
log_reg.fit(train_data, y_train)
|
| 351 |
|
| 352 |
# Train data prediction (class 1)
|
| 353 |
-
|
| 354 |
|
| 355 |
# Test data prediction (class 1)
|
| 356 |
-
|
| 357 |
|
| 358 |
# Get the ROC AUC Score on train and test datasets
|
| 359 |
log_reg_scores = {
|
| 360 |
-
"train_score": roc_auc_score(y_train,
|
| 361 |
-
"test_score": roc_auc_score(y_test,
|
| 362 |
}
|
| 363 |
log_reg_scores
|
| 364 |
return
|
| 365 |
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
if __name__ == "__main__":
|
| 368 |
app.run()
|
|
|
|
| 24 |
|
| 25 |
from sklearn.linear_model import LogisticRegression
|
| 26 |
from sklearn.metrics import roc_auc_score
|
| 27 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 28 |
+
from sklearn.model_selection import RandomizedSearchCV
|
| 29 |
|
| 30 |
from src.plots import (
|
| 31 |
plot_target_distribution,
|
|
|
|
| 40 |
from src.preprocessing import preprocess_data_pipeline
|
| 41 |
return (
|
| 42 |
LogisticRegression,
|
| 43 |
+
RandomForestClassifier,
|
| 44 |
get_dataset,
|
| 45 |
get_features_target,
|
| 46 |
get_train_test_sets,
|
|
|
|
| 319 |
return
|
| 320 |
|
| 321 |
|
| 322 |
+
@app.cell
|
| 323 |
+
def _(mo):
|
| 324 |
+
mo.md(
|
| 325 |
+
r"""At this point, we will work with `train_data` and `test_data` as the feature sets, and `y_train` and `y_test` as the target sets."""
|
| 326 |
+
)
|
| 327 |
+
return
|
| 328 |
+
|
| 329 |
+
|
| 330 |
@app.cell
|
| 331 |
def _(mo):
|
| 332 |
mo.md(r"""### 3.1 Logistic Regression""")
|
|
|
|
| 361 |
log_reg.fit(train_data, y_train)
|
| 362 |
|
| 363 |
# Train data prediction (class 1)
|
| 364 |
+
lr_train_pred = log_reg.predict_proba(train_data)[:, 1]
|
| 365 |
|
| 366 |
# Test data prediction (class 1)
|
| 367 |
+
lr_test_pred = log_reg.predict_proba(test_data)[:, 1]
|
| 368 |
|
| 369 |
# Get the ROC AUC Score on train and test datasets
|
| 370 |
log_reg_scores = {
|
| 371 |
+
"train_score": roc_auc_score(y_train, lr_train_pred),
|
| 372 |
+
"test_score": roc_auc_score(y_test, lr_test_pred),
|
| 373 |
}
|
| 374 |
log_reg_scores
|
| 375 |
return
|
| 376 |
|
| 377 |
|
| 378 |
+
@app.cell
|
| 379 |
+
def _(mo):
|
| 380 |
+
mo.md(r"""### 3.2 Random Forest Classifier""")
|
| 381 |
+
return
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
@app.cell
|
| 385 |
+
def _(
|
| 386 |
+
RandomForestClassifier,
|
| 387 |
+
roc_auc_score,
|
| 388 |
+
test_data,
|
| 389 |
+
train_data,
|
| 390 |
+
y_test,
|
| 391 |
+
y_train,
|
| 392 |
+
):
|
| 393 |
+
# Random Forest Classifier
|
| 394 |
+
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 395 |
+
rf.fit(train_data, y_train)
|
| 396 |
+
|
| 397 |
+
rf_train_pred = rf.predict_proba(train_data)[:, 1]
|
| 398 |
+
rf_test_pred = rf.predict_proba(test_data)[:, 1]
|
| 399 |
+
|
| 400 |
+
rf_scores = {
|
| 401 |
+
"train_score": roc_auc_score(y_train, rf_train_pred),
|
| 402 |
+
"test_score": roc_auc_score(y_test, rf_test_pred),
|
| 403 |
+
}
|
| 404 |
+
rf_scores
|
| 405 |
+
return
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
@app.cell
|
| 409 |
+
def _(mo):
|
| 410 |
+
mo.md(r"""### 3.3 Randomized Search with Cross-Validation""")
|
| 411 |
+
return
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
@app.cell
|
| 415 |
+
def _(mo):
|
| 416 |
+
mo.md(
|
| 417 |
+
r"""
|
| 418 |
+
We use this code snippet to use `RandomizedSearchCV`:
|
| 419 |
+
|
| 420 |
+
```py
|
| 421 |
+
param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
|
| 422 |
+
|
| 423 |
+
rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 424 |
+
rscv = RandomizedSearchCV(
|
| 425 |
+
estimator=rf_optimized,
|
| 426 |
+
param_distributions=param_dist,
|
| 427 |
+
n_iter=5,
|
| 428 |
+
scoring="roc_auc",
|
| 429 |
+
cv=3,
|
| 430 |
+
random_state=42,
|
| 431 |
+
n_jobs=-1,
|
| 432 |
+
)
|
| 433 |
+
|
| 434 |
+
rscv.fit(train_data, y_train)
|
| 435 |
+
|
| 436 |
+
rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
|
| 437 |
+
rfo_test_pred = rscv.predict_proba(test_data)[:, 1]
|
| 438 |
+
|
| 439 |
+
rfo_scores = {
|
| 440 |
+
"train_score": roc_auc_score(y_train, rfo_train_pred),
|
| 441 |
+
"test_score": roc_auc_score(y_test, rfo_test_pred),
|
| 442 |
+
}
|
| 443 |
+
rfo_scores
|
| 444 |
+
```
|
| 445 |
+
"""
|
| 446 |
+
)
|
| 447 |
+
return
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
@app.cell
|
| 451 |
+
def _(mo):
|
| 452 |
+
mo.md(r"""The obtained scores are:""")
|
| 453 |
+
return
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
@app.cell
|
| 457 |
+
def _():
|
| 458 |
+
rfo_scores = {
|
| 459 |
+
"train_score": 0.820563139010308,
|
| 460 |
+
"test_score": 0.7304320776838898,
|
| 461 |
+
}
|
| 462 |
+
rfo_scores
|
| 463 |
+
return
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
@app.cell
|
| 467 |
+
def _(mo):
|
| 468 |
+
mo.md(r"""The best results are:""")
|
| 469 |
+
return
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
@app.cell
|
| 473 |
+
def _(RandomForestClassifier):
|
| 474 |
+
optimized_results = {
|
| 475 |
+
"best_params_": {"n_estimators": 100, "max_depth": 10},
|
| 476 |
+
"best_score_": 0.7296259755147781,
|
| 477 |
+
"best_estimator_": RandomForestClassifier(
|
| 478 |
+
max_depth=10, n_jobs=-1, random_state=42
|
| 479 |
+
),
|
| 480 |
+
}
|
| 481 |
+
optimized_results
|
| 482 |
+
return
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
@app.cell
|
| 486 |
+
def _():
|
| 487 |
+
return
|
| 488 |
+
|
| 489 |
+
|
| 490 |
if __name__ == "__main__":
|
| 491 |
app.run()
|