Commit
·
2a3fc10
1
Parent(s):
dc5046f
chore: Improve notebook presentation
Browse files
- app.py +48 -41
- src/preprocessing.py +2 -2
app.py
CHANGED
|
@@ -39,8 +39,6 @@ def _():
|
|
| 39 |
from src.utils import get_dataset, get_features_target, get_train_test_sets
|
| 40 |
from src.preprocessing import preprocess_data_pipeline
|
| 41 |
return (
|
| 42 |
-
LogisticRegression,
|
| 43 |
-
RandomForestClassifier,
|
| 44 |
get_dataset,
|
| 45 |
get_features_target,
|
| 46 |
get_train_test_sets,
|
|
@@ -52,7 +50,6 @@ def _():
|
|
| 52 |
plot_occupation,
|
| 53 |
plot_target_distribution,
|
| 54 |
preprocess_data_pipeline,
|
| 55 |
-
roc_auc_score,
|
| 56 |
)
|
| 57 |
|
| 58 |
|
|
@@ -278,7 +275,7 @@ def _(mo):
|
|
| 278 |
def _(X, get_train_test_sets, y):
|
| 279 |
X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
|
| 280 |
X_train.shape, y_train.shape, X_test.shape, y_test.shape
|
| 281 |
-
return X_test, X_train
|
| 282 |
|
| 283 |
|
| 284 |
@app.cell
|
|
@@ -310,7 +307,7 @@ def _(X_test, X_train, preprocess_data_pipeline):
|
|
| 310 |
train_df=X_train, test_df=X_test
|
| 311 |
)
|
| 312 |
train_data.shape, test_data.shape
|
| 313 |
-
return
|
| 314 |
|
| 315 |
|
| 316 |
@app.cell
|
|
@@ -321,9 +318,7 @@ def _(mo):
|
|
| 321 |
|
| 322 |
@app.cell
|
| 323 |
def _(mo):
|
| 324 |
-
mo.md(
|
| 325 |
-
r"""At this points, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets."""
|
| 326 |
-
)
|
| 327 |
return
|
| 328 |
|
| 329 |
|
|
@@ -348,14 +343,12 @@ def _(mo):
|
|
| 348 |
|
| 349 |
|
| 350 |
@app.cell
|
| 351 |
-
def _(
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
y_train,
|
| 358 |
-
):
|
| 359 |
# π Logistic Regression
|
| 360 |
log_reg = LogisticRegression(C=0.0001)
|
| 361 |
log_reg.fit(train_data, y_train)
|
|
@@ -372,6 +365,21 @@ def _(
|
|
| 372 |
"test_score": roc_auc_score(y_test, lr_test_pred),
|
| 373 |
}
|
| 374 |
log_reg_scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
return
|
| 376 |
|
| 377 |
|
|
@@ -382,14 +390,12 @@ def _(mo):
|
|
| 382 |
|
| 383 |
|
| 384 |
@app.cell
|
| 385 |
-
def _(
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
y_train,
|
| 392 |
-
):
|
| 393 |
# π Random Forest Classifier
|
| 394 |
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 395 |
rf.fit(train_data, y_train)
|
|
@@ -402,6 +408,18 @@ def _(
|
|
| 402 |
"test_score": roc_auc_score(y_test, rf_test_pred),
|
| 403 |
}
|
| 404 |
rf_scores
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
return
|
| 406 |
|
| 407 |
|
|
@@ -415,7 +433,7 @@ def _(mo):
|
|
| 415 |
def _(mo):
|
| 416 |
mo.md(
|
| 417 |
r"""
|
| 418 |
-
We
|
| 419 |
|
| 420 |
```py
|
| 421 |
param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
|
|
@@ -442,17 +460,13 @@ def _(mo):
|
|
| 442 |
}
|
| 443 |
rfo_scores
|
| 444 |
```
|
|
|
|
|
|
|
| 445 |
"""
|
| 446 |
)
|
| 447 |
return
|
| 448 |
|
| 449 |
|
| 450 |
-
@app.cell
|
| 451 |
-
def _(mo):
|
| 452 |
-
mo.md(r"""π The obtained scores are:""")
|
| 453 |
-
return
|
| 454 |
-
|
| 455 |
-
|
| 456 |
@app.cell
|
| 457 |
def _():
|
| 458 |
rfo_scores = {
|
|
@@ -465,27 +479,20 @@ def _():
|
|
| 465 |
|
| 466 |
@app.cell
|
| 467 |
def _(mo):
|
| 468 |
-
mo.md(r"""🥇The best results
|
| 469 |
return
|
| 470 |
|
| 471 |
|
| 472 |
@app.cell
|
| 473 |
-
def _(
|
| 474 |
optimized_results = {
|
| 475 |
"best_params_": {"n_estimators": 100, "max_depth": 10},
|
| 476 |
"best_score_": 0.7296259755147781,
|
| 477 |
-
"best_estimator_": RandomForestClassifier(
|
| 478 |
-
max_depth=10, n_jobs=-1, random_state=42
|
| 479 |
-
),
|
| 480 |
}
|
| 481 |
optimized_results
|
| 482 |
return
|
| 483 |
|
| 484 |
|
| 485 |
-
@app.cell
|
| 486 |
-
def _():
|
| 487 |
-
return
|
| 488 |
-
|
| 489 |
-
|
| 490 |
if __name__ == "__main__":
|
| 491 |
app.run()
|
|
|
|
| 39 |
from src.utils import get_dataset, get_features_target, get_train_test_sets
|
| 40 |
from src.preprocessing import preprocess_data_pipeline
|
| 41 |
return (
|
|
|
|
|
|
|
| 42 |
get_dataset,
|
| 43 |
get_features_target,
|
| 44 |
get_train_test_sets,
|
|
|
|
| 50 |
plot_occupation,
|
| 51 |
plot_target_distribution,
|
| 52 |
preprocess_data_pipeline,
|
|
|
|
| 53 |
)
|
| 54 |
|
| 55 |
|
|
|
|
| 275 |
def _(X, get_train_test_sets, y):
|
| 276 |
X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
|
| 277 |
X_train.shape, y_train.shape, X_test.shape, y_test.shape
|
| 278 |
+
return X_test, X_train
|
| 279 |
|
| 280 |
|
| 281 |
@app.cell
|
|
|
|
| 307 |
train_df=X_train, test_df=X_test
|
| 308 |
)
|
| 309 |
train_data.shape, test_data.shape
|
| 310 |
+
return
|
| 311 |
|
| 312 |
|
| 313 |
@app.cell
|
|
|
|
| 318 |
|
| 319 |
@app.cell
|
| 320 |
def _(mo):
|
| 321 |
+
mo.md(r"""At this points, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets.""")
|
|
|
|
|
|
|
| 322 |
return
|
| 323 |
|
| 324 |
|
|
|
|
| 343 |
|
| 344 |
|
| 345 |
@app.cell
|
| 346 |
+
def _(mo):
|
| 347 |
+
mo.md(
|
| 348 |
+
r"""
|
| 349 |
+
We trained our Logistic Regression model using the following code:
|
| 350 |
+
|
| 351 |
+
```py
|
|
|
|
|
|
|
| 352 |
# π Logistic Regression
|
| 353 |
log_reg = LogisticRegression(C=0.0001)
|
| 354 |
log_reg.fit(train_data, y_train)
|
|
|
|
| 365 |
"test_score": roc_auc_score(y_test, lr_test_pred),
|
| 366 |
}
|
| 367 |
log_reg_scores
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
π The ROC AUC scores obtained:
|
| 371 |
+
"""
|
| 372 |
+
)
|
| 373 |
+
return
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
@app.cell
|
| 377 |
+
def _():
|
| 378 |
+
lr_scores = {
|
| 379 |
+
"train_score": 0.6868418961663535,
|
| 380 |
+
"test_score": 0.6854973003347028,
|
| 381 |
+
}
|
| 382 |
+
lr_scores
|
| 383 |
return
|
| 384 |
|
| 385 |
|
|
|
|
| 390 |
|
| 391 |
|
| 392 |
@app.cell
|
| 393 |
+
def _(mo):
|
| 394 |
+
mo.md(
|
| 395 |
+
r"""
|
| 396 |
+
We trained our Random Forest Classifier model using the following code:
|
| 397 |
+
|
| 398 |
+
```py
|
|
|
|
|
|
|
| 399 |
# π Random Forest Classifier
|
| 400 |
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 401 |
rf.fit(train_data, y_train)
|
|
|
|
| 408 |
"test_score": roc_auc_score(y_test, rf_test_pred),
|
| 409 |
}
|
| 410 |
rf_scores
|
| 411 |
+
```
|
| 412 |
+
|
| 413 |
+
π The ROC AUC scores obtained:
|
| 414 |
+
"""
|
| 415 |
+
)
|
| 416 |
+
return
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
@app.cell
|
| 420 |
+
def _():
|
| 421 |
+
rf_scores = {"train_score": 1.0, "test_score": 0.7092889612208869}
|
| 422 |
+
rf_scores
|
| 423 |
return
|
| 424 |
|
| 425 |
|
|
|
|
| 433 |
def _(mo):
|
| 434 |
mo.md(
|
| 435 |
r"""
|
| 436 |
+
We trained the Randomized Search CV using the following code:
|
| 437 |
|
| 438 |
```py
|
| 439 |
param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
|
|
|
|
| 460 |
}
|
| 461 |
rfo_scores
|
| 462 |
```
|
| 463 |
+
|
| 464 |
+
π The ROC AUC scores obtained:
|
| 465 |
"""
|
| 466 |
)
|
| 467 |
return
|
| 468 |
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
@app.cell
|
| 471 |
def _():
|
| 472 |
rfo_scores = {
|
|
|
|
| 479 |
|
| 480 |
@app.cell
|
| 481 |
def _(mo):
|
| 482 |
+
mo.md(r"""🥇The best results:""")
|
| 483 |
return
|
| 484 |
|
| 485 |
|
| 486 |
@app.cell
|
| 487 |
+
def _():
|
| 488 |
optimized_results = {
|
| 489 |
"best_params_": {"n_estimators": 100, "max_depth": 10},
|
| 490 |
"best_score_": 0.7296259755147781,
|
| 491 |
+
"best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
|
|
|
|
|
|
|
| 492 |
}
|
| 493 |
optimized_results
|
| 494 |
return
|
| 495 |
|
| 496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
if __name__ == "__main__":
|
| 498 |
app.run()
|
src/preprocessing.py
CHANGED
|
@@ -143,13 +143,13 @@ def preprocess_data_pipeline(
|
|
| 143 |
]
|
| 144 |
)
|
| 145 |
|
| 146 |
-
#
|
| 147 |
preprocessor = ColumnTransformer(
|
| 148 |
transformers=[
|
| 149 |
# Tuple format: ('name', transformer, list_of_columns)
|
| 150 |
-
("numerical", numerical_pipeline, numerical_cols),
|
| 151 |
("binary", binary_pipeline, binary_cols),
|
| 152 |
("multi", multi_pipeline, multi_cols),
|
|
|
|
| 153 |
],
|
| 154 |
remainder="passthrough",
|
| 155 |
)
|
|
|
|
| 143 |
]
|
| 144 |
)
|
| 145 |
|
| 146 |
+
# Create a ColumnTransformer object with the defined pipelines and transformers
|
| 147 |
preprocessor = ColumnTransformer(
|
| 148 |
transformers=[
|
| 149 |
# Tuple format: ('name', transformer, list_of_columns)
|
|
|
|
| 150 |
("binary", binary_pipeline, binary_cols),
|
| 151 |
("multi", multi_pipeline, multi_cols),
|
| 152 |
+
("numerical", numerical_pipeline, numerical_cols),
|
| 153 |
],
|
| 154 |
remainder="passthrough",
|
| 155 |
)
|