Commit
·
ac8c468
1
Parent(s):
9995a6a
chore: Train Logistic Regression model
Browse files
app.py
CHANGED
|
@@ -22,6 +22,9 @@ def _():
|
|
| 22 |
import pandas as pd
|
| 23 |
import seaborn as sns
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
from src.plots import (
|
| 26 |
plot_target_distribution,
|
| 27 |
plot_credit_amounts,
|
|
@@ -32,8 +35,9 @@ def _():
|
|
| 32 |
)
|
| 33 |
from src.theme import custom_palette
|
| 34 |
from src.utils import get_dataset, get_features_target, get_train_test_sets
|
| 35 |
-
from src.preprocessing import
|
| 36 |
return (
|
|
|
|
| 37 |
get_dataset,
|
| 38 |
get_features_target,
|
| 39 |
get_train_test_sets,
|
|
@@ -44,7 +48,8 @@ def _():
|
|
| 44 |
plot_income_type,
|
| 45 |
plot_occupation,
|
| 46 |
plot_target_distribution,
|
| 47 |
-
|
|
|
|
| 48 |
)
|
| 49 |
|
| 50 |
|
|
@@ -270,7 +275,7 @@ def _(mo):
|
|
| 270 |
def _(X, get_train_test_sets, y):
|
| 271 |
X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
|
| 272 |
X_train.shape, y_train.shape, X_test.shape, y_test.shape
|
| 273 |
-
return X_test, X_train
|
| 274 |
|
| 275 |
|
| 276 |
@app.cell
|
|
@@ -297,15 +302,65 @@ def _(mo):
|
|
| 297 |
|
| 298 |
|
| 299 |
@app.cell
|
| 300 |
-
def _(X_test, X_train,
|
| 301 |
-
train_data, test_data =
|
|
|
|
|
|
|
| 302 |
train_data.shape, test_data.shape
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
return
|
| 304 |
|
| 305 |
|
| 306 |
@app.cell
|
| 307 |
def _(mo):
|
| 308 |
-
mo.md("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
return
|
| 310 |
|
| 311 |
|
|
|
|
| 22 |
import pandas as pd
|
| 23 |
import seaborn as sns
|
| 24 |
|
| 25 |
+
from sklearn.linear_model import LogisticRegression
|
| 26 |
+
from sklearn.metrics import roc_auc_score
|
| 27 |
+
|
| 28 |
from src.plots import (
|
| 29 |
plot_target_distribution,
|
| 30 |
plot_credit_amounts,
|
|
|
|
| 35 |
)
|
| 36 |
from src.theme import custom_palette
|
| 37 |
from src.utils import get_dataset, get_features_target, get_train_test_sets
|
| 38 |
+
from src.preprocessing import preprocess_data_pipeline
|
| 39 |
return (
|
| 40 |
+
LogisticRegression,
|
| 41 |
get_dataset,
|
| 42 |
get_features_target,
|
| 43 |
get_train_test_sets,
|
|
|
|
| 48 |
plot_income_type,
|
| 49 |
plot_occupation,
|
| 50 |
plot_target_distribution,
|
| 51 |
+
preprocess_data_pipeline,
|
| 52 |
+
roc_auc_score,
|
| 53 |
)
|
| 54 |
|
| 55 |
|
|
|
|
| 275 |
def _(X, get_train_test_sets, y):
|
| 276 |
X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
|
| 277 |
X_train.shape, y_train.shape, X_test.shape, y_test.shape
|
| 278 |
+
return X_test, X_train, y_test, y_train
|
| 279 |
|
| 280 |
|
| 281 |
@app.cell
|
|
|
|
| 302 |
|
| 303 |
|
| 304 |
@app.cell
|
| 305 |
+
def _(X_test, X_train, preprocess_data_pipeline):
|
| 306 |
+
train_data, test_data = preprocess_data_pipeline(
|
| 307 |
+
train_df=X_train, test_df=X_test
|
| 308 |
+
)
|
| 309 |
train_data.shape, test_data.shape
|
| 310 |
+
return test_data, train_data
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
@app.cell
|
| 314 |
+
def _(mo):
|
| 315 |
+
mo.md("""## 3. Training Models""")
|
| 316 |
return
|
| 317 |
|
| 318 |
|
| 319 |
@app.cell
|
| 320 |
def _(mo):
|
| 321 |
+
mo.md(r"""### 3.1 Logistic Regression""")
|
| 322 |
+
return
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
@app.cell
|
| 326 |
+
def _(mo):
|
| 327 |
+
mo.callout(
|
| 328 |
+
mo.md("""
|
| 329 |
+
In Logistic Regression, C is the inverse of regularization strength:
|
| 330 |
+
|
| 331 |
+
- **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
|
| 332 |
+
- **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
|
| 333 |
+
"""),
|
| 334 |
+
kind="info",
|
| 335 |
+
)
|
| 336 |
+
return
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
@app.cell
|
| 340 |
+
def _(
|
| 341 |
+
LogisticRegression,
|
| 342 |
+
roc_auc_score,
|
| 343 |
+
test_data,
|
| 344 |
+
train_data,
|
| 345 |
+
y_test,
|
| 346 |
+
y_train,
|
| 347 |
+
):
|
| 348 |
+
# Logistic Regression
|
| 349 |
+
log_reg = LogisticRegression(C=0.0001)
|
| 350 |
+
log_reg.fit(train_data, y_train)
|
| 351 |
+
|
| 352 |
+
# Train data prediction (class 1)
|
| 353 |
+
log_reg_train = log_reg.predict_proba(train_data)[:, 1]
|
| 354 |
+
|
| 355 |
+
# Test data prediction (class 1)
|
| 356 |
+
log_reg_test = log_reg.predict_proba(test_data)[:, 1]
|
| 357 |
+
|
| 358 |
+
# Get the ROC AUC Score on train and test datasets
|
| 359 |
+
log_reg_scores = {
|
| 360 |
+
"train_score": roc_auc_score(y_train, log_reg_train),
|
| 361 |
+
"test_score": roc_auc_score(y_test, log_reg_test),
|
| 362 |
+
}
|
| 363 |
+
log_reg_scores
|
| 364 |
return
|
| 365 |
|
| 366 |
|