iBrokeTheCode committed
Commit ac8c468 · 1 Parent(s): 9995a6a

chore: Train Logistic Regression model

Files changed (1)
  app.py +61 -6
app.py CHANGED
@@ -22,6 +22,9 @@ def _():
     import pandas as pd
     import seaborn as sns
 
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.metrics import roc_auc_score
+
     from src.plots import (
         plot_target_distribution,
         plot_credit_amounts,
@@ -32,8 +35,9 @@ def _():
     )
     from src.theme import custom_palette
     from src.utils import get_dataset, get_features_target, get_train_test_sets
-    from src.preprocessing import preprocess_data
+    from src.preprocessing import preprocess_data_pipeline
     return (
+        LogisticRegression,
         get_dataset,
         get_features_target,
         get_train_test_sets,
@@ -44,7 +48,8 @@ def _():
         plot_income_type,
         plot_occupation,
         plot_target_distribution,
-        preprocess_data,
+        preprocess_data_pipeline,
+        roc_auc_score,
     )
 
 
@@ -270,7 +275,7 @@ def _(mo):
 def _(X, get_train_test_sets, y):
     X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
     X_train.shape, y_train.shape, X_test.shape, y_test.shape
-    return X_test, X_train
+    return X_test, X_train, y_test, y_train
 
 
 @app.cell
@@ -297,15 +302,65 @@ def _(mo):
 
 
 @app.cell
-def _(X_test, X_train, preprocess_data):
-    train_data, test_data = preprocess_data(train_df=X_train, test_df=X_test)
+def _(X_test, X_train, preprocess_data_pipeline):
+    train_data, test_data = preprocess_data_pipeline(
+        train_df=X_train, test_df=X_test
+    )
     train_data.shape, test_data.shape
+    return test_data, train_data
+
+
+@app.cell
+def _(mo):
+    mo.md("""## 3. Training Models""")
     return
 
 
 @app.cell
 def _(mo):
-    mo.md("## 3. Training Models")
+    mo.md(r"""### 3.1 Logistic Regression""")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.callout(
+        mo.md("""
+        In Logistic Regression, C is the inverse of regularization strength:
+
+        - **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
+        - **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
+        """),
+        kind="info",
+    )
+    return
+
+
+@app.cell
+def _(
+    LogisticRegression,
+    roc_auc_score,
+    test_data,
+    train_data,
+    y_test,
+    y_train,
+):
+    # 📌 Logistic Regression
+    log_reg = LogisticRegression(C=0.0001)
+    log_reg.fit(train_data, y_train)
+
+    # Train data prediction (class 1)
+    log_reg_train = log_reg.predict_proba(train_data)[:, 1]
+
+    # Test data prediction (class 1)
+    log_reg_test = log_reg.predict_proba(test_data)[:, 1]
+
+    # Get the ROC AUC Score on train and test datasets
+    log_reg_scores = {
+        "train_score": roc_auc_score(y_train, log_reg_train),
+        "test_score": roc_auc_score(y_test, log_reg_test),
+    }
+    log_reg_scores
     return
 
 
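Note: the callout added in this commit explains that C is the inverse of regularization strength in LogisticRegression. The snippet below is a minimal sketch of that behaviour, not part of the commit: it uses synthetic data from make_classification (an assumption, not the project's credit dataset) and shows that a small C shrinks coefficients and can underfit, while a large C fits the training data more closely.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Synthetic stand-in data (illustrative only, not the app's dataset)
X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

for C in (0.0001, 1.0, 10000):
    # Smaller C -> stronger L2 regularization -> smaller coefficients
    model = LogisticRegression(C=C, max_iter=1000).fit(X_tr, y_tr)
    print(
        f"C={C:<8} mean |coef|={np.abs(model.coef_).mean():.3f} "
        f"train AUC={roc_auc_score(y_tr, model.predict_proba(X_tr)[:, 1]):.3f} "
        f"test AUC={roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]):.3f}"
    )

Comparing the train and test ROC AUC values across C, as the committed cell does for C=0.0001, is one way to judge whether the chosen regularization strength is under- or overfitting.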