Spaces:

iBrokeTheCode
/

Home_Credit_Default_Risk_Prediction

Sleeping

App Files Files Community

Home_Credit_Default_Risk_Prediction / app_bk.py

iBrokeTheCode

chore: Save LightGBM model

c742ac4 4 months ago

raw

history blame contribute delete

16.9 kB

	import marimo

	# Version string recorded in this notebook file (presumably the marimo
	# release that generated it — confirm before editing by hand).
	__generated_with = "0.14.16"
	# Application object; every function decorated with @app.cell registers a cell on it.
	app = marimo.App()


	@app.cell
	def _():
	    # Expose marimo under the alias `mo`; cells that declare a `mo`
	    # parameter receive it from this cell's return value.
	    import marimo as mo
	    return (mo,)


	@app.cell
	def _(mo):
	    # Page title, wrapped in a centering container.
	    _title = mo.md("# Home Credit Default Risk Prediction")
	    mo.center(_title)
	    return


	@app.cell
	def _():
	    # Import only the names that other cells consume (see the return tuple).
	    # The scikit-learn and LightGBM classes used for training appear solely
	    # inside markdown code listings in this notebook, so they are not
	    # imported here: nothing in the file executes them.
	    import pandas as pd

	    from src.plots import (
	        plot_credit_amounts,
	        plot_education_levels,
	        plot_family_status,
	        plot_income_type,
	        plot_occupation,
	        plot_target_distribution,
	    )
	    from src.preprocessing import preprocess_data_pipeline
	    from src.utils import get_dataset, get_features_target, get_train_test_sets
	    return (
	        get_dataset,
	        get_features_target,
	        get_train_test_sets,
	        pd,
	        plot_credit_amounts,
	        plot_education_levels,
	        plot_family_status,
	        plot_income_type,
	        plot_occupation,
	        plot_target_distribution,
	        preprocess_data_pipeline,
	    )


	@app.cell
	def _(get_dataset, get_features_target):
	    # Load the dataset once, then derive the feature/target pair from it.
	    df = get_dataset()
	    _features_and_target = get_features_target(df)
	    X, y = _features_and_target
	    return X, df, y


	@app.cell
	def _(mo):
	    # Section 1 heading.
	    mo.md("## 1. Exploratory Data Analysis")
	    return


	@app.cell
	def _(mo):
	    # Informational banner pointing to the Jupyter notebook version.
	    _note = mo.md(
	        """💡 Want a step-by-step walkthrough instead?
	        Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
	    )
	    mo.callout(value=_note, kind="info")
	    return


	@app.cell
	def _(mo):
	    # Subsection 1.1 heading.
	    mo.md("### 1.1 Dataset Information")
	    return


	@app.cell
	def _(mo):
	    # Label for item (a).
	    mo.md("a. Shape of the train and test datasets")
	    return


	@app.cell
	def _(X_test, X_train, df):
	    # Row count of each split plus the column count of the full frame,
	    # displayed together as a tuple of labelled strings.
	    _train_rows, _test_rows = X_train.shape[0], X_test.shape[0]
	    _n_columns = df.shape[1]
	    (
	        f"Train dataset samples: {_train_rows}",
	        f"Test dataset samples: {_test_rows}",
	        f"Number of columns: {_n_columns}",
	    )
	    return


	@app.cell
	def _(mo):
	    # Label for item (b).
	    mo.md("b. Dataset features")
	    return


	@app.cell
	def _(X):
	    # Display the column index of the feature frame.
	    _feature_names = X.columns
	    _feature_names
	    return


	@app.cell
	def _(mo):
	    # Label for item (c).
	    mo.md("c. Sample from dataset")
	    return


	@app.cell
	def _(X):
	    # Transpose the first five rows so that each feature becomes a row.
	    _preview = X.head(5).T
	    # String column labels fix the integer-name warning.
	    _preview.columns = list(map(str, _preview.columns))
	    # String cell values avoid numeric conversion issues in the viewer.
	    _preview = _preview.astype(str)
	    _preview
	    return


	@app.cell
	def _(mo):
	    # Label for item (d).
	    mo.md("d. Target variable Distribution")
	    return


	@app.cell
	def _(df, plot_target_distribution):
	    # The helper returns a (table, plot) pair: show the table here and
	    # hand the plot to the next cell.
	    _table, target_plot = plot_target_distribution(df=df)
	    _table
	    return (target_plot,)


	@app.cell
	def _(target_plot):
	    # Display the plot object returned by plot_target_distribution.
	    target_plot
	    return


	@app.cell
	def _(mo):
	    # Label for item (e).
	    mo.md("e. Number of columns of each data type")
	    return


	@app.cell
	def _(X):
	    # Count columns per dtype, most frequent dtype first.
	    _dtype_counts = X.dtypes.value_counts()
	    _dtype_counts.sort_values(ascending=False)
	    return


	@app.cell
	def _(X):
	    # Number of distinct values per object-typed column, largest first.
	    _object_columns = X.select_dtypes(include=["object"])
	    _cardinality = _object_columns.nunique().sort_values(ascending=False)
	    _cardinality
	    return


	@app.cell
	def _(mo):
	    # Label for item (f).
	    mo.md("f. Missing data")
	    return


	@app.cell
	def _(X, pd):
	    # Missing values per column (descending), with the share of rows
	    # affected expressed as a percentage rounded to two decimals.
	    _n_missing = X.isna().sum().sort_values(ascending=False)
	    _pct_missing = (_n_missing / X.shape[0] * 100).round(2)

	    _summary = pd.DataFrame(data={"Count": _n_missing, "percentage": _pct_missing})
	    _summary
	    return


	@app.cell
	def _(mo):
	    # Subsection 1.2 heading.
	    mo.md("### 1.2 Distribution of Variables")
	    return


	@app.cell
	def _(mo):
	    # Pointer to the plotting helpers' source file.
	    mo.md(
	        "Want to see how these plots were created? "
	        "You can find the source code for the visualizations in "
	        "[plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."
	    )
	    return


	@app.cell
	def _(mo):
	    # Label for plot (a).
	    mo.md("a. Credit Amounts")
	    return


	@app.cell
	def _(X, plot_credit_amounts):
	    # Show whatever the credit-amount plotting helper returns.
	    _figure = plot_credit_amounts(df=X)
	    _figure
	    return


	@app.cell
	def _(mo):
	    # Label for plot (b).
	    mo.md("b. Education Level of Credit Applicants")
	    return


	@app.cell
	def _(X, plot_education_levels):
	    # The helper returns a (table, plot) pair: show the table here and
	    # hand the plot to the next cell.
	    _table, education_plot = plot_education_levels(df=X)
	    _table
	    return (education_plot,)


	@app.cell
	def _(education_plot):
	    # Display the plot object returned by plot_education_levels.
	    education_plot
	    return


	@app.cell
	def _(mo):
	    # Label for plot (c); spelling of "Occupation" corrected.
	    mo.md("""c. Occupation of Credit Applicants""")
	    return


	@app.cell
	def _(X, plot_occupation):
	    # The helper returns a (table, plot) pair: show the table here and
	    # hand the plot to the next cell.
	    _table, occupation_plot = plot_occupation(df=X)
	    _table
	    return (occupation_plot,)


	@app.cell
	def _(occupation_plot):
	    # Display the plot object returned by plot_occupation.
	    occupation_plot
	    return


	@app.cell
	def _(mo):
	    # Label for plot (d).
	    mo.md("d. Family Status of Applicants")
	    return


	@app.cell
	def _(X, plot_family_status):
	    # The helper returns a (table, plot) pair: show the table here and
	    # hand the plot to the next cell.
	    _table, family_status_plot = plot_family_status(df=X)
	    _table
	    return (family_status_plot,)


	@app.cell
	def _(family_status_plot):
	    # Display the plot object returned by plot_family_status.
	    family_status_plot
	    return


	@app.cell
	def _(mo):
	    # Label for plot (e).
	    mo.md("e. Income Type of Applicants by Target Variable")
	    return


	@app.cell
	def _(df, plot_income_type):
	    # Uses the full frame `df` rather than `X`, unlike the other plot cells.
	    _figure = plot_income_type(df=df)
	    _figure
	    return


	@app.cell
	def _(mo):
	    # Section 2 heading.
	    mo.md("## 2. Preprocessing")
	    return


	@app.cell
	def _(mo):
	    # Label for step (a).
	    mo.md("a. Separate Train and Test Datasets")
	    return


	@app.cell
	def _(X, get_train_test_sets, y):
	    # Split into train/test parts. Only the feature frames are exported;
	    # the target splits stay local to this cell.
	    X_train, _y_train, X_test, _y_test = get_train_test_sets(X, y)
	    # Display the shape of each part, in the order the helper returned them.
	    tuple(part.shape for part in (X_train, _y_train, X_test, _y_test))
	    return X_test, X_train


	@app.cell
	def _(mo):
	    # Label for step (b).
	    mo.md("b. Preprocess Data")
	    return


	@app.cell
	def _(mo):
	    # Summary of the preprocessing steps. The two encoding rules are nested
	    # under the "Encode" bullet so they render as sub-items of it.
	    mo.md(
	        r"""
	    This preprocessing performs the following steps:

	    - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
	    - Encode string categorical features (`dtype object`).
	        - If the feature has 2 categories, Binary Encoding is applied.
	        - One Hot Encoding for more than 2 categories.
	    - Impute values for all columns with missing data (using median as imputing value).
	    - Feature scaling with Min-Max scaler

	    Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
	    """
	    )
	    return


	@app.cell
	def _(X_test, X_train, preprocess_data_pipeline):
	    # Run the preprocessing pipeline on both splits and display the
	    # resulting shapes; the processed data is not exported to other cells.
	    _processed = preprocess_data_pipeline(train_df=X_train, test_df=X_test)
	    _train_data, _test_data = _processed
	    _train_data.shape, _test_data.shape
	    return


	@app.cell
	def _(mo):
	    # Section 3 heading.
	    mo.md("## 3. Training Models")
	    return


	@app.cell
	def _(mo):
	    # Names the variables used by the training listings that follow
	    # (wording corrected: "At this point", "feature sets").
	    mo.md(
	        r"""At this point, we will work with `train_data` and `test_data` as feature sets, and `y_train` and `y_test` as target sets."""
	    )
	    return


	@app.cell
	def _(mo):
	    # Subsection 3.1 heading.
	    mo.md("### 3.1 Logistic Regression")
	    return


	@app.cell
	def _(mo):
	    # Info box explaining the role of the C hyperparameter.
	    _explanation = mo.md("""
	    In Logistic Regression, C is the inverse of regularization strength:

	    - Small C → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
	    - Large C → Weaker regularization → Model fits training data more closely, but may overfit.
	    """)
	    mo.callout(value=_explanation, kind="info")
	    return


	@app.cell
	def _(mo):
	    # Static listing of the Logistic Regression training code. It is
	    # displayed as markdown only; nothing in it is executed by this notebook.
	    # The "prediction" typo in the listing's comment is corrected.
	    mo.md(
	        r"""
	    We trained our Logistic Regression model using the following code:

	    ```py
	    # 📌 Logistic Regression
	    log_reg = LogisticRegression(C=0.0001)
	    log_reg.fit(train_data, y_train)

	    # Train data prediction (class 1)
	    lr_train_pred = log_reg.predict_proba(train_data)[:, 1]

	    # Test data prediction (class 1)
	    lr_test_pred = log_reg.predict_proba(test_data)[:, 1]

	    # Get the ROC AUC Score on train and test datasets
	    log_reg_scores = {
	        "train_score": roc_auc_score(y_train, lr_train_pred),
	        "test_score": roc_auc_score(y_test, lr_test_pred),
	    }
	    log_reg_scores
	    ```

	    📈 The ROC AUC scores obtained:
	    """
	    )
	    return


	@app.cell
	def _():
	    # Hard-coded ROC AUC scores displayed for Logistic Regression.
	    lr_scores = dict(
	        train_score=0.6868418961663535,
	        test_score=0.6854973003347028,
	    )
	    lr_scores
	    return


	@app.cell
	def _(mo):
	    # Subsection 3.2 heading.
	    mo.md("### 3.2 Random Forest Classifier")
	    return


	@app.cell
	def _(mo):
	    # Static listing of the Random Forest training code. It is displayed
	    # as markdown only; nothing in it is executed by this notebook.
	    mo.md(
	        r"""
	    We trained our Random Forest Classifier model using the following code:

	    ```py
	    # 📌 Random Forest Classifier
	    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
	    rf.fit(train_data, y_train)

	    rf_train_pred = rf.predict_proba(train_data)[:, 1]
	    rf_test_pred = rf.predict_proba(test_data)[:, 1]

	    rf_scores = {
	        "train_score": roc_auc_score(y_train, rf_train_pred),
	        "test_score": roc_auc_score(y_test, rf_test_pred),
	    }
	    rf_scores
	    ```

	    📈 The ROC AUC scores obtained:
	    """
	    )
	    return


	@app.cell
	def _():
	    # Hard-coded ROC AUC scores displayed for the baseline Random Forest.
	    rf_scores = dict(
	        train_score=1.0,
	        test_score=0.7066811557903828,
	    )
	    rf_scores
	    return


	@app.cell
	def _(mo):
	    # Subsection 3.3 heading. Numbering now matches 3.1/3.2/3.4 (no
	    # trailing dot) and "Cross Validation" is singular.
	    mo.md(r"""### 3.3 Randomized Search with Cross Validation""")
	    return


	@app.cell
	def _(mo):
	    # Static listing of the RandomizedSearchCV tuning code. It is displayed
	    # as markdown only; nothing in it is executed by this notebook.
	    mo.md(
	        r"""
	    We trained the Randomized Search CV using the following code:

	    ```py
	    # 📌 RandomizedSearchCV
	    param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}

	    rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
	    rscv = RandomizedSearchCV(
	        estimator=rf_optimized,
	        param_distributions=param_dist,
	        n_iter=5,
	        scoring="roc_auc",
	        cv=3,
	        random_state=42,
	        n_jobs=-1,
	    )

	    rscv.fit(train_data, y_train)

	    rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
	    rfo_test_pred = rscv.predict_proba(test_data)[:, 1]

	    rfo_scores = {
	        "train_score": roc_auc_score(y_train, rfo_train_pred),
	        "test_score": roc_auc_score(y_test, rfo_test_pred),
	    }
	    rfo_scores
	    ```

	    📈 The ROC AUC scores obtained:
	    """
	    )
	    return


	@app.cell
	def _():
	    # Hard-coded ROC AUC scores displayed for the tuned Random Forest.
	    rfo_scores = dict(
	        train_score=0.8196620915431655,
	        test_score=0.7308385425476998,
	    )
	    rfo_scores
	    return


	@app.cell
	def _(mo):
	    # Lead-in for the best search result shown in the next cell.
	    mo.md("🥇The best results:")
	    return


	@app.cell
	def _():
	    # Hard-coded summary of the randomized search outcome; the estimator
	    # is stored as its textual representation, not as a live object.
	    _best_params = dict(n_estimators=100, max_depth=10)
	    optimized_results = dict(
	        best_params_=_best_params,
	        best_score_=0.7296259755147781,
	        best_estimator_="RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
	    )
	    optimized_results
	    return


	@app.cell
	def _(mo):
	    # Subsection 3.4 heading.
	    mo.md("### 3.4 LightGBM")
	    return


	@app.cell
	def _(mo):
	    # Static listing of the LightGBM pipeline code. It is displayed as
	    # markdown only; nothing in it is executed by this notebook.
	    # The "multiple" typo in the listing's comment is corrected.
	    mo.md(
	        r"""
	    We trained our LightGBM Classifier model using the following code:

	    ```py
	    # 📌 LightGBM
	    import warnings

	    warnings.filterwarnings(
	        "ignore", message="X does not have valid feature names"
	    )

	    # 📌 Get numerical and categorical variables (binary and multiple)
	    num_cols = X_train.select_dtypes(include="number").columns.to_list()
	    cat_cols = X_train.select_dtypes(include="object").columns.to_list()

	    binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
	    multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]

	    # 📌 [1] Create the pipelines for different data types
	    numerical_pipeline = Pipeline(
	        steps=[
	            ("imputer", SimpleImputer(strategy="median")),
	            ("scaler", MinMaxScaler()),
	        ]
	    )

	    binary_pipeline = Pipeline(
	        steps=[
	            ("imputer", SimpleImputer(strategy="most_frequent")),
	            ("ordinal", OrdinalEncoder()),
	            ("scaler", MinMaxScaler()),
	        ]
	    )

	    multi_pipeline = Pipeline(
	        steps=[
	            ("imputer", SimpleImputer(strategy="most_frequent")),
	            (
	                "onehot",
	                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
	            ),
	            ("scaler", MinMaxScaler()),
	        ]
	    )

	    # 📌 [2] Create the preprocessor using ColumnTransformer
	    preprocessor = ColumnTransformer(
	        transformers=[
	            ("binary", binary_pipeline, binary_cols),
	            ("multi", multi_pipeline, multi_cols),
	            ("numerical", numerical_pipeline, num_cols),
	        ],
	        remainder="passthrough",
	    )

	    # 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
	    lgbm = LGBMClassifier(
	        n_estimators=500,
	        learning_rate=0.05,
	        max_depth=-1,
	        random_state=42,
	        class_weight="balanced",
	        n_jobs=-1,
	    )

	    lgbm_pipeline = Pipeline(
	        steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
	    )

	    # 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
	    # The pipeline takes care of all the preprocessing internally.
	    lgbm_pipeline.fit(X_train, y_train)

	    lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
	    lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]

	    lgbm_scores = {
	        "train_score": roc_auc_score(y_train, lgbm_train_pred),
	        "test_score": roc_auc_score(y_test, lgbm_test_pred),
	    }
	    lgbm_scores
	    ```

	    📈 The ROC AUC scores obtained:
	    """
	    )
	    return


	@app.cell
	def _():
	    # Hard-coded ROC AUC scores displayed for LightGBM.
	    lgbm_scores = dict(
	        train_score=0.8523466410959462,
	        test_score=0.7514895868142193,
	    )
	    lgbm_scores
	    return


	@app.cell
	def _(mo):
	    # Section 4 heading.
	    mo.md("## 4. Model Performance Analysis")
	    return


	@app.cell
	def _(mo):
	    # One stat card per model, built from (label, value, caption, direction)
	    # records in display order, then laid out as a 2 x 2 grid.
	    _cards = [
	        mo.stat(
	            label=label,
	            bordered=True,
	            value=value,
	            caption=caption,
	            direction=direction,
	        )
	        for label, value, caption, direction in (
	            (
	                "Logistic Regression",
	                "🏋️ 0.687 🔎 0.685",
	                "Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
	                "decrease",
	            ),
	            (
	                "Random Forest Classifier",
	                "🏋️ 1.0 🔎 0.707",
	                "Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
	                "decrease",
	            ),
	            (
	                "Random Forest with Randomized Search",
	                "🏋️ 0.820 🔎 0.731",
	                "Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and a strong performance.",
	                "increase",
	            ),
	            (
	                "LightGBM",
	                "🏋️ 0.852 🔎 0.751",
	                "Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
	                "increase",
	            ),
	        )
	    ]

	    # First two cards on the top row, last two on the bottom row.
	    _top_row = mo.hstack(items=_cards[:2], widths="equal", gap=1)
	    _bottom_row = mo.hstack(items=_cards[2:], widths="equal", gap=1)

	    mo.vstack(
	        items=[_top_row, _bottom_row],
	        gap=1,
	        heights="equal",
	        align="center",
	        justify="center",
	    )
	    return


	@app.cell
	def _(mo):
	    # Section 5 heading.
	    mo.md("## 5. Model Selection")
	    return


	@app.cell
	def _(mo):
	    # Comparison table and conclusions. The column separators are plain
	    # pipes: inside this raw string a backslash-escaped pipe reaches the
	    # markdown renderer as a literal character, which prevents the rows
	    # from being recognised as a table.
	    mo.md(
	        r"""
	    Based on a comparison of all the models, the final model selection is clear.

	    | Model | Train Score (AUC ROC) | Test Score (AUC ROC) |
	    | :--- | :---: | :---: |
	    | Logistic Regression | 0.687 | 0.685 |
	    | Random Forest Classifier | 1.000 | 0.707 |
	    | Randomized Search (Tuned RF) | 0.820 | 0.731 |
	    | LightGBM | 0.852 | 0.751 |

	    * The Logistic Regression model performed poorly due to underfitting.
	    * The base Random Forest model, while better, suffered from severe overfitting.
	    * The tuned Random Forest model was a significant improvement and a strong contender, achieving a solid `test_score`.
	    * However, the LightGBM model ultimately demonstrated the best performance, achieving the highest ROC AUC test score of 0.751. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data.
	    """
	    )
	    return


	@app.cell
	def _(mo):
	    # Closing banner announcing the selected model.
	    _verdict = "🥇 Therefore, we will select the LightGBM model as our final choice for deployment."
	    mo.callout(value=_verdict, kind="success")
	    return


	# Run the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    app.run()