|
|
import os |
|
|
import pandas as pd |
|
|
import logging |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score |
|
|
import joblib |
|
|
from torch.utils.tensorboard import SummaryWriter |
|
|
from tabulate import tabulate |
|
|
|
|
|
|
|
|
# Emit timestamped, level-tagged records for everything at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Where the trained artifacts and the run logs are written.
model_path = "model/logistic_model.joblib"
vectorizer_path = "model/vectorizer.joblib"
log_dir = "logs"
metrics_log_path = os.path.join(log_dir, "metrics_log.txt")

# Create the output folders up front; exist_ok makes re-runs harmless.
for _directory in ("model", log_dir):
    os.makedirs(_directory, exist_ok=True)

# TensorBoard event files share the directory used for the text metrics log.
writer = SummaryWriter(log_dir=log_dir)
|
|
|
|
|
|
|
|
# Read the CSV holding the 'name' and 'gender' columns used below.
file_path = "dataset/rwandan_names.csv"
logging.info("Loading dataset from file...")
data = pd.read_csv(file_path)
logging.info("Dataset loaded. Checking for missing values...")

# A row is only usable when both the name and its gender label are present.
required_columns = ['name', 'gender']
data = data.dropna(subset=required_columns)
logging.info("Dataset loaded with %d records after dropping NaNs.", len(data))
|
|
|
|
|
|
|
|
# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets (80/10/10 overall). The fixed seed keeps the partition reproducible.
logging.info("Splitting dataset into training, validation, and test sets...")
train_data, temp_data = train_test_split(data, random_state=42, test_size=0.2)
val_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.5)

for _split_name, _split in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Test", test_data),
):
    logging.info("%s set size: %d", _split_name, len(_split))
|
|
|
|
|
|
|
|
# Represent every name as counts of its character 2-grams and 3-grams. The
# vocabulary is fitted on the training names only.
logging.info("Initializing vectorizer and transforming training data...")
vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer='char')
X_train, y_train = vectorizer.fit_transform(train_data['name']), train_data['gender']
logging.info("Training data transformed.")

# The held-out splits are only transformed with the already-fitted vocabulary.
logging.info("Transforming validation and test data...")
X_val, y_val = vectorizer.transform(val_data['name']), val_data['gender']
X_test, y_test = vectorizer.transform(test_data['name']), test_data['gender']
logging.info("Validation and test data transformation complete.")
|
|
|
|
|
|
|
|
# Fit the classifier on the n-gram counts; fit() returns the estimator itself,
# so construction and training can be chained.
logging.info("Initializing and training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logging.info("Model training complete.")

# Collects one [dataset, accuracy, precision, recall, f1] row per evaluated
# split; printed as a table at the end of the script.
metrics_summary = []
|
|
|
|
|
|
|
|
def calculate_metrics(y_true, y_pred, dataset_type="Validation", step=0, pos_label="female"):
    """Score predictions and record the results in every reporting sink.

    Accuracy, precision, recall and F1 are computed, then written to the
    module-level TensorBoard ``writer``, appended as a row to the module-level
    ``metrics_summary`` list, appended to the text file at
    ``metrics_log_path`` and logged at INFO level.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        dataset_type: Name of the evaluated split. Used as the TensorBoard tag
            prefix and as the label in the summary row, the metrics file and
            the log message.
        step: Global step value attached to the TensorBoard scalars.
        pos_label: Label treated as the positive class for precision, recall
            and F1. Defaults to ``"female"``, the value that used to be
            hard-coded, so existing callers are unaffected.
    """
    # The three binary metrics must agree on what the positive class is.
    binary_kwargs = {"pos_label": pos_label, "average": "binary"}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **binary_kwargs)
    recall = recall_score(y_true, y_pred, **binary_kwargs)
    f1 = f1_score(y_true, y_pred, **binary_kwargs)

    # TensorBoard: one scalar per metric, grouped under the split name.
    for tag, value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-score", f1),
    ):
        writer.add_scalar(f"{dataset_type}/{tag}", value, step)

    # Row for the final tabulated summary.
    metrics_summary.append([dataset_type, accuracy, precision, recall, f1])

    # Append mode keeps the history of earlier splits and earlier runs. The
    # encoding is pinned so the file does not depend on the platform default.
    with open(metrics_log_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"\n{dataset_type} Metrics:\n")
        log_file.write(
            f"Accuracy: {accuracy:.4f}\n"
            f"Precision: {precision:.4f}\n"
            f"Recall: {recall:.4f}\n"
            f"F1-score: {f1:.4f}\n"
        )

    logging.info(
        "%s Metrics - Accuracy: %.4f, Precision: %.4f, Recall: %.4f, F1-score: %.4f",
        dataset_type,
        accuracy,
        precision,
        recall,
        f1,
    )
|
|
|
|
|
|
|
|
# Score the validation split; its TensorBoard scalars are recorded at step 1.
logging.info("Calculating metrics for validation set...")
y_val_pred = model.predict(X_val)
calculate_metrics(y_true=y_val, y_pred=y_val_pred, dataset_type="Validation", step=1)

# Score the test split; its TensorBoard scalars are recorded at step 2.
logging.info("Calculating metrics for test set...")
y_test_pred = model.predict(X_test)
calculate_metrics(y_true=y_test, y_pred=y_test_pred, dataset_type="Test", step=2)
|
|
|
|
|
|
|
|
try:
    # Persist the fitted estimator together with the vectorizer it was
    # trained with; both are needed to score new names.
    logging.info("Saving model and vectorizer to disk...")
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)
    logging.info("Model saved to %s", model_path)
    logging.info("Vectorizer saved to %s", vectorizer_path)
    logging.info("Metrics logged to %s", metrics_log_path)

    # Console recap of every split evaluated during this run.
    print("\nFinal Metrics Summary:")
    print(
        tabulate(
            metrics_summary,
            headers=["Dataset", "Accuracy", "Precision", "Recall", "F1-Score"],
            floatfmt=".4f",
        )
    )
finally:
    # Close the TensorBoard writer even when saving or printing fails, so the
    # scalars recorded earlier are not left in an unclosed event file.
    writer.close()

logging.info("Training and logging completed.")