# File: model/train.py (MODIFIED)
# Enhanced version with comprehensive cross-validation implementation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix, classification_report,
precision_recall_curve, roc_curve
)
from sklearn.model_selection import (
train_test_split, cross_val_score, GridSearchCV,
StratifiedKFold, validation_curve, cross_validate
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import json
import joblib
import hashlib
import sys
import os
import time
from datetime import datetime, timedelta
from typing import Dict, Tuple, Optional, Any, List
import warnings
import re
warnings.filterwarnings('ignore')
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('/tmp/model_training.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def preprocess_text_function(texts):
"""
Standalone function for text preprocessing - pickle-safe
"""
def clean_single_text(text):
# Convert to string
text = str(text)
# Remove URLs
text = re.sub(r'http\S+|www\S+|https\S+', '', text)
# Remove email addresses
text = re.sub(r'\S+@\S+', '', text)
# Remove excessive punctuation
text = re.sub(r'[!]{2,}', '!', text)
text = re.sub(r'[?]{2,}', '?', text)
text = re.sub(r'[.]{3,}', '...', text)
# Remove non-alphabetic characters except spaces and basic punctuation
text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
# Remove excessive whitespace
text = re.sub(r'\s+', ' ', text)
return text.strip().lower()
# Process all texts
processed = []
for text in texts:
processed.append(clean_single_text(text))
return processed
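# Illustrative example (not executed at import time):
#   preprocess_text_function(["Visit http://x.co NOW!!!"]) -> ["visit now!"]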
class ProgressTracker:
"""Progress tracking with time estimation"""
def __init__(self, total_steps: int, description: str = "Training"):
self.total_steps = total_steps
self.current_step = 0
self.start_time = time.time()
self.description = description
self.step_times = []
def update(self, step_name: str = ""):
"""Update progress and print status"""
self.current_step += 1
current_time = time.time()
elapsed = current_time - self.start_time
# Calculate progress percentage
progress_pct = (self.current_step / self.total_steps) * 100
# Estimate remaining time
if self.current_step > 0:
avg_time_per_step = elapsed / self.current_step
remaining_steps = self.total_steps - self.current_step
eta_seconds = avg_time_per_step * remaining_steps
eta = timedelta(seconds=int(eta_seconds))
else:
eta = "calculating..."
# Create progress bar
bar_length = 30
filled_length = int(bar_length * self.current_step // self.total_steps)
bar = '█' * filled_length + '░' * (bar_length - filled_length)
# Print progress (this will be visible in Streamlit logs)
status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
if step_name:
status_msg += f" | {step_name}"
if eta != "calculating...":
status_msg += f" | ETA: {eta}"
print(status_msg, end='', flush=True)
# Also output JSON for Streamlit parsing (if needed)
progress_json = {
"type": "progress",
"step": self.current_step,
"total": self.total_steps,
"percentage": progress_pct,
"eta": str(eta) if eta != "calculating..." else None,
"step_name": step_name,
"elapsed": elapsed
}
print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}")
# Store the most recent step durations (last 3) for a moving-average estimate
step_duration = current_time - getattr(self, '_last_step_time', self.start_time)
self._last_step_time = current_time
if len(self.step_times) >= 3:
self.step_times.pop(0)
self.step_times.append(step_duration)
def finish(self):
"""Complete progress tracking"""
total_time = time.time() - self.start_time
print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
"""Estimate training time based on dataset characteristics"""
# Base time estimates (in seconds) based on empirical testing
base_times = {
'preprocessing': max(0.1, dataset_size * 0.001), # ~1ms per sample
'vectorization': max(0.5, dataset_size * 0.01), # ~10ms per sample
'feature_selection': max(0.2, dataset_size * 0.005), # ~5ms per sample
'simple_training': max(1.0, dataset_size * 0.02), # ~20ms per sample
'evaluation': max(0.5, dataset_size * 0.01), # ~10ms per sample
}
# Hyperparameter tuning multipliers
tuning_multipliers = {
'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
'random_forest': 12 if enable_tuning else 1, # 12 param combinations
}
# Cross-validation multiplier
cv_multiplier = cv_folds if dataset_size > 100 else 1
# Calculate estimates
estimates = {}
# Preprocessing steps
estimates['data_loading'] = 0.5
estimates['preprocessing'] = base_times['preprocessing']
estimates['vectorization'] = base_times['vectorization']
estimates['feature_selection'] = base_times['feature_selection']
# Model training (now includes CV)
for model_name, multiplier in tuning_multipliers.items():
model_time = base_times['simple_training'] * multiplier * cv_multiplier
estimates[f'{model_name}_training'] = model_time
estimates[f'{model_name}_evaluation'] = base_times['evaluation']
# Cross-validation overhead
estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
# Model saving
estimates['model_saving'] = 1.0
# Total estimate
total_estimate = sum(estimates.values())
# Add 20% buffer for overhead
total_estimate *= 1.2
return {
'detailed_estimates': estimates,
'total_seconds': total_estimate,
'total_formatted': str(timedelta(seconds=int(total_estimate))),
'dataset_size': dataset_size,
'enable_tuning': enable_tuning,
'cv_folds': cv_folds
}
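# Illustrative usage (actual values depend on dataset size and settings):
#   est = estimate_training_time(500, enable_tuning=True, cv_folds=5)
#   print(est['total_formatted'], est['detailed_estimates']['random_forest_training'])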
class CrossValidationManager:
"""Advanced cross-validation management with comprehensive metrics"""
def __init__(self, cv_folds: int = 5, random_state: int = 42):
self.cv_folds = cv_folds
self.random_state = random_state
self.cv_results = {}
def create_cv_strategy(self, X, y) -> StratifiedKFold:
"""Create appropriate CV strategy based on data characteristics"""
# Calculate appropriate CV folds for small datasets
n_samples = len(X)
min_samples_per_fold = 3 # Minimum samples per fold
max_folds = n_samples // min_samples_per_fold
# Adjust folds based on data size and class distribution
unique_classes = np.unique(y)
min_class_count = min([np.sum(y == cls) for cls in unique_classes])
# Ensure each fold has at least one sample from each class
max_folds_by_class = min_class_count
actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
return StratifiedKFold(
n_splits=actual_folds,
shuffle=True,
random_state=self.random_state
)
def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
"""Perform comprehensive cross-validation with multiple metrics"""
if cv_strategy is None:
cv_strategy = self.create_cv_strategy(X, y)
logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
# Define scoring metrics
scoring_metrics = {
'accuracy': 'accuracy',
'precision': 'precision_weighted',
'recall': 'recall_weighted',
'f1': 'f1_weighted',
'roc_auc': 'roc_auc'
}
try:
# Perform cross-validation
cv_scores = cross_validate(
pipeline, X, y,
cv=cv_strategy,
scoring=scoring_metrics,
return_train_score=True,
n_jobs=1, # Use single job for stability
verbose=0
)
# Process results
cv_results = {
'n_splits': cv_strategy.n_splits,
'test_scores': {},
'train_scores': {},
'fold_results': []
}
# Calculate statistics for each metric
for metric_name in scoring_metrics.keys():
test_key = f'test_{metric_name}'
train_key = f'train_{metric_name}'
if test_key in cv_scores:
test_scores = cv_scores[test_key]
cv_results['test_scores'][metric_name] = {
'mean': float(np.mean(test_scores)),
'std': float(np.std(test_scores)),
'min': float(np.min(test_scores)),
'max': float(np.max(test_scores)),
'scores': test_scores.tolist()
}
if train_key in cv_scores:
train_scores = cv_scores[train_key]
cv_results['train_scores'][metric_name] = {
'mean': float(np.mean(train_scores)),
'std': float(np.std(train_scores)),
'min': float(np.min(train_scores)),
'max': float(np.max(train_scores)),
'scores': train_scores.tolist()
}
# Store individual fold results
for fold_idx in range(cv_strategy.n_splits):
fold_result = {
'fold': fold_idx + 1,
'test_scores': {},
'train_scores': {}
}
for metric_name in scoring_metrics.keys():
test_key = f'test_{metric_name}'
train_key = f'train_{metric_name}'
if test_key in cv_scores:
fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
if train_key in cv_scores:
fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
cv_results['fold_results'].append(fold_result)
# Calculate overfitting indicators
if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
train_mean = cv_results['train_scores']['accuracy']['mean']
test_mean = cv_results['test_scores']['accuracy']['mean']
cv_results['overfitting_score'] = float(train_mean - test_mean)
# Calculate stability metrics
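# Stability is 1 minus the coefficient of variation (std / mean) of per-fold test accuracy;
# values near 1 indicate consistent scores across folds.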
if 'accuracy' in cv_results['test_scores']:
test_std = cv_results['test_scores']['accuracy']['std']
test_mean = cv_results['test_scores']['accuracy']['mean']
cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
logger.info(f"Cross-validation completed successfully")
logger.info(f"Mean test accuracy: {cv_results['test_scores'].get('accuracy', {}).get('mean', 'N/A'):.4f}")
logger.info(f"Mean test F1: {cv_results['test_scores'].get('f1', {}).get('mean', 'N/A'):.4f}")
return cv_results
except Exception as e:
logger.error(f"Cross-validation failed: {e}")
return {
'error': str(e),
'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
'fallback': True
}
def compare_cv_results(self, results1: Dict, results2: Dict, metric: str = 'f1') -> Dict:
"""Compare cross-validation results between two models"""
try:
if 'error' in results1 or 'error' in results2:
return {'error': 'Cannot compare results with errors'}
scores1 = results1['test_scores'][metric]['scores']
scores2 = results2['test_scores'][metric]['scores']
# Paired t-test
from scipy import stats
t_stat, p_value = stats.ttest_rel(scores1, scores2)
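# The paired t-test compares the two models' per-fold scores on matching folds; the
# effect size computed below (|t| / sqrt(n_folds)) is a rough Cohen's-d-style magnitude.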
comparison = {
'metric': metric,
'model1_mean': results1['test_scores'][metric]['mean'],
'model2_mean': results2['test_scores'][metric]['mean'],
'model1_std': results1['test_scores'][metric]['std'],
'model2_std': results2['test_scores'][metric]['std'],
'difference': results2['test_scores'][metric]['mean'] - results1['test_scores'][metric]['mean'],
'paired_ttest': {
't_statistic': float(t_stat),
'p_value': float(p_value),
'significant': p_value < 0.05
},
'effect_size': float(abs(t_stat) / np.sqrt(len(scores1))) if len(scores1) > 0 else 0
}
return comparison
except Exception as e:
logger.error(f"CV comparison failed: {e}")
return {'error': str(e)}
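# Illustrative usage of CrossValidationManager (pipeline, X, y as used elsewhere in this module):
#   cv_manager = CrossValidationManager(cv_folds=5)
#   cv_results = cv_manager.perform_cross_validation(pipeline, X, y)
#   print(cv_results['test_scores']['f1']['mean'])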
class RobustModelTrainer:
"""Production-ready model trainer with comprehensive cross-validation"""
def __init__(self):
self.setup_paths()
self.setup_training_config()
self.setup_models()
self.progress_tracker = None
self.cv_manager = CrossValidationManager()
def setup_paths(self):
"""Setup all necessary paths with proper permissions"""
self.base_dir = Path("/tmp")
self.data_dir = self.base_dir / "data"
self.model_dir = self.base_dir / "model"
self.results_dir = self.base_dir / "results"
# Create directories with proper permissions
for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
dir_path.mkdir(parents=True, exist_ok=True)
# Ensure write permissions
try:
dir_path.chmod(0o755)
except:
pass
# File paths
self.data_path = self.data_dir / "combined_dataset.csv"
self.model_path = Path("/tmp/model.pkl") # Direct path to avoid permission issues
self.vectorizer_path = Path("/tmp/vectorizer.pkl")
self.pipeline_path = Path("/tmp/pipeline.pkl")
self.metadata_path = Path("/tmp/metadata.json")
self.evaluation_path = self.results_dir / "evaluation_results.json"
def setup_training_config(self):
"""Setup training configuration with CV parameters"""
self.test_size = 0.2
self.validation_size = 0.1
self.random_state = 42
self.cv_folds = 5 # Primary CV folds
self.max_features = 5000 # Reduced for speed
self.min_df = 1 # More lenient for small datasets
self.max_df = 0.95
self.ngram_range = (1, 2) # Reduced for speed
self.max_iter = 500 # Reduced for speed
self.class_weight = 'balanced'
self.feature_selection_k = 2000 # Reduced for speed
def setup_models(self):
"""Setup model configurations for comparison"""
self.models = {
'logistic_regression': {
'model': LogisticRegression(
max_iter=self.max_iter,
class_weight=self.class_weight,
random_state=self.random_state,
n_jobs=-1 # Use all cores
),
'param_grid': {
'model__C': [0.1, 1, 10], # Reduced grid
'model__penalty': ['l2']
}
},
'random_forest': {
'model': RandomForestClassifier(
n_estimators=50, # Reduced for speed
class_weight=self.class_weight,
random_state=self.random_state,
n_jobs=-1 # Use all cores
),
'param_grid': {
'model__n_estimators': [50, 100], # Reduced grid
'model__max_depth': [10, None]
}
}
}
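# To compare an additional classifier (illustrative sketch, not enabled by default), an entry
# such as the following could be added, with GradientBoostingClassifier imported from
# sklearn.ensemble:
#   'gradient_boosting': {
#       'model': GradientBoostingClassifier(random_state=self.random_state),
#       'param_grid': {'model__n_estimators': [50, 100]}
#   }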
def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
"""Load and validate training data"""
try:
logger.info("Loading training data...")
if self.progress_tracker:
self.progress_tracker.update("Loading data")
if not self.data_path.exists():
return False, None, f"Data file not found: {self.data_path}"
# Load data
df = pd.read_csv(self.data_path)
# Basic validation
if df.empty:
return False, None, "Dataset is empty"
required_columns = ['text', 'label']
missing_columns = [
col for col in required_columns if col not in df.columns]
if missing_columns:
return False, None, f"Missing required columns: {missing_columns}"
# Remove missing values
initial_count = len(df)
df = df.dropna(subset=required_columns)
if len(df) < initial_count:
logger.warning(
f"Removed {initial_count - len(df)} rows with missing values")
# Validate text content
df = df[df['text'].astype(str).str.len() > 10]
# Validate labels
unique_labels = df['label'].unique()
if len(unique_labels) < 2:
return False, None, f"Need at least 2 classes, found: {unique_labels}"
# Check minimum sample size for CV
min_samples_for_cv = self.cv_folds * 2 # At least 2 samples per fold
if len(df) < min_samples_for_cv:
logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
# Adjust CV folds for small datasets
self.cv_manager.cv_folds = max(2, len(df) // 3)
logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
# Check class balance
label_counts = df['label'].value_counts()
min_class_ratio = label_counts.min() / label_counts.max()
if min_class_ratio < 0.1:
logger.warning(
f"Severe class imbalance detected: {min_class_ratio:.3f}")
logger.info(
f"Data validation successful: {len(df)} samples, {len(unique_labels)} classes")
logger.info(f"Class distribution: {label_counts.to_dict()}")
return True, df, "Data loaded successfully"
except Exception as e:
error_msg = f"Error loading data: {str(e)}"
logger.error(error_msg)
return False, None, error_msg
def create_preprocessing_pipeline(self) -> Pipeline:
"""Create preprocessing pipeline"""
if self.progress_tracker:
self.progress_tracker.update("Creating pipeline")
# Use the standalone function instead of lambda
text_preprocessor = FunctionTransformer(
func=preprocess_text_function,
validate=False
)
# TF-IDF vectorization with optimized parameters
vectorizer = TfidfVectorizer(
max_features=self.max_features,
min_df=self.min_df,
max_df=self.max_df,
ngram_range=self.ngram_range,
stop_words='english',
sublinear_tf=True,
norm='l2'
)
# Feature selection
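# chi2 scoring requires non-negative feature values, which TF-IDF output satisfies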
feature_selector = SelectKBest(
score_func=chi2,
k=min(self.feature_selection_k, self.max_features)
)
# Create pipeline
pipeline = Pipeline([
('preprocess', text_preprocessor),
('vectorize', vectorizer),
('feature_select', feature_selector),
('model', None) # Will be set during training
])
return pipeline
def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
"""Comprehensive model evaluation with cross-validation integration"""
if self.progress_tracker:
self.progress_tracker.update("Evaluating model")
# Predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Basic metrics
metrics = {
'accuracy': float(accuracy_score(y_test, y_pred)),
'precision': float(precision_score(y_test, y_pred, average='weighted')),
'recall': float(recall_score(y_test, y_pred, average='weighted')),
'f1': float(f1_score(y_test, y_pred, average='weighted')),
'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
}
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
metrics['confusion_matrix'] = cm.tolist()
# Cross-validation on full dataset
if X_train is not None and y_train is not None:
# Combine train and test for full dataset CV
X_full = np.concatenate([X_train, X_test])
y_full = np.concatenate([y_train, y_test])
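# Note: these folds are drawn from the combined train+test data, so the CV scores overlap
# with the held-out test metrics above and serve as a robustness check rather than a fully
# independent estimate.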
logger.info("Performing cross-validation on full dataset...")
cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full)
metrics['cross_validation'] = cv_results
# Log CV results
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
cv_f1_mean = cv_results['test_scores']['f1']['mean']
cv_f1_std = cv_results['test_scores']['f1']['std']
logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})")
# Training accuracy for overfitting detection
try:
if X_train is not None and y_train is not None:
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
metrics['train_accuracy'] = float(train_accuracy)
metrics['overfitting_score'] = float(
train_accuracy - metrics['accuracy'])
except Exception as e:
logger.warning(f"Overfitting detection failed: {e}")
return metrics
def hyperparameter_tuning_with_cv(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
"""Perform hyperparameter tuning with nested cross-validation"""
if self.progress_tracker:
self.progress_tracker.update(f"Tuning {model_name} with CV")
try:
# Set the model in the pipeline
pipeline.set_params(model=self.models[model_name]['model'])
# Skip hyperparameter tuning for very small datasets
if len(X_train) < 20:
logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
pipeline.fit(X_train, y_train)
# Still perform CV evaluation
cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train)
return pipeline, {
'best_params': 'default_parameters',
'best_score': cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'not_calculated'),
'best_estimator': pipeline,
'cross_validation': cv_results,
'note': 'Hyperparameter tuning skipped for small dataset'
}
# Get parameter grid
param_grid = self.models[model_name]['param_grid']
# Create CV strategy
cv_strategy = self.cv_manager.create_cv_strategy(X_train, y_train)
# Create GridSearchCV with nested cross-validation
grid_search = GridSearchCV(
pipeline,
param_grid,
cv=cv_strategy,
scoring='f1_weighted',
n_jobs=1, # Single job for stability
verbose=0, # Reduce verbosity for speed
return_train_score=True # For overfitting analysis
)
# Fit grid search
logger.info(f"Starting hyperparameter tuning for {model_name}...")
grid_search.fit(X_train, y_train)
# Perform additional CV on best model
logger.info(f"Performing final CV evaluation for {model_name}...")
best_cv_results = self.cv_manager.perform_cross_validation(
grid_search.best_estimator_, X_train, y_train, cv_strategy
)
# Extract results
tuning_results = {
'best_params': grid_search.best_params_,
'best_score': float(grid_search.best_score_),
'best_estimator': grid_search.best_estimator_,
'cv_folds_used': cv_strategy.n_splits,
'cross_validation': best_cv_results,
'grid_search_results': {
'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
'mean_train_scores': grid_search.cv_results_.get('mean_train_score', []).tolist() if 'mean_train_score' in grid_search.cv_results_ else [],
'params': grid_search.cv_results_['params']
}
}
logger.info(f"Hyperparameter tuning completed for {model_name}")
logger.info(f"Best CV score: {grid_search.best_score_:.4f}")
logger.info(f"Best params: {grid_search.best_params_}")
if 'test_scores' in best_cv_results and 'f1' in best_cv_results['test_scores']:
final_f1 = best_cv_results['test_scores']['f1']['mean']
final_f1_std = best_cv_results['test_scores']['f1']['std']
logger.info(f"Final CV F1: {final_f1:.4f} (Β±{final_f1_std:.4f})")
return grid_search.best_estimator_, tuning_results
except Exception as e:
logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}")
# Return basic model if tuning fails
try:
pipeline.set_params(model=self.models[model_name]['model'])
pipeline.fit(X_train, y_train)
# Perform basic CV
cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train)
return pipeline, {
'error': str(e),
'fallback': 'simple_training',
'cross_validation': cv_results
}
except Exception as e2:
logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
"""Train and evaluate multiple models with comprehensive CV"""
results = {}
for model_name in self.models.keys():
logger.info(f"Training {model_name} with cross-validation...")
try:
# Create pipeline
pipeline = self.create_preprocessing_pipeline()
# Hyperparameter tuning with CV
best_model, tuning_results = self.hyperparameter_tuning_with_cv(
pipeline, X_train, y_train, model_name
)
# Comprehensive evaluation (includes additional CV)
evaluation_metrics = self.comprehensive_evaluation(
best_model, X_test, y_test, X_train, y_train
)
# Store results
results[model_name] = {
'model': best_model,
'tuning_results': tuning_results,
'evaluation_metrics': evaluation_metrics,
'training_time': datetime.now().isoformat()
}
# Log results
test_f1 = evaluation_metrics['f1']
cv_results = evaluation_metrics.get('cross_validation', {})
cv_f1_mean = cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'N/A')
cv_f1_std = cv_results.get('test_scores', {}).get('f1', {}).get('std', 'N/A')
logger.info(f"Model {model_name} - Test F1: {test_f1:.4f}, "
f"CV F1: {cv_f1_mean:.4f if cv_f1_mean != 'N/A' else cv_f1_mean} "
f"(Β±{cv_f1_std:.4f if cv_f1_std != 'N/A' else cv_f1_std})")
except Exception as e:
logger.error(f"Training failed for {model_name}: {str(e)}")
results[model_name] = {'error': str(e)}
return results
def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
"""Select the best performing model based on CV results"""
if self.progress_tracker:
self.progress_tracker.update("Selecting best model")
best_model_name = None
best_model = None
best_score = -1
best_metrics = None
for model_name, result in results.items():
if 'error' in result:
continue
# Prioritize CV F1 score if available, fallback to test F1
cv_results = result['evaluation_metrics'].get('cross_validation', {})
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
f1_score = cv_results['test_scores']['f1']['mean']
score_type = "CV F1"
else:
f1_score = result['evaluation_metrics']['f1']
score_type = "Test F1"
if f1_score > best_score:
best_score = f1_score
best_score_type = score_type
best_model_name = model_name
best_model = result['model']
best_metrics = result['evaluation_metrics']
if best_model_name is None:
raise ValueError("No models trained successfully")
logger.info(f"Best model: {best_model_name} with {score_type} score: {best_score:.4f}")
return best_model_name, best_model, best_metrics
def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
"""Save model artifacts and enhanced metadata with CV results"""
try:
if self.progress_tracker:
self.progress_tracker.update("Saving model")
# Save the full pipeline with error handling
try:
joblib.dump(model, self.pipeline_path)
logger.info(f"β
Saved pipeline to {self.pipeline_path}")
except Exception as e:
logger.error(f"Failed to save pipeline: {e}")
# Try alternative path
alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
joblib.dump(model, alt_pipeline_path)
logger.info(f"β
Saved pipeline to {alt_pipeline_path}")
# Save individual components for backward compatibility
try:
if hasattr(model, 'named_steps') and 'model' in model.named_steps:
joblib.dump(model.named_steps['model'], self.model_path)
logger.info(f"β
Saved model to {self.model_path}")
except Exception as e:
logger.warning(f"Could not save model component: {e}")
try:
if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
logger.info(f"β
Saved vectorizer to {self.vectorizer_path}")
except Exception as e:
logger.warning(f"Could not save vectorizer component: {e}")
# Generate a version hash for this training run (derived from the current timestamp, not the dataset contents)
data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
# Extract CV results
cv_results = metrics.get('cross_validation', {})
# Create enhanced metadata with CV information
metadata = {
'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
'model_type': model_name,
'data_version': data_hash,
'test_accuracy': metrics['accuracy'],
'test_f1': metrics['f1'],
'test_precision': metrics['precision'],
'test_recall': metrics['recall'],
'test_roc_auc': metrics['roc_auc'],
'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
'timestamp': datetime.now().isoformat(),
'training_config': {
'test_size': self.test_size,
'cv_folds': self.cv_folds,
'max_features': self.max_features,
'ngram_range': self.ngram_range,
'feature_selection_k': self.feature_selection_k
}
}
# Add comprehensive CV results to metadata
if cv_results and 'test_scores' in cv_results:
metadata['cross_validation'] = {
'n_splits': cv_results.get('n_splits', self.cv_folds),
'test_scores': cv_results['test_scores'],
'train_scores': cv_results.get('train_scores', {}),
'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
'stability_score': cv_results.get('stability_score', 'Unknown'),
'individual_fold_results': cv_results.get('fold_results', [])
}
# Add summary statistics
if 'f1' in cv_results['test_scores']:
metadata['cv_f1_mean'] = cv_results['test_scores']['f1']['mean']
metadata['cv_f1_std'] = cv_results['test_scores']['f1']['std']
metadata['cv_f1_min'] = cv_results['test_scores']['f1']['min']
metadata['cv_f1_max'] = cv_results['test_scores']['f1']['max']
if 'accuracy' in cv_results['test_scores']:
metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
# Add model comparison results if available
if len(results) > 1:
model_comparison = {}
for other_model_name, other_result in results.items():
if other_model_name != model_name and 'error' not in other_result:
other_cv = other_result['evaluation_metrics'].get('cross_validation', {})
if cv_results and other_cv:
comparison = self.cv_manager.compare_cv_results(cv_results, other_cv)
model_comparison[other_model_name] = comparison
if model_comparison:
metadata['model_comparison'] = model_comparison
# Save metadata with error handling
try:
with open(self.metadata_path, 'w') as f:
json.dump(metadata, f, indent=2)
logger.info(f"β
Saved enhanced metadata to {self.metadata_path}")
except Exception as e:
logger.warning(f"Could not save metadata: {e}")
logger.info(f"β
Model artifacts saved successfully with CV results")
return True
except Exception as e:
logger.error(f"Failed to save model artifacts: {str(e)}")
# Try to save at least the core pipeline
try:
joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
logger.info("β
Saved backup pipeline")
return True
except Exception as e2:
logger.error(f"Failed to save backup pipeline: {str(e2)}")
return False
def train_model(self, data_path: str = None) -> Tuple[bool, str]:
"""Main training function with comprehensive CV pipeline"""
try:
logger.info("Starting enhanced model training with cross-validation...")
# Override data path if provided
if data_path:
self.data_path = Path(data_path)
# Load and validate data
success, df, message = self.load_and_validate_data()
if not success:
return False, message
# Estimate training time and setup progress tracker
time_estimate = estimate_training_time(
len(df),
enable_tuning=True,
cv_folds=self.cv_folds
)
print(f"\nπ Enhanced Training Configuration:")
print(f"Dataset size: {len(df)} samples")
print(f"Cross-validation folds: {self.cv_folds}")
print(f"Estimated time: {time_estimate['total_formatted']}")
print(f"Models to train: {len(self.models)}")
print(f"Hyperparameter tuning: Enabled")
print()
# Setup progress tracker (increased steps for CV)
total_steps = 4 + (len(self.models) * 3) + 1 # Load, split, 3*models (tune+cv+eval), select, save
self.progress_tracker = ProgressTracker(total_steps, "CV Training Progress")
# Prepare data
X = df['text'].values
y = df['label'].values
# Train-test split with smart handling for small datasets
self.progress_tracker.update("Splitting data")
# Ensure minimum test size for very small datasets
if len(X) < 10:
test_size = max(0.1, 1/len(X)) # At least 1 sample for test
else:
test_size = self.test_size
# Check if stratification is possible
label_counts = pd.Series(y).value_counts()
min_class_count = label_counts.min()
can_stratify = min_class_count >= 2 and len(y) >= 4
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=test_size,
stratify=y if can_stratify else None,
random_state=self.random_state
)
logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
# Additional validation for very small datasets
if len(X_train) < 3:
logger.warning(f"Very small training set: {len(X_train)} samples. CV results may be unreliable.")
if len(X_test) < 1:
return False, "Cannot create test set. Dataset too small."
# Train and evaluate models with CV
results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
# Select best model
best_model_name, best_model, best_metrics = self.select_best_model(results)
# Save model artifacts with CV results
if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
return False, "Failed to save model artifacts"
# Finish progress tracking
self.progress_tracker.finish()
# Create success message with CV information
cv_results = best_metrics.get('cross_validation', {})
cv_info = ""
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
cv_f1_mean = cv_results['test_scores']['f1']['mean']
cv_f1_std = cv_results['test_scores']['f1']['std']
cv_info = f", CV F1: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})"
success_message = (
f"Enhanced model training completed successfully. "
f"Best model: {best_model_name} "
f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info})"
)
logger.info(success_message)
return True, success_message
except Exception as e:
if self.progress_tracker:
print() # New line after progress bar
error_message = f"Enhanced model training failed: {str(e)}"
logger.error(error_message)
return False, error_message
def main():
"""Main execution function with enhanced CV support"""
import argparse
# Parse command line arguments
parser = argparse.ArgumentParser(description='Train fake news detection model with cross-validation')
parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
args = parser.parse_args()
trainer = RobustModelTrainer()
# Apply CV folds from command line
if args.cv_folds:
trainer.cv_folds = args.cv_folds
trainer.cv_manager.cv_folds = args.cv_folds
# Load custom configuration if provided
if args.config_path and Path(args.config_path).exists():
try:
with open(args.config_path, 'r') as f:
config = json.load(f)
# Apply configuration
trainer.test_size = config.get('test_size', trainer.test_size)
trainer.cv_folds = config.get('cv_folds', trainer.cv_folds)
trainer.cv_manager.cv_folds = trainer.cv_folds
trainer.max_features = config.get('max_features', trainer.max_features)
trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
# Filter models if specified
selected_models = config.get('selected_models')
if selected_models and len(selected_models) < len(trainer.models):
all_models = trainer.models.copy()
trainer.models = {k: v for k, v in all_models.items() if k in selected_models}
# Update feature selection based on max_features
trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds: {config}")
except Exception as e:
logger.warning(f"Failed to load configuration: {e}, using defaults")
success, message = trainer.train_model(data_path=args.data_path)
if success:
print(f"β
{message}")
else:
print(f"β {message}")
sys.exit(1)
if __name__ == "__main__":
main()
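# Typical invocation (paths shown are illustrative):
#   python model/train.py --data_path /tmp/data/combined_dataset.csv --cv_folds 5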