Ahmedik95316 committed on
Commit
187f752
·
1 Parent(s): 0616f70

Create error_handler.py

Browse files

Adding Tests for MLOps Infrastructure Enhancement

Files changed (1) hide show
  1. utils/error_handler.py +862 -0
utils/error_handler.py ADDED
@@ -0,0 +1,862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/error_handler.py
2
+ # Production-ready error handling system for MLOps grade enhancement
3
+
4
+ import functools
5
+ import traceback
6
+ import sys
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional, Callable, Union, Type
10
+ from contextlib import contextmanager
11
+ from enum import Enum
12
+ import json
13
+
14
+ # Import structured logger
15
+ try:
16
+ from .structured_logger import StructuredLogger, EventType, LogLevel, MLOpsLoggers
17
+ STRUCTURED_LOGGING_AVAILABLE = True
18
+ except ImportError:
19
+ STRUCTURED_LOGGING_AVAILABLE = False
20
+ # Fallback to standard logging
21
+ import logging
22
+
23
+
24
class ErrorSeverity(Enum):
    """Error severity levels for classification and handling."""

    LOW = "low"            # Non-critical errors that don't affect core functionality
    MEDIUM = "medium"      # Errors that degrade performance but allow continuation
    HIGH = "high"          # Critical errors that require immediate attention
    CRITICAL = "critical"  # System-breaking errors that require emergency response


class ErrorCategory(Enum):
    """Error categories for better classification and handling."""

    # Data-related errors
    DATA_VALIDATION = "data_validation"
    DATA_LOADING = "data_loading"
    DATA_PREPROCESSING = "data_preprocessing"
    DATA_QUALITY = "data_quality"

    # Model-related errors
    MODEL_TRAINING = "model_training"
    MODEL_VALIDATION = "model_validation"
    MODEL_LOADING = "model_loading"
    MODEL_PREDICTION = "model_prediction"

    # Feature engineering errors
    FEATURE_EXTRACTION = "feature_extraction"
    FEATURE_SELECTION = "feature_selection"

    # System-related errors
    RESOURCE_CONSTRAINT = "resource_constraint"
    CONFIGURATION = "configuration"
    DEPENDENCY = "dependency"
    IO_OPERATION = "io_operation"

    # API and service errors
    API_ERROR = "api_error"
    AUTHENTICATION = "authentication"
    VALIDATION = "validation"

    # External service errors
    EXTERNAL_SERVICE = "external_service"
    NETWORK = "network"

    # Unknown/uncategorized errors
    UNKNOWN = "unknown"


class MLOpsError(Exception):
    """Base exception class for MLOps-related errors.

    Besides the message, an instance carries its classification
    (``category``, ``severity``), the name of the originating ``component``,
    free-form ``metadata``, an optional remediation ``suggestion``, the
    wrapped ``original_error`` (if any) and a ``timestamp`` holding the
    ISO-8601 string of ``datetime.now()`` at construction time.
    """

    def __init__(self,
                 message: str,
                 category: ErrorCategory = ErrorCategory.UNKNOWN,
                 severity: ErrorSeverity = ErrorSeverity.MEDIUM,
                 component: Optional[str] = None,
                 metadata: Optional[Dict[str, Any]] = None,
                 suggestion: Optional[str] = None,
                 original_error: Optional[Exception] = None):
        self.message = message
        self.category = category
        self.severity = severity
        self.component = component
        # Copy so later mutation of the caller's dict does not alter this error.
        self.metadata = dict(metadata) if metadata else {}
        self.suggestion = suggestion
        self.original_error = original_error
        self.timestamp = datetime.now().isoformat()

        super().__init__(self.message)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the error to a dictionary for logging/serialization.

        Enum fields are flattened to their string values. ``original_error``
        is always present as a ``{'type', 'message'}`` mapping; both entries
        are ``None`` when no original error was wrapped.
        """
        wrapped = self.original_error
        return {
            'message': self.message,
            'category': self.category.value,
            'severity': self.severity.value,
            'component': self.component,
            'metadata': self.metadata,
            'suggestion': self.suggestion,
            'timestamp': self.timestamp,
            'original_error': {
                'type': type(wrapped).__name__ if wrapped else None,
                'message': str(wrapped) if wrapped else None
            }
        }


# Specific error types for different scenarios.
#
# Each subclass only supplies *default* values for ``category`` and
# ``severity`` (via ``setdefault``), so callers may still override either one
# through keyword arguments without triggering a "multiple values for keyword
# argument" TypeError.
class DataValidationError(MLOpsError):
    """Error in data validation (defaults: DATA_VALIDATION / HIGH)."""

    def __init__(self, message: str, **kwargs):
        kwargs.setdefault('category', ErrorCategory.DATA_VALIDATION)
        kwargs.setdefault('severity', ErrorSeverity.HIGH)
        super().__init__(message, **kwargs)


class ModelTrainingError(MLOpsError):
    """Error during model training (defaults: MODEL_TRAINING / HIGH)."""

    def __init__(self, message: str, **kwargs):
        kwargs.setdefault('category', ErrorCategory.MODEL_TRAINING)
        kwargs.setdefault('severity', ErrorSeverity.HIGH)
        super().__init__(message, **kwargs)


class ResourceConstraintError(MLOpsError):
    """Error due to resource constraints such as CPU/memory
    (defaults: RESOURCE_CONSTRAINT / MEDIUM)."""

    def __init__(self, message: str, **kwargs):
        kwargs.setdefault('category', ErrorCategory.RESOURCE_CONSTRAINT)
        kwargs.setdefault('severity', ErrorSeverity.MEDIUM)
        super().__init__(message, **kwargs)


class ConfigurationError(MLOpsError):
    """Error in configuration or setup (defaults: CONFIGURATION / HIGH)."""

    def __init__(self, message: str, **kwargs):
        kwargs.setdefault('category', ErrorCategory.CONFIGURATION)
        kwargs.setdefault('severity', ErrorSeverity.HIGH)
        super().__init__(message, **kwargs)


class FeatureEngineeringError(MLOpsError):
    """Error in the feature engineering process
    (defaults: FEATURE_EXTRACTION / MEDIUM)."""

    def __init__(self, message: str, **kwargs):
        kwargs.setdefault('category', ErrorCategory.FEATURE_EXTRACTION)
        kwargs.setdefault('severity', ErrorSeverity.MEDIUM)
        super().__init__(message, **kwargs)
143
+
144
+
145
class ErrorHandler:
    """Centralized error handling with logging, recovery, and monitoring.

    A handler belongs to one named component. It keeps a per-instance count
    of handled errors (``error_count``) and a registry mapping an
    ``ErrorCategory`` to a recovery callable (``recovery_strategies``).
    """

    # NOTE: ``StructuredLogger`` and ``LogLevel`` are imported conditionally
    # at module level, so annotations mentioning them are written as strings.
    # Unquoted, they are evaluated at class-definition time and raise
    # NameError exactly when the fallback-logging path is needed.
    def __init__(self, component: str, logger: Optional["StructuredLogger"] = None):
        """Create a handler for ``component``.

        Args:
            component: Name used in log messages and attached to converted
                errors.
            logger: Logger to use. When omitted, the structured logger for
                ``component`` is used if available, otherwise a standard
                ``logging`` logger named after the component.
        """
        self.component = component
        self.error_count = {}  # "<category>:<exception type name>" -> occurrences
        self.recovery_strategies = {}  # ErrorCategory -> recovery callable

        # Setup logger
        if logger is not None:
            self.logger = logger
        elif STRUCTURED_LOGGING_AVAILABLE:
            self.logger = MLOpsLoggers.get_logger(component)
        else:
            # Fallback to standard logging
            import logging
            self.logger = logging.getLogger(component)

    def register_recovery_strategy(self,
                                   error_category: ErrorCategory,
                                   recovery_func: Callable):
        """Register (or replace) the recovery strategy for an error category.

        ``recovery_func`` is called as ``recovery_func(mlops_error, context)``
        and is expected to return a dict containing a ``'success'`` entry.
        """
        self.recovery_strategies[error_category] = recovery_func

    def handle_error(self,
                     error: Exception,
                     context: Optional[Dict[str, Any]] = None,
                     category: Optional[ErrorCategory] = None,
                     severity: Optional[ErrorSeverity] = None,
                     suggestion: Optional[str] = None,
                     attempt_recovery: bool = True) -> Dict[str, Any]:
        """Central error handling method.

        Non-``MLOpsError`` exceptions are wrapped in an ``MLOpsError`` using
        the given ``category``/``severity``/``suggestion`` (falling back to
        automatic classification). An ``MLOpsError`` is used as-is, so those
        three arguments are ignored for it.

        Args:
            error: The exception to handle.
            context: Extra information logged with the error and passed to
                the recovery strategy (an empty dict is passed when omitted).
            category: Category override for non-``MLOpsError`` exceptions.
            severity: Severity override for non-``MLOpsError`` exceptions.
            suggestion: Remediation hint for non-``MLOpsError`` exceptions.
            attempt_recovery: Whether to run a registered recovery strategy.

        Returns:
            Dict with error details and recovery status: ``error``,
            ``recovery_attempted``, ``recovery_successful``,
            ``recovery_result`` and ``error_count``.
        """
        # Convert to MLOpsError if not already
        if isinstance(error, MLOpsError):
            mlops_error = error
        else:
            mlops_error = MLOpsError(
                message=str(error),
                category=category or self._classify_error(error),
                severity=severity or self._determine_severity(error),
                component=self.component,
                metadata=context or {},
                suggestion=suggestion,
                original_error=error
            )

        # Track error frequency
        error_key = f"{mlops_error.category.value}:{type(error).__name__}"
        occurrences = self.error_count.get(error_key, 0) + 1
        self.error_count[error_key] = occurrences

        # Log error; the count is passed explicitly so the logged value always
        # matches the key used above.
        self._log_error(mlops_error, context, error_count=occurrences)

        # Attempt recovery if enabled
        recovery_attempted = False
        recovery_result = None
        if attempt_recovery and mlops_error.category in self.recovery_strategies:
            # A strategy that raises still counts as an attempt.
            recovery_attempted = True
            strategy = self.recovery_strategies[mlops_error.category]
            try:
                # Strategies use ``in`` / ``.get`` on the context, so never
                # hand them None.
                recovery_result = strategy(mlops_error, context if context is not None else {})
                self._log_recovery_attempt(mlops_error, recovery_result)
            except Exception as recovery_error:
                self._log_recovery_failure(mlops_error, recovery_error)

        return {
            'error': mlops_error.to_dict(),
            'recovery_attempted': recovery_attempted,
            'recovery_successful': (isinstance(recovery_result, dict)
                                    and recovery_result.get('success', False)),
            'recovery_result': recovery_result,
            'error_count': occurrences
        }

    def _classify_error(self, error: Exception) -> ErrorCategory:
        """Automatically classify an error based on its type name and message.

        Classification is plain lowercase substring matching, checked in a
        fixed order (data, model, resource, IO, configuration, feature, API);
        the first matching group wins. Because matching is by substring, a
        keyword also matches inside longer words.
        """
        error_type = type(error).__name__.lower()
        error_message = str(error).lower()

        def mentions(*keywords: str) -> bool:
            return any(keyword in error_message for keyword in keywords)

        # Data-related errors
        if mentions('data', 'dataframe', 'csv', 'dataset'):
            if mentions('validation', 'invalid', 'format'):
                return ErrorCategory.DATA_VALIDATION
            if mentions('load', 'read', 'file'):
                return ErrorCategory.DATA_LOADING
            return ErrorCategory.DATA_PREPROCESSING

        # Model-related errors
        if mentions('model', 'training', 'fit', 'predict'):
            if mentions('training', 'fit'):
                return ErrorCategory.MODEL_TRAINING
            if mentions('predict'):
                return ErrorCategory.MODEL_PREDICTION
            return ErrorCategory.MODEL_VALIDATION

        # Resource constraints
        if mentions('memory', 'cpu', 'resource', 'timeout'):
            return ErrorCategory.RESOURCE_CONSTRAINT

        # IO errors
        if 'ioerror' in error_type or mentions('file', 'path', 'directory'):
            return ErrorCategory.IO_OPERATION

        # Configuration errors
        if mentions('config', 'parameter', 'argument'):
            return ErrorCategory.CONFIGURATION

        # Feature engineering
        if mentions('feature', 'transform', 'vectoriz'):
            return ErrorCategory.FEATURE_EXTRACTION

        # API errors
        if mentions('api', 'request', 'response', 'http'):
            return ErrorCategory.API_ERROR

        return ErrorCategory.UNKNOWN

    def _determine_severity(self, error: Exception) -> ErrorSeverity:
        """Determine error severity from the exception type name and message."""
        error_type = type(error).__name__.lower()
        error_message = str(error).lower()

        # Critical system errors
        if error_type in ('systemexit', 'keyboardinterrupt', 'memoryerror'):
            return ErrorSeverity.CRITICAL

        # High severity - prevents core functionality
        if any(keyword in error_message
               for keyword in ('training failed', 'model not found', 'critical')):
            return ErrorSeverity.HIGH

        # Medium severity - degrades performance
        if any(keyword in error_message for keyword in ('warning', 'timeout', 'resource')):
            return ErrorSeverity.MEDIUM

        # Default to medium for unknown errors
        return ErrorSeverity.MEDIUM

    def _log_error(self,
                   error: MLOpsError,
                   context: Optional[Dict[str, Any]],
                   error_count: Optional[int] = None):
        """Log an error through the structured logger, or plainly as fallback.

        Args:
            error: The (already converted) error to log.
            context: Caller-supplied context; logged as ``{}`` when None.
            error_count: Occurrence count to report. When omitted it is
                looked up from ``self.error_count`` using the original
                error's type name.
        """
        if STRUCTURED_LOGGING_AVAILABLE:
            log_level = self._get_log_level_for_severity(error.severity)
            if error_count is None:
                lookup_key = f"{error.category.value}:{type(error.original_error).__name__}"
                error_count = self.error_count.get(lookup_key, 1)

            self.logger.log(
                level=log_level,
                event_type=EventType.MODEL_TRAINING_ERROR,
                message=f"Error in {self.component}: {error.message}",
                component=self.component,
                metadata={
                    'error_category': error.category.value,
                    'error_severity': error.severity.value,
                    'error_metadata': error.metadata,
                    'context': context or {},
                    'suggestion': error.suggestion,
                    'error_count': error_count
                },
                tags=[error.category.value, error.severity.value, 'error_handling']
            )
        else:
            # Fallback logging
            self.logger.error(f"Error in {self.component}: {error.message}")

    def _get_log_level_for_severity(self, severity: ErrorSeverity) -> "LogLevel":
        """Map error severity to a structured-logger log level.

        Only meaningful when structured logging is available; unknown
        severities map to ``LogLevel.ERROR``.
        """
        severity_to_log_level = {
            ErrorSeverity.LOW: LogLevel.WARNING,
            ErrorSeverity.MEDIUM: LogLevel.ERROR,
            ErrorSeverity.HIGH: LogLevel.ERROR,
            ErrorSeverity.CRITICAL: LogLevel.CRITICAL
        }
        return severity_to_log_level.get(severity, LogLevel.ERROR)

    def _log_recovery_attempt(self, error: MLOpsError, recovery_result: Dict[str, Any]):
        """Log the outcome reported by a recovery strategy.

        A result that is not a dict is treated as an unsuccessful recovery.
        """
        success = (recovery_result.get('success', False)
                   if isinstance(recovery_result, dict) else False)
        message = f"Recovery {'succeeded' if success else 'failed'} for {error.category.value} error"

        if STRUCTURED_LOGGING_AVAILABLE:
            event_type = (EventType.MODEL_TRAINING_COMPLETE if success
                          else EventType.MODEL_TRAINING_ERROR)

            self.logger.info(
                event_type,
                message,
                component=self.component,
                metadata={
                    'original_error': error.message,
                    'recovery_result': recovery_result,
                    'error_category': error.category.value
                },
                tags=['error_recovery', 'automated_recovery']
            )
        else:
            # Fallback logging so recovery outcomes are not silently dropped
            self.logger.info(message)

    def _log_recovery_failure(self, error: MLOpsError, recovery_error: Exception):
        """Log that a recovery strategy itself raised ``recovery_error``."""
        message = f"Recovery failed for {error.category.value} error: {str(recovery_error)}"

        if STRUCTURED_LOGGING_AVAILABLE:
            self.logger.error(
                EventType.MODEL_TRAINING_ERROR,
                message,
                component=self.component,
                metadata={
                    'original_error': error.message,
                    'recovery_error': str(recovery_error),
                    'error_category': error.category.value
                },
                tags=['error_recovery', 'recovery_failure']
            )
        else:
            # Fallback logging so recovery failures are not silently dropped
            self.logger.error(message)
352
+
353
+
354
+ # Decorator for automatic error handling
355
def handle_errors(component: str = None,
                  category: ErrorCategory = None,
                  severity: ErrorSeverity = None,
                  attempt_recovery: bool = True,
                  reraise: bool = True):
    """Decorator for automatic error handling.

    Any ``Exception`` raised by the decorated function is passed to an
    ``ErrorHandler`` together with a small context dict (function name and
    argument counts).

    Args:
        component: Component name for the handler; defaults to the decorated
            function's ``__module__``.
        category: Category forwarded to ``ErrorHandler.handle_error``.
        severity: Severity forwarded to ``ErrorHandler.handle_error``.
        attempt_recovery: Forwarded to ``ErrorHandler.handle_error``.
        reraise: When True (default) the original exception propagates after
            being handled; when False the handler's result dict is returned
            in place of the function's return value.

    The handler is created lazily on the first failure and then reused for
    every later failure of the same decorated function, so successful calls
    pay no setup cost and the handler's ``error_count`` accumulates.
    """
    def decorator(func):
        comp_name = component or func.__module__
        handler_slot = []  # holds the single lazily-created ErrorHandler

        def get_handler():
            if not handler_slot:
                handler_slot.append(ErrorHandler(comp_name))
            return handler_slot[0]

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                # Handle the error
                result = get_handler().handle_error(
                    error=exc,
                    context={
                        'function': func.__name__,
                        'args_count': len(args),
                        'kwargs_count': len(kwargs)
                    },
                    category=category,
                    severity=severity,
                    attempt_recovery=attempt_recovery
                )

                # Re-raise if specified, otherwise return error result
                if reraise:
                    raise
                return result

        return wrapper
    return decorator
391
+
392
+
393
+ # Context manager for error handling
394
@contextmanager
def error_handling_context(component: str,
                           operation: str,
                           category: ErrorCategory = None,
                           severity: ErrorSeverity = None,
                           metadata: Dict[str, Any] = None):
    """Context manager for handling errors within a specific operation.

    Yields a fresh ``ErrorHandler`` for ``component``. If the managed block
    raises an ``Exception``, it is passed to that handler with a context
    made of ``{'operation': operation}`` merged with ``metadata`` (metadata
    keys win on collision), and is then re-raised unchanged.

    Args:
        component: Component name for the handler.
        operation: Label of the operation being performed.
        category: Category forwarded to ``ErrorHandler.handle_error``.
        severity: Severity forwarded to ``ErrorHandler.handle_error``.
        metadata: Extra key/value pairs added to the error context.
    """
    error_handler = ErrorHandler(component)

    try:
        yield error_handler
    except Exception as exc:
        # The handler's result dict is intentionally not used here.
        error_handler.handle_error(
            error=exc,
            context={
                'operation': operation,
                **(metadata or {})
            },
            category=category,
            severity=severity
        )
        # Always re-raise in context manager
        raise
417
+
418
+
419
+ # Recovery strategies for common scenarios
420
class RecoveryStrategies:
    """Common recovery strategies for different error categories.

    Every strategy takes ``(error, context)``, only reads ``error.message``
    from the error, and returns a dict with a boolean ``'success'`` entry.
    Strategies never raise: unexpected failures are reported as
    ``{'success': False, 'error': <text>}``. The strategies only *suggest*
    recovery actions; none of them performs the recovery itself.
    """

    @staticmethod
    def data_loading_recovery(error: "MLOpsError", context: Dict[str, Any]) -> Dict[str, Any]:
        """Recovery strategy for data loading errors.

        When ``context`` contains ``'file_path'``, looks for the first
        existing fallback among: the same path with its suffix replaced by
        ``.backup.csv``, ``/tmp/data/fallback_dataset.csv`` and
        ``/tmp/data/combined_dataset.csv``.

        Returns:
            On success a dict with ``recovery_method`` and ``fallback_path``;
            otherwise ``success`` False with a ``reason``.
        """
        try:
            # A missing context simply means there is nothing to fall back on.
            context = context or {}

            # Try alternative data sources or fallback datasets
            if 'file_path' in context:
                # Try backup locations
                backup_paths = [
                    Path(context['file_path']).with_suffix('.backup.csv'),
                    Path('/tmp/data/fallback_dataset.csv'),
                    Path('/tmp/data/combined_dataset.csv')
                ]

                for backup_path in backup_paths:
                    if backup_path.exists():
                        return {
                            'success': True,
                            'recovery_method': 'fallback_data_source',
                            'fallback_path': str(backup_path)
                        }

            return {'success': False, 'reason': 'No fallback data sources available'}

        except Exception as e:
            return {'success': False, 'error': str(e)}

    @staticmethod
    def model_training_recovery(error: "MLOpsError", context: Dict[str, Any]) -> Dict[str, Any]:
        """Recovery strategy for model training errors.

        Picks recovery methods from keywords in the lowercased error message
        (``resource``, ``lightgbm``, ``memory``). ``success`` is True when at
        least one method applies. ``context`` is not used.
        """
        try:
            message = str(error.message).lower()

            # (keyword in message, suggested recovery method), in output order
            keyword_to_method = (
                ('resource', 'reduce_model_complexity'),          # Reduce model complexity
                ('lightgbm', 'fallback_to_logistic_regression'),  # Fallback to simpler model
                ('memory', 'reduce_dataset_size'),                # Reduce dataset size for memory issues
            )
            recovery_methods = [method for keyword, method in keyword_to_method
                                if keyword in message]

            return {
                'success': len(recovery_methods) > 0,
                'recovery_methods': recovery_methods,
                'suggestion': 'Apply suggested recovery methods and retry training'
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    @staticmethod
    def feature_engineering_recovery(error: "MLOpsError", context: Dict[str, Any]) -> Dict[str, Any]:
        """Recovery strategy for feature engineering errors.

        Suggests falling back to standard features when the lowercased error
        message mentions ``enhanced``. ``context`` is not used.
        """
        try:
            # Fallback to standard TF-IDF if enhanced features fail
            if 'enhanced' in str(error.message).lower():
                return {
                    'success': True,
                    'recovery_method': 'fallback_to_standard_features',
                    'suggestion': 'Switch to standard TF-IDF features and continue training'
                }

            return {'success': False, 'reason': 'No applicable recovery method'}

        except Exception as e:
            return {'success': False, 'error': str(e)}
493
+
494
+
495
+ # CPU constraint specific error handling for HuggingFace Spaces
496
class CPUConstraintHandler:
    """Specialized handler for CPU constraint issues in HuggingFace Spaces.

    Wraps an ``ErrorHandler`` on which a CPU-specific recovery strategy is
    registered for ``ErrorCategory.RESOURCE_CONSTRAINT``.
    """

    def __init__(self, component: str):
        """Create the handler and register the CPU recovery strategy."""
        self.component = component
        self.error_handler = ErrorHandler(component)

        # Register CPU-specific recovery strategies
        self.error_handler.register_recovery_strategy(
            ErrorCategory.RESOURCE_CONSTRAINT,
            self._cpu_recovery_strategy
        )

    def _cpu_recovery_strategy(self, error: "MLOpsError", context: Dict[str, Any]) -> Dict[str, Any]:
        """Recovery strategy specifically for CPU constraints.

        Builds a list of suggested actions from keywords in the lowercased
        error message (``n_jobs``/``parallel``, ``memory``) and from
        ``context['operation']`` mentioning ``training``. Nothing is applied
        here; the caller decides what to do with the suggestions.

        Returns:
            Dict with ``success`` (True when any action was suggested),
            ``recovery_actions``, ``cpu_optimizations`` and ``environment``;
            or ``{'success': False, 'error': ...}`` on unexpected failure.
        """
        try:
            context = context or {}
            message = str(error.message).lower()
            operation = str(context.get('operation', '')).lower()
            recovery_actions = []

            # Reduce parallel processing
            if 'n_jobs' in message or 'parallel' in message:
                recovery_actions.append('force_single_threading')

            # Reduce model complexity for CPU efficiency
            if 'training' in operation:
                recovery_actions.extend([
                    'reduce_cv_folds',
                    'simplify_hyperparameter_grid',
                    'disable_ensemble_if_slow'
                ])

            # Memory optimization for CPU-bound systems
            if 'memory' in message:
                recovery_actions.extend([
                    'reduce_feature_dimensions',
                    'batch_processing',
                    'garbage_collection'
                ])

            return {
                'success': len(recovery_actions) > 0,
                'recovery_actions': recovery_actions,
                'cpu_optimizations': True,
                'environment': 'huggingface_spaces'
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def monitor_and_handle_cpu_issues(self,
                                      operation_func: Callable,
                                      *args,
                                      timeout_seconds: int = 300,
                                      **kwargs) -> Any:
        """Run ``operation_func`` under a wall-clock time limit.

        The limit is enforced with ``signal.alarm``. That mechanism exists
        only where ``SIGALRM`` is available and signal handlers can only be
        installed from the main thread, so elsewhere the operation runs
        without a time limit (errors are still handled). The previously
        installed ``SIGALRM`` handler is restored afterwards.

        Args:
            operation_func: Callable to execute.
            *args: Positional arguments for ``operation_func``.
            timeout_seconds: Time limit in whole seconds.
            **kwargs: Keyword arguments for ``operation_func``.

        Returns:
            Whatever ``operation_func`` returns.

        Raises:
            ResourceConstraintError: When the time limit is exceeded.
            Exception: Any exception raised by ``operation_func`` is handled
                via the wrapped ``ErrorHandler`` and then re-raised.
        """
        import signal
        import threading
        import time

        # Not every callable has __name__ (e.g. functools.partial objects).
        operation_name = getattr(operation_func, '__name__', repr(operation_func))

        def timeout_handler(signum, frame):
            raise ResourceConstraintError(
                f"Operation {operation_name} exceeded CPU time limit ({timeout_seconds}s)",
                component=self.component,
                metadata={
                    'timeout_seconds': timeout_seconds,
                    'operation': operation_name,
                    'environment': 'cpu_constrained'
                },
                suggestion="Reduce model complexity or dataset size for CPU-constrained environment"
            )

        alarm_supported = (hasattr(signal, 'SIGALRM')
                           and threading.current_thread() is threading.main_thread())
        previous_handler = None

        start_time = time.time()

        # Set timeout signal
        if alarm_supported:
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout_seconds)

        try:
            result = operation_func(*args, **kwargs)
            execution_time = time.time() - start_time

            # Log performance if slow
            if execution_time > timeout_seconds * 0.8:  # 80% of timeout
                if STRUCTURED_LOGGING_AVAILABLE:
                    logger = MLOpsLoggers.get_monitoring_logger()
                    logger.log_cpu_constraint_warning(
                        component=self.component,
                        operation=operation_name,
                        resource_usage={
                            'execution_time_seconds': execution_time,
                            'timeout_threshold': timeout_seconds,
                            'cpu_efficiency': 'low'
                        }
                    )

            return result

        except Exception as e:
            execution_time = time.time() - start_time

            # Handle error with CPU constraint context
            self.error_handler.handle_error(
                error=e,
                context={
                    'operation': operation_name,
                    'execution_time': execution_time,
                    'timeout_limit': timeout_seconds,
                    'environment': 'cpu_constrained'
                },
                category=ErrorCategory.RESOURCE_CONSTRAINT,
                severity=ErrorSeverity.HIGH
            )
            raise

        finally:
            if alarm_supported:
                # Clear timeout
                signal.alarm(0)
                # signal.signal() reports None for handlers that were not
                # installed from Python; those cannot be re-installed.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
611
+
612
+
613
+ # Integration utilities for existing codebase
614
def setup_error_handling() -> Dict[str, ErrorHandler]:
    """Build one ErrorHandler per known MLOps component.

    Each handler gets the shared recovery strategies for data loading, model
    training and feature extraction registered on it.

    Returns:
        Mapping of component name to its configured handler.
    """
    component_names = (
        'model_trainer',
        'model_retrainer',
        'data_processor',
        'feature_engineer',
        'api_server',
        'monitoring',
    )

    # Recovery strategies shared by every component
    shared_strategies = (
        (ErrorCategory.DATA_LOADING, RecoveryStrategies.data_loading_recovery),
        (ErrorCategory.MODEL_TRAINING, RecoveryStrategies.model_training_recovery),
        (ErrorCategory.FEATURE_EXTRACTION, RecoveryStrategies.feature_engineering_recovery),
    )

    registry = {}
    for name in component_names:
        component_handler = ErrorHandler(name)
        for error_category, recovery in shared_strategies:
            component_handler.register_recovery_strategy(error_category, recovery)
        registry[name] = component_handler

    return registry
647
+
648
+
649
def get_error_handler(component: str) -> ErrorHandler:
    """Return a newly created error handler for ``component``.

    A fresh ``ErrorHandler`` is constructed on every call; no recovery
    strategies are registered on it.
    """
    component_handler = ErrorHandler(component)
    return component_handler
652
+
653
+
654
+ # Example integration functions
655
def integrate_with_retrain_py():
    """Example integration with retrain.py for robust error handling.

    Returns:
        An ``ErrorHandler`` for the ``model_retrainer`` component whose
        MODEL_TRAINING recovery strategy always reports success and suggests
        dropping the ensemble in favour of the best individual model.
    """

    def fallback_to_individual_models(error, context):
        # Unconditional recovery: the inputs are not inspected.
        return {
            'success': True,
            'recovery_method': 'fallback_to_individual_models',
            'suggestion': 'Disable ensemble and use best individual model'
        }

    # Handler dedicated to the retraining component
    retrain_handler = ErrorHandler('model_retrainer')
    retrain_handler.register_recovery_strategy(
        ErrorCategory.MODEL_TRAINING,
        fallback_to_individual_models
    )

    return retrain_handler
672
+
673
+
674
def integrate_with_train_py():
    """Example integration with train.py for comprehensive error handling.

    Returns:
        A ``(error_handler, cpu_handler)`` tuple: a plain ``ErrorHandler``
        and a ``CPUConstraintHandler``, both for the ``model_trainer``
        component.
    """
    trainer_component = 'model_trainer'

    # General-purpose handler for the training component
    training_handler = ErrorHandler(trainer_component)

    # CPU constraint handler for HuggingFace Spaces
    training_cpu_handler = CPUConstraintHandler(trainer_component)

    return training_handler, training_cpu_handler
684
+
685
+
686
+ # Error reporting and analytics
687
class ErrorReporter:
    """Collect and report error analytics for MLOps monitoring.

    Statistics are kept in memory in ``error_stats``, keyed by
    ``"<category>:<severity>"``, and can be summarized or written to
    ``report_file`` as JSON.
    """

    def __init__(self, report_file: Path = None):
        """Create a reporter.

        Args:
            report_file: Destination for ``save_report``; a plain string path
                is accepted too. Defaults to ``/tmp/logs/error_report.json``.
        """
        # Coerce so that str paths work with the Path API used in save_report.
        self.report_file = Path(report_file) if report_file else Path("/tmp/logs/error_report.json")
        self.error_stats = {}

    def record_error(self, error_info: Dict[str, Any]):
        """Record one handled error for analytics.

        Args:
            error_info: Dict shaped like the result of
                ``ErrorHandler.handle_error``: an ``'error'`` mapping with
                ``'category'`` and ``'severity'``, plus the flags
                ``'recovery_attempted'`` and ``'recovery_successful'``.
                Missing pieces default to ``unknown`` / ``medium`` / False.
        """
        error_details = error_info.get('error') or {}
        category = error_details.get('category', 'unknown')
        severity = error_details.get('severity', 'medium')

        key = f"{category}:{severity}"
        now = datetime.now().isoformat()

        if key not in self.error_stats:
            self.error_stats[key] = {
                'count': 0,
                'first_seen': now,
                'last_seen': now,
                'recovery_attempts': 0,
                'recovery_successes': 0
            }

        stats = self.error_stats[key]
        stats['count'] += 1
        stats['last_seen'] = now

        # A success is only counted for errors where recovery was attempted.
        if error_info.get('recovery_attempted', False):
            stats['recovery_attempts'] += 1
            if error_info.get('recovery_successful', False):
                stats['recovery_successes'] += 1

    def generate_report(self) -> Dict[str, Any]:
        """Generate the error analytics report.

        Returns:
            Dict with ``report_timestamp``, a ``summary`` of totals
            (``recovery_rate_percent`` is 0 when no recovery was attempted),
            a per-key ``error_breakdown`` (a copy, safe to modify) and
            ``recommendations``.
        """
        all_stats = list(self.error_stats.values())
        total_errors = sum(stats['count'] for stats in all_stats)
        total_recovery_attempts = sum(stats['recovery_attempts'] for stats in all_stats)
        total_recovery_successes = sum(stats['recovery_successes'] for stats in all_stats)

        recovery_rate = (total_recovery_successes / total_recovery_attempts * 100) if total_recovery_attempts > 0 else 0

        return {
            'report_timestamp': datetime.now().isoformat(),
            'summary': {
                'total_errors': total_errors,
                'unique_error_types': len(self.error_stats),
                'recovery_attempts': total_recovery_attempts,
                'recovery_successes': total_recovery_successes,
                'recovery_rate_percent': recovery_rate
            },
            # Copy so callers cannot corrupt the reporter's internal counters.
            'error_breakdown': {key: dict(stats) for key, stats in self.error_stats.items()},
            'recommendations': self._generate_recommendations()
        }

    def _generate_recommendations(self) -> list:
        """Generate recommendations based on recorded error patterns.

        Emits, in order and only when applicable: keys seen more than 5
        times, keys whose recovery success ratio is below 0.5, and the
        presence of any ``resource_constraint`` key.
        """
        recommendations = []

        # High frequency errors
        high_freq_keys = [key for key, stats in self.error_stats.items() if stats['count'] > 5]
        if high_freq_keys:
            recommendations.append({
                'type': 'high_frequency_errors',
                'message': f'Address frequently occurring errors: {", ".join(high_freq_keys)}',
                'priority': 'high'
            })

        # Low recovery rates
        low_recovery_keys = [
            key for key, stats in self.error_stats.items()
            if stats['recovery_attempts'] > 0
            and (stats['recovery_successes'] / stats['recovery_attempts']) < 0.5
        ]
        if low_recovery_keys:
            recommendations.append({
                'type': 'low_recovery_rate',
                'message': 'Improve recovery strategies for poorly recovering error types',
                'priority': 'medium',
                'affected_errors': low_recovery_keys
            })

        # Resource constraint patterns
        if any('resource_constraint' in key for key in self.error_stats):
            recommendations.append({
                'type': 'resource_optimization',
                'message': 'Consider CPU/memory optimizations for resource constraint errors',
                'priority': 'high',
                'suggestion': 'Review HuggingFace Spaces constraints and optimize accordingly'
            })

        return recommendations

    def save_report(self):
        """Write the current report to ``report_file`` as JSON and return it.

        Parent directories are created as needed.
        """
        report = self.generate_report()

        self.report_file.parent.mkdir(parents=True, exist_ok=True)

        with open(self.report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

        return report
788
+
789
+
790
+ # Global error reporter instance
791
# Process-wide ErrorReporter, created on first request
_global_error_reporter = None


def get_global_error_reporter() -> ErrorReporter:
    """Return the shared ErrorReporter, creating it on the first call.

    The instance is built with ``ErrorReporter()`` defaults and stored in the
    module-level ``_global_error_reporter``.
    """
    global _global_error_reporter
    if _global_error_reporter is not None:
        return _global_error_reporter

    _global_error_reporter = ErrorReporter()
    return _global_error_reporter
799
+
800
+
801
if __name__ == "__main__":
    # Smoke-test walkthrough: exercises each public piece once and prints
    # what it produced.
    print("Testing error handling system...")

    # 1) Direct use of ErrorHandler.handle_error
    error_handler = ErrorHandler('test_component')

    try:
        raise ValueError("Test error for demonstration")
    except Exception as demo_error:
        result = error_handler.handle_error(
            error=demo_error,
            context={'test': True},
            category=ErrorCategory.DATA_VALIDATION,
            severity=ErrorSeverity.MEDIUM,
            suggestion="This is a test error for demonstration purposes"
        )
        print("Error handling result:", result)

    # 2) The handle_errors decorator (re-raises by default)
    @handle_errors(component='test_decorator', category=ErrorCategory.MODEL_TRAINING)
    def test_function_with_error():
        raise ModelTrainingError("Test model training error")

    try:
        test_function_with_error()
    except Exception as decorated_error:
        print("Decorator handled error:", type(decorated_error).__name__)

    # 3) CPU constraint handler around a short operation
    cpu_handler = CPUConstraintHandler('test_cpu')

    def slow_operation():
        import time
        time.sleep(0.1)  # Simulate work
        return "completed"

    try:
        result = cpu_handler.monitor_and_handle_cpu_issues(slow_operation, timeout_seconds=1)
        print("CPU monitoring result:", result)
    except Exception as cpu_error:
        print("CPU constraint error:", str(cpu_error))

    # 4) Error reporting: record one synthetic error and print the report
    reporter = get_global_error_reporter()

    test_error_info = {
        'error': {
            'category': 'model_training',
            'severity': 'high',
            'message': 'Test error for reporting'
        },
        'recovery_attempted': True,
        'recovery_successful': False
    }

    reporter.record_error(test_error_info)
    report = reporter.generate_report()
    print("Error report:", json.dumps(report, indent=2))

    print("Error handling system test completed successfully!")