Ahmedik95316 committed on
Commit 36b9049 · verified · 1 Parent(s): f5e699e

Update app/fastapi_server.py

Files changed (1)
  1. app/fastapi_server.py +260 -129
app/fastapi_server.py CHANGED
@@ -678,6 +678,57 @@ async def predict(
             detail="Model is not available. Please try again later."
         )
 
+    # NEW: Data validation before prediction
+    try:
+        from data.data_validator import DataValidator
+        from data.validation_schemas import ValidationLevel
+
+        validator = DataValidator()
+        validation_result = validator.validate_text(request.text)
+
+        # Log validation result
+        validation_entry = {
+            'timestamp': datetime.now().isoformat(),
+            'text_length': len(request.text),
+            'validation_level': validation_result.validation_level.value,
+            'quality_score': validation_result.quality_score,
+            'issues': [issue.dict() for issue in validation_result.issues],
+            'passed_validation': validation_result.validation_level != ValidationLevel.INVALID,
+            'client_ip': client_ip,
+            'user_agent': user_agent
+        }
+
+        # Save validation results
+        try:
+            validation_log_path = path_manager.get_logs_path("validation_log.json")
+            if validation_log_path.exists():
+                with open(validation_log_path, 'r') as f:
+                    validation_data = json.load(f)
+            else:
+                validation_data = []
+
+            validation_data.append(validation_entry)
+
+            # Keep only last 1000 entries
+            if len(validation_data) > 1000:
+                validation_data = validation_data[-1000:]
+
+            with open(validation_log_path, 'w') as f:
+                json.dump(validation_data, f, indent=2)
+        except Exception as e:
+            logger.warning(f"Could not save validation log: {e}")
+
+        # Block invalid inputs
+        if validation_result.validation_level == ValidationLevel.INVALID:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Input validation failed: {validation_result.issues[0].message if validation_result.issues else 'Invalid input'}"
+            )
+
+    except ImportError:
+        logger.warning("Data validation components not available, proceeding without validation")
+        validation_result = None
+
     # Prepare request data for routing
     request_data = {
         'client_id': client_ip,
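Note on the hunk above: data.data_validator and data.validation_schemas are not part of this commit, so the interface is only visible through the call sites. validate_text must return an object exposing validation_level (an enum with at least an INVALID member), quality_score, and a list of issues whose items support .dict() and .message (pydantic-style). A minimal sketch of that assumed interface, inferred rather than confirmed by this diff:

from dataclasses import dataclass, field
from enum import Enum
from typing import List

class ValidationLevel(Enum):
    HIGH = "HIGH"
    LOW = "LOW"
    INVALID = "INVALID"

@dataclass
class ValidationIssue:
    message: str

    def dict(self) -> dict:  # mirrors the pydantic-style .dict() the diff relies on
        return {"message": self.message}

@dataclass
class ValidationResult:
    validation_level: ValidationLevel
    quality_score: float
    issues: List[ValidationIssue] = field(default_factory=list)

class DataValidator:
    def validate_text(self, text: str) -> ValidationResult:
        # Hypothetical rules: reject near-empty text, score the rest by length.
        if len(text.strip()) < 10:
            return ValidationResult(ValidationLevel.INVALID, 0.0,
                                    [ValidationIssue("Text too short")])
        score = min(len(text) / 1000.0, 1.0)
        level = ValidationLevel.HIGH if score > 0.6 else ValidationLevel.LOW
        return ValidationResult(level, score)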
@@ -738,48 +789,15 @@ async def predict(
         processing_time=processing_time
     )
 
-    # Validation logging - NEW ADDITION
-    validation_entry = {
-        'timestamp': datetime.now().isoformat(),
-        'text_length': len(request.text),
-        'prediction': label,
-        'confidence': confidence,
-        'validation_passed': confidence > 0.6,  # Define validation threshold
-        'quality_score': confidence,
-        'model_version': model_manager.model_metadata.get('model_version', 'unknown'),
-        'processing_time': processing_time,
-        'client_ip': client_ip,
-        'environment': environment
-    }
-
-    # Save to validation log
-    try:
-        validation_log_path = path_manager.get_logs_path("validation_log.json")
-        if validation_log_path.exists():
-            with open(validation_log_path, 'r') as f:
-                validation_data = json.load(f)
-        else:
-            validation_data = []
-
-        validation_data.append(validation_entry)
-
-        # Keep only last 1000 entries to prevent file from growing too large
-        if len(validation_data) > 1000:
-            validation_data = validation_data[-1000:]
-
-        with open(validation_log_path, 'w') as f:
-            json.dump(validation_data, f, indent=2)
-    except Exception as e:
-        logger.warning(f"Could not save validation log: {e}")
-
-    # Log prediction (background task)
+    # Log prediction (background task) - ENHANCED with validation info
     background_tasks.add_task(
-        log_prediction,
+        log_prediction_with_validation,
        request.text,
        label,
        confidence,
        client_ip,
-        processing_time
+        processing_time,
+        validation_result
    )
 
    return response
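The change above swaps log_prediction for log_prediction_with_validation and threads two extra positional arguments through the task queue. FastAPI's background_tasks.add_task(func, *args, **kwargs) forwards arguments to the callable unchanged and accepts both sync and async functions, which is what lets the new async logger receive validation_result after the response is sent. A minimal, self-contained illustration of the pattern (the endpoint and names here are hypothetical, not from the commit):

from fastapi import BackgroundTasks, FastAPI

app = FastAPI()

async def audit(text: str, label: str, confidence: float, extra=None) -> None:
    # Runs after the response has been sent; failures here never reach the client.
    print(f"audit: {label} ({confidence:.2f}), extra={extra!r}")

@app.post("/echo")
async def echo(payload: dict, background_tasks: BackgroundTasks):
    # Positional and keyword args after the callable are forwarded to it as-is.
    background_tasks.add_task(audit, payload.get("text", ""), "REAL", 0.93,
                              extra={"source": "demo"})
    return {"ok": True}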
@@ -825,6 +843,50 @@ async def predict(
         )
 
 
+async def log_prediction_with_validation(text: str, prediction: str, confidence: float,
+                                         client_ip: str, processing_time: float,
+                                         validation_result=None):
+    """Enhanced logging function that includes validation data"""
+    try:
+        prediction_entry = {
+            'timestamp': datetime.now().isoformat(),
+            'prediction': prediction,
+            'confidence': confidence,
+            'processing_time': processing_time,
+            'client_ip': client_ip,
+            'text_length': len(text),
+            'text_preview': text[:100] + "..." if len(text) > 100 else text
+        }
+
+        # Add validation information if available
+        if validation_result:
+            prediction_entry.update({
+                'validation_level': validation_result.validation_level.value,
+                'quality_score': validation_result.quality_score,
+                'validation_issues_count': len(validation_result.issues)
+            })
+
+        prediction_log_path = path_manager.get_logs_path("prediction_log.json")
+
+        if prediction_log_path.exists():
+            with open(prediction_log_path, 'r') as f:
+                prediction_data = json.load(f)
+        else:
+            prediction_data = []
+
+        prediction_data.append(prediction_entry)
+
+        # Keep only last 1000 entries
+        if len(prediction_data) > 1000:
+            prediction_data = prediction_data[-1000:]
+
+        with open(prediction_log_path, 'w') as f:
+            json.dump(prediction_data, f, indent=2)
+
+    except Exception as e:
+        logger.error(f"Failed to log prediction: {e}")
+
+
 @app.post("/predict/batch", response_model=BatchPredictionResponse)
 async def predict_batch(
     request: BatchPredictionRequest,
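log_prediction_with_validation repeats the same read-append-truncate-write cycle already used for validation_log.json earlier in this diff. That pattern rereads and rewrites the entire JSON array on every request and is racy if multiple workers log at once. A factored-out helper (hypothetical sketch, not part of the commit) makes the shared pattern explicit:

import json
from pathlib import Path

def append_capped_json_log(path: Path, entry: dict, max_entries: int = 1000) -> None:
    """Append one entry to a JSON-array log, keeping only the newest max_entries."""
    # Read the existing array, tolerating a missing or corrupt file.
    try:
        records = json.loads(path.read_text()) if path.exists() else []
    except json.JSONDecodeError:
        records = []
    records.append(entry)
    # Keep only the newest entries, matching the diff's 1000-entry cap.
    path.write_text(json.dumps(records[-max_entries:], indent=2))

An append-only JSONL file (one json.dumps per line) or a file lock would avoid both the per-request rewrite cost and the concurrent-write hazard.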
@@ -1338,79 +1400,119 @@ async def get_metrics():
     )
 
 def get_validation_stats():
-    """Get validation statistics from various sources"""
-    try:
-        stats = {
-            'last_updated': datetime.now().isoformat(),
-            'total_validations': 0,
-            'total_articles': 0,
-            'total_valid_articles': 0,
-            'average_quality_score': 0.0,
-            'source_statistics': {},
-            'validation_history': [],
-            'quality_trends': []
-        }
-
-        # Try to load validation data from logs
-        validation_log_path = path_manager.get_logs_path("validation_log.json")
-        if validation_log_path.exists():
-            with open(validation_log_path, 'r') as f:
-                validation_data = json.load(f)
-            if validation_data:
-                stats['total_validations'] = len(validation_data)
-                stats['validation_history'] = validation_data[-10:]  # Last 10 entries
-
-        # Try to load prediction data for article count
-        prediction_log_path = path_manager.get_logs_path("prediction_log.json")
-        if prediction_log_path.exists():
-            with open(prediction_log_path, 'r') as f:
-                prediction_data = json.load(f)
-            if prediction_data:
-                stats['total_articles'] = len(prediction_data)
-
-                # Calculate success rate (predictions with high confidence)
-                high_confidence_predictions = [
-                    p for p in prediction_data
-                    if p.get('confidence', 0) > 0.7
-                ]
-                stats['total_valid_articles'] = len(high_confidence_predictions)
-
-                # Calculate average confidence as quality score
-                if prediction_data:
-                    avg_confidence = sum(p.get('confidence', 0) for p in prediction_data) / len(prediction_data)
-                    stats['average_quality_score'] = avg_confidence
-
-        # Load activity log for additional metrics
-        activity_log_path = path_manager.get_activity_log_path()
-        if activity_log_path.exists():
-            with open(activity_log_path, 'r') as f:
-                activity_data = json.load(f)
-            if activity_data:
-                stats['last_updated'] = activity_data[-1].get('timestamp', datetime.now().isoformat())
-
-        # Try to load monitoring data for additional validation metrics
-        monitoring_log_path = path_manager.get_logs_path("monitoring_log.json")
-        if monitoring_log_path.exists():
-            with open(monitoring_log_path, 'r') as f:
-                monitoring_data = json.load(f)
-            if monitoring_data:
-                # Extract quality trends from monitoring data
-                quality_entries = [
-                    {
-                        'timestamp': entry.get('timestamp'),
-                        'quality_score': entry.get('quality_score', 0)
-                    }
-                    for entry in monitoring_data
-                    if entry.get('quality_score') is not None
-                ]
-                stats['quality_trends'] = quality_entries[-10:]
-
-        return stats if any(stats[k] for k in ['total_validations', 'total_articles']) else None
-
-    except Exception as e:
-        logger.warning(f"Could not load validation stats: {e}")
-        return None
+    """Get validation statistics from actual validation logs"""
+    try:
+        stats = {
+            'last_updated': datetime.now().isoformat(),
+            'total_validations': 0,
+            'total_articles': 0,
+            'total_valid_articles': 0,
+            'average_quality_score': 0.0,
+            'validation_breakdown': {},
+            'source_statistics': {},
+            'validation_history': [],
+            'quality_trends': []
+        }
+
+        # Load actual validation data
+        validation_log_path = path_manager.get_logs_path("validation_log.json")
+        if validation_log_path.exists():
+            with open(validation_log_path, 'r') as f:
+                validation_data = json.load(f)
+
+            if validation_data:
+                stats['total_validations'] = len(validation_data)
+                stats['total_articles'] = len(validation_data)
+
+                # Analyze validation levels
+                level_counts = {}
+                quality_scores = []
+
+                for entry in validation_data:
+                    level = entry.get('validation_level', 'unknown')
+                    level_counts[level] = level_counts.get(level, 0) + 1
+
+                    if entry.get('quality_score'):
+                        quality_scores.append(entry['quality_score'])
+
+                    if entry.get('passed_validation', False):
+                        stats['total_valid_articles'] += 1
+
+                stats['validation_breakdown'] = level_counts
+                stats['average_quality_score'] = sum(quality_scores) / len(quality_scores) if quality_scores else 0.0
+                stats['validation_history'] = validation_data[-10:]  # Last 10
+
+                # Quality trends over time
+                for entry in validation_data[-20:]:  # Last 20 for trends
+                    if entry.get('quality_score') is not None:
+                        stats['quality_trends'].append({
+                            'timestamp': entry.get('timestamp'),
+                            'quality_score': entry.get('quality_score')
+                        })
+
+        return stats if stats['total_validations'] > 0 else None
+
+    except Exception as e:
+        logger.warning(f"Could not load validation stats: {e}")
+        return None
+
+
+# Data Quality Report Endpoint
+@app.get("/validation/quality-report")
+async def get_validation_quality_report():
+    """Get detailed validation quality report"""
+    try:
+        stats = get_validation_stats()
+
+        if not stats:
+            return {
+                'error': 'No validation data available',
+                'message': 'No validation statistics available yet'
+            }
+
+        # Generate quality assessment
+        avg_quality = stats.get('average_quality_score', 0)
+        validation_breakdown = stats.get('validation_breakdown', {})
+
+        quality_level = 'poor'
+        if avg_quality > 0.8:
+            quality_level = 'excellent'
+        elif avg_quality > 0.6:
+            quality_level = 'good'
+        elif avg_quality > 0.4:
+            quality_level = 'fair'
+
+        # Generate recommendations
+        recommendations = []
+        invalid_count = validation_breakdown.get('INVALID', 0)
+        total = stats.get('total_validations', 1)
+
+        if invalid_count / total > 0.1:
+            recommendations.append("High rate of invalid inputs detected - consider input preprocessing")
+
+        if avg_quality < 0.5:
+            recommendations.append("Low average quality scores - review data sources")
+
+        return {
+            'overall_statistics': {
+                'total_articles': stats.get('total_articles', 0),
+                'overall_success_rate': stats.get('total_valid_articles', 0) / max(stats.get('total_articles', 1), 1)
+            },
+            'quality_assessment': {
+                'quality_level': quality_level,
+                'average_quality_score': avg_quality
+            },
+            'validation_breakdown': validation_breakdown,
+            'recommendations': recommendations,
+            'timestamp': datetime.now().isoformat()
+        }
+
+    except Exception as e:
+        logger.error(f"Quality report generation failed: {e}")
+        raise HTTPException(status_code=500, detail="Failed to generate quality report")
+
 
+# Statistics Validation Endpoint
 @app.get("/validation/statistics")
 async def get_validation_statistics():
     """Get comprehensive validation statistics"""
@@ -1454,18 +1556,56 @@ async def get_validation_statistics():
 # Adding fallback to build quality report from metadata if generate_quality_report fails; improved error handling, logging, and richer report structure
 @app.get("/validation/quality-report")
 async def get_quality_report():
-    """Get comprehensive data quality report"""
+    """Get comprehensive data quality report with real validation data"""
     try:
-        # First try the existing generate_quality_report function
-        try:
-            report = generate_quality_report()
+        # Try to get real validation statistics
+        validation_stats = get_validation_stats()
+
+        if validation_stats and validation_stats.get('total_validations', 0) > 0:
+            # Generate report from real validation data
+            avg_quality = validation_stats.get('average_quality_score', 0.0)
+            breakdown = validation_stats.get('validation_breakdown', {})
+            total_validations = validation_stats.get('total_validations', 0)
 
-            if report and 'error' not in report:
-                return report
-        except Exception as e:
-            logger.warning(f"generate_quality_report failed: {e}, falling back to metadata")
+            # Assess quality level
+            if avg_quality > 0.8:
+                quality_level = "excellent"
+            elif avg_quality > 0.6:
+                quality_level = "good"
+            elif avg_quality > 0.4:
+                quality_level = "fair"
+            else:
+                quality_level = "poor"
+
+            # Generate recommendations
+            recommendations = []
+            invalid_rate = breakdown.get('INVALID', 0) / max(total_validations, 1)
+
+            if invalid_rate > 0.1:
+                recommendations.append("High rate of invalid inputs - consider input preprocessing")
+            if avg_quality < 0.5:
+                recommendations.append("Low average quality scores - review data sources")
+            if breakdown.get('LOW', 0) / max(total_validations, 1) > 0.2:
+                recommendations.append("Many low-quality inputs detected - implement content filtering")
+
+            return {
+                "report_timestamp": datetime.now().isoformat(),
+                "data_source": "real_validation_logs",
+                "overall_statistics": {
+                    "total_articles": validation_stats.get('total_articles', 0),
+                    "total_validations": total_validations,
+                    "overall_success_rate": validation_stats.get('total_valid_articles', 0) / max(validation_stats.get('total_articles', 1), 1)
+                },
+                "quality_assessment": {
+                    "quality_level": quality_level,
+                    "average_quality_score": avg_quality
+                },
+                "validation_breakdown": breakdown,
+                "recommendations": recommendations,
+                "quality_trends": validation_stats.get('quality_trends', [])
+            }
 
-        # Fallback: Generate report from model metadata
+        # Fallback to existing metadata-based approach
         metadata_path = path_manager.get_metadata_path()
 
         if not metadata_path.exists():
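Both this handler and get_validation_quality_report above bucket the average quality score with the same 0.8/0.6/0.4 thresholds. Extracting the bucketing into one shared function would keep the two endpoints from drifting apart; a sketch of that hypothetical helper:

def quality_level(avg_quality: float) -> str:
    # Thresholds copied from the handlers; boundaries are exclusive.
    if avg_quality > 0.8:
        return "excellent"
    if avg_quality > 0.6:
        return "good"
    if avg_quality > 0.4:
        return "fair"
    return "poor"

assert quality_level(0.85) == "excellent"
assert quality_level(0.6) == "fair"  # exactly 0.6 falls through to the next bucket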
@@ -1477,9 +1617,10 @@ async def get_quality_report():
         with open(metadata_path, 'r') as f:
             metadata = json.load(f)
 
-        # Create quality report from metadata
+        # Create quality report from metadata (existing code)
         quality_report = {
             "report_timestamp": datetime.now().isoformat(),
+            "data_source": "model_metadata",
             "overall_statistics": {
                 "total_articles": (metadata.get('train_size', 0) + metadata.get('test_size', 0)),
                 "overall_success_rate": 0.85 if metadata.get('test_f1', 0) > 0.7 else 0.65
@@ -1509,16 +1650,6 @@ async def get_quality_report():
 
     except HTTPException:
         raise
-    except FileNotFoundError:
-        raise HTTPException(
-            status_code=404,
-            detail="No validation statistics available"
-        )
-    except json.JSONDecodeError:
-        raise HTTPException(
-            status_code=500,
-            detail="Invalid metadata format"
-        )
     except Exception as e:
         logger.error(f"Failed to generate quality report: {e}")
         raise HTTPException(
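One behavioral consequence of this commit worth flagging: after the hunks above are applied, the app registers @app.get("/validation/quality-report") twice, once on the new get_validation_quality_report added next to get_validation_stats and again on the rewritten get_quality_report. Starlette resolves routes in registration order, so the first handler registered for a path serves every request and the second is silently shadowed. A standalone check of that behavior (a sketch, not from the commit; TestClient requires httpx):

from fastapi import FastAPI
from fastapi.testclient import TestClient

demo = FastAPI()

@demo.get("/report")
def first():
    return {"handler": "first"}

@demo.get("/report")  # registered second: never reached
def second():
    return {"handler": "second"}

client = TestClient(demo)
assert client.get("/report").json() == {"handler": "first"}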
 