Ahmedik95316 committed on
Commit
cecd6fa
·
1 Parent(s): 92a45c5

Update app/streamlit_app.py

Browse files

Adding Data Validation Schemas

Files changed (1) hide show
  1. app/streamlit_app.py +77 -191
app/streamlit_app.py CHANGED
@@ -199,6 +199,17 @@ class StreamlitAppManager:
199
  logger.warning(f"Could not fetch validation health: {e}")
200
  return None
201
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  # Initialize app manager
204
  app_manager = StreamlitAppManager()
@@ -539,6 +550,63 @@ def render_cv_results_section():
539
  error_msg = cv_results.get('error', 'Unknown error') if cv_results else 'No CV results available'
540
  st.warning(f"Cross-validation results not available: {error_msg}")
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  def render_model_comparison_section():
543
  """Render model comparison results section"""
544
  st.subheader("⚖️ Model Comparison Results")
@@ -1431,12 +1499,10 @@ def main():
1431
  with tab5:
1432
  show_logs_section()
1433
 
1434
-
1435
  # Tab 6: System Status
1436
  with tab6:
1437
  render_system_status()
1438
 
1439
-
1440
  def render_system_status():
1441
  """Render system status tab"""
1442
  st.header("System Status & Monitoring")
@@ -1460,7 +1526,7 @@ def render_system_status():
1460
  with col1:
1461
  st.write(f"**Environment:** {env_info['environment']}")
1462
  st.write(f"**Base Directory:** {env_info['base_dir']}")
1463
- st.write(f"**Working Directory:** {env_info['current_working_directory']}")
1464
 
1465
  with col2:
1466
  st.write(f"**Data Directory:** {env_info['data_dir']}")
@@ -1484,13 +1550,11 @@ def render_system_status():
1484
  else:
1485
  st.error("🔴 System Status: Unhealthy")
1486
 
1487
- # Detailed health metrics
1488
  col1, col2, col3 = st.columns(3)
1489
-
1490
  with col1:
1491
  st.subheader("🤖 Model Health")
1492
  model_health = health_data.get('model_health', {})
1493
-
1494
  for key, value in model_health.items():
1495
  if key not in ['test_prediction', 'model_path', 'data_path', 'environment']:
1496
  display_key = key.replace('_', ' ').title()
@@ -1500,57 +1564,22 @@ def render_system_status():
1500
  else:
1501
  st.write(f"**{display_key}:** {value}")
1502
 
1503
- with col2:
1504
- st.subheader("💻 System Resources")
1505
- system_health = health_data.get('system_health', {})
1506
-
1507
- for key, value in system_health.items():
1508
- if isinstance(value, (int, float)):
1509
- st.metric(key.replace('_', ' ').title(),
1510
- f"{value:.1f}%")
1511
-
1512
- with col3:
1513
- st.subheader("🔗 API Health")
1514
- api_health = health_data.get('api_health', {})
1515
-
1516
- for key, value in api_health.items():
1517
- st.write(
1518
- f"**{key.replace('_', ' ').title()}:** {value}")
1519
-
1520
- # Environment details from API
1521
- env_data = health_data.get('environment_info', {})
1522
- if env_data:
1523
- st.subheader("📊 File Availability")
1524
-
1525
- col1, col2 = st.columns(2)
1526
-
1527
- with col1:
1528
- st.write("**Datasets:**")
1529
- datasets = env_data.get('available_datasets', {})
1530
- for name, exists in datasets.items():
1531
- status = "✅" if exists else "❌"
1532
- st.write(f"{status} {name}")
1533
-
1534
- with col2:
1535
- st.write("**Models:**")
1536
- models = env_data.get('available_models', {})
1537
- for name, exists in models.items():
1538
- status = "✅" if exists else "❌"
1539
- st.write(f"{status} {name}")
1540
-
1541
  except Exception as e:
1542
  st.error(f"Failed to get health status: {e}")
1543
-
1544
  else:
1545
  st.error("🔴 API Service is not available")
1546
 
 
 
 
 
 
 
1547
  # Model information
1548
  st.subheader("🎯 Model Information")
1549
-
1550
  metadata = load_json_file(path_manager.get_metadata_path(), {})
1551
  if metadata:
1552
  col1, col2 = st.columns(2)
1553
-
1554
  with col1:
1555
  for key in ['model_version', 'test_accuracy', 'test_f1', 'model_type']:
1556
  if key in metadata:
@@ -1560,7 +1589,6 @@ def render_system_status():
1560
  st.metric(display_key, f"{value:.4f}")
1561
  else:
1562
  st.metric(display_key, str(value))
1563
-
1564
  with col2:
1565
  for key in ['train_size', 'timestamp', 'environment']:
1566
  if key in metadata:
@@ -1568,114 +1596,14 @@ def render_system_status():
1568
  value = metadata[key]
1569
  if key == 'timestamp':
1570
  try:
1571
- dt = datetime.fromisoformat(
1572
- value.replace('Z', '+00:00'))
1573
  value = dt.strftime('%Y-%m-%d %H:%M:%S')
1574
  except:
1575
  pass
1576
  st.write(f"**{display_key}:** {value}")
1577
-
1578
  else:
1579
  st.warning("No model metadata available")
1580
 
1581
- st.divider()
1582
- show_validation_status()
1583
-
1584
- # Recent activity
1585
- st.subheader("📜 Recent Activity")
1586
-
1587
- activity_log = load_json_file(path_manager.get_activity_log_path(), [])
1588
- if activity_log:
1589
- recent_activities = activity_log[-10:] if len(
1590
- activity_log) > 10 else activity_log
1591
-
1592
- for entry in reversed(recent_activities):
1593
- timestamp = entry.get('timestamp', 'Unknown')
1594
- event = entry.get('event', 'Unknown event')
1595
- level = entry.get('level', 'INFO')
1596
-
1597
- if level == 'ERROR':
1598
- st.error(f"🔴 {timestamp} - {event}")
1599
- elif level == 'WARNING':
1600
- st.warning(f"🟡 {timestamp} - {event}")
1601
- else:
1602
- st.info(f"🔵 {timestamp} - {event}")
1603
-
1604
- else:
1605
- st.info("No recent activity logs found")
1606
-
1607
- # File system status
1608
- st.subheader("📁 File System Status")
1609
-
1610
- critical_files = [
1611
- (path_manager.get_model_file_path(), "Main Model"),
1612
- (path_manager.get_vectorizer_path(), "Vectorizer"),
1613
- (path_manager.get_combined_dataset_path(), "Training Dataset"),
1614
- (path_manager.get_metadata_path(), "Model Metadata")
1615
- ]
1616
-
1617
- col1, col2 = st.columns(2)
1618
-
1619
- with col1:
1620
- st.write("**Critical Files:**")
1621
- for file_path, description in critical_files:
1622
- if file_path.exists():
1623
- st.success(f"✅ {description}")
1624
- else:
1625
- st.error(f"❌ {description}")
1626
-
1627
- with col2:
1628
- # Disk usage information
1629
- try:
1630
- import shutil
1631
-
1632
- # Check disk usage for the base directory
1633
- base_path = path_manager.base_paths['base']
1634
- total, used, free = shutil.disk_usage(base_path)
1635
-
1636
- st.write("**Disk Usage:**")
1637
- st.write(f"Total: {total // (1024**3)} GB")
1638
- st.write(f"Used: {used // (1024**3)} GB")
1639
- st.write(f"Free: {free // (1024**3)} GB")
1640
-
1641
- usage_percent = (used / total) * 100
1642
- if usage_percent > 90:
1643
- st.error(f"⚠️ Disk usage: {usage_percent:.1f}%")
1644
- elif usage_percent > 75:
1645
- st.warning(f"⚠️ Disk usage: {usage_percent:.1f}%")
1646
- else:
1647
- st.success(f"✅ Disk usage: {usage_percent:.1f}%")
1648
-
1649
- except Exception as e:
1650
- st.error(f"Cannot check disk usage: {e}")
1651
-
1652
- # Initialize system button
1653
- if st.button("🔧 Initialize System", help="Run system initialization if components are missing"):
1654
- with st.spinner("Running system initialization..."):
1655
- try:
1656
- result = subprocess.run(
1657
- [sys.executable, str(path_manager.base_paths['base'] / "initialize_system.py")],
1658
- capture_output=True,
1659
- text=True,
1660
- timeout=300,
1661
- cwd=str(path_manager.base_paths['base'])
1662
- )
1663
-
1664
- if result.returncode == 0:
1665
- st.success(
1666
- "✅ System initialization completed successfully!")
1667
- st.code(result.stdout)
1668
- time.sleep(2)
1669
- st.rerun()
1670
- else:
1671
- st.error("❌ System initialization failed")
1672
- st.code(result.stderr)
1673
-
1674
- except subprocess.TimeoutExpired:
1675
- st.error("⏰ Initialization timed out")
1676
- except Exception as e:
1677
- st.error(f"❌ Initialization error: {e}")
1678
-
1679
 
1680
  # Auto-refresh logic
1681
  if st.session_state.auto_refresh:
@@ -1683,48 +1611,6 @@ if st.session_state.auto_refresh:
1683
  if time_since_refresh > timedelta(seconds=app_manager.config['refresh_interval']):
1684
  st.session_state.last_refresh = datetime.now()
1685
  st.rerun()
1686
-
1687
-
1688
- def show_validation_status():
1689
- """Display validation system status"""
1690
- st.subheader("Data Validation Status")
1691
-
1692
- validation_health = app_manager.get_validation_health_from_api()
1693
- validation_stats = app_manager.get_validation_statistics_from_api()
1694
-
1695
- if validation_health:
1696
- health_data = validation_health.get('validation_health', {})
1697
- overall_status = health_data.get('overall_status', 'unknown')
1698
-
1699
- if overall_status == 'healthy':
1700
- st.success("Validation System: Healthy")
1701
- elif overall_status == 'degraded':
1702
- st.warning("Validation System: Degraded")
1703
- else:
1704
- st.error("Validation System: Unhealthy")
1705
-
1706
- if validation_stats and validation_stats.get('statistics_available'):
1707
- overall_metrics = validation_stats.get('overall_metrics', {})
1708
-
1709
- col1, col2, col3, col4 = st.columns(4)
1710
-
1711
- with col1:
1712
- st.metric("Total Validations", overall_metrics.get('total_validations', 0))
1713
-
1714
- with col2:
1715
- st.metric("Articles Processed", overall_metrics.get('total_articles_processed', 0))
1716
-
1717
- with col3:
1718
- success_rate = overall_metrics.get('overall_success_rate', 0)
1719
- st.metric("Success Rate", f"{success_rate:.1%}")
1720
-
1721
- with col4:
1722
- quality_score = overall_metrics.get('average_quality_score', 0)
1723
- st.metric("Quality Score", f"{quality_score:.3f}")
1724
-
1725
- else:
1726
- st.info("No validation statistics available yet")
1727
-
1728
 
1729
  # Run main application
1730
  if __name__ == "__main__":
 
199
  logger.warning(f"Could not fetch validation health: {e}")
200
  return None
201
 
202
def get_validation_quality_report_from_api(self):
    """Fetch the validation quality report from the backend API.

    Sends a GET request to ``<api_url>/validation/quality-report`` through
    ``self.session`` with a 10-second timeout.

    Returns:
        The decoded JSON body when ``self.api_available`` is truthy and the
        API answers with HTTP 200. ``None`` when the API is flagged as
        unavailable, when the status code is anything other than 200, or
        when any exception is raised while requesting or decoding.
    """
    try:
        if not self.api_available:
            return None

        endpoint = f"{self.config['api_url']}/validation/quality-report"
        response = self.session.get(endpoint, timeout=10)
        if response.status_code != 200:
            return None
        return response.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch, so a failure is
        # logged and reported to the caller as "no report" instead of raising.
        logger.warning(f"Could not fetch quality report: {e}")
        return None
212
+
213
 
214
  # Initialize app manager
215
  app_manager = StreamlitAppManager()
 
550
  error_msg = cv_results.get('error', 'Unknown error') if cv_results else 'No CV results available'
551
  st.warning(f"Cross-validation results not available: {error_msg}")
552
 
553
def render_validation_statistics_section():
    """Render the "Data Validation Statistics" section.

    Reads statistics via ``app_manager.get_validation_statistics_from_api()``.
    When the payload is truthy and its ``statistics_available`` entry is
    truthy, four metrics from ``overall_metrics`` are shown side by side:
    total validations, articles processed, success rate (as a percentage)
    and average quality score (three decimals). Otherwise an informational
    placeholder is displayed.
    """
    st.subheader("📊 Data Validation Statistics")

    validation_stats = app_manager.get_validation_statistics_from_api()
    if not (validation_stats and validation_stats.get('statistics_available')):
        st.info("No validation statistics available yet.")
        return

    # `or` (rather than a .get default) also covers keys that are present
    # but null in the JSON payload; formatting None with ':.1%' or ':.3f'
    # would raise TypeError.
    overall_metrics = validation_stats.get('overall_metrics') or {}
    success_rate = overall_metrics.get('overall_success_rate') or 0
    quality_score = overall_metrics.get('average_quality_score') or 0

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Validations", overall_metrics.get('total_validations', 0))
    with col2:
        st.metric("Articles Processed", overall_metrics.get('total_articles_processed', 0))
    with col3:
        st.metric("Success Rate", f"{success_rate:.1%}")
    with col4:
        st.metric("Avg Quality", f"{quality_score:.3f}")
575
+
576
def render_validation_quality_report():
    """Render the "Data Quality Report" section.

    Reads the report via
    ``app_manager.get_validation_quality_report_from_api()``. When the report
    is truthy and has no ``'error'`` key, the section shows:

    * the total article count and overall success rate (as a percentage),
    * the quality level, styled by severity (``excellent`` -> success,
      ``good`` -> info, ``fair`` -> warning, anything else -> error),
    * a numbered list of recommendations, if any.

    Otherwise an informational placeholder is displayed.
    """
    st.subheader("📋 Data Quality Report")

    quality_report = app_manager.get_validation_quality_report_from_api()
    if not quality_report or 'error' in quality_report:
        st.info("Quality report not available yet.")
        return

    # `or` (rather than a .get default) also covers keys that are present
    # but null in the JSON payload.
    overall_stats = quality_report.get('overall_statistics') or {}
    quality_assessment = quality_report.get('quality_assessment') or {}

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Articles", overall_stats.get('total_articles', 0))
        # A null rate would raise TypeError under the ':.1%' format spec.
        success_rate = overall_stats.get('overall_success_rate') or 0
        st.metric("Success Rate", f"{success_rate:.1%}")
    with col2:
        # Coerce to str so a null or non-string level cannot break .title().
        quality_level = str(quality_assessment.get('quality_level') or 'unknown')
        level_renderers = {
            'excellent': st.success,
            'good': st.info,
            'fair': st.warning,
        }
        show_level = level_renderers.get(quality_level, st.error)
        show_level(f"Quality Level: {quality_level.title()}")

    recommendations = quality_report.get('recommendations') or []
    if recommendations:
        st.subheader("💡 Recommendations")
        for i, rec in enumerate(recommendations, 1):
            st.write(f"{i}. {rec}")
608
+
609
+
610
  def render_model_comparison_section():
611
  """Render model comparison results section"""
612
  st.subheader("⚖️ Model Comparison Results")
 
1499
  with tab5:
1500
  show_logs_section()
1501
 
 
1502
  # Tab 6: System Status
1503
  with tab6:
1504
  render_system_status()
1505
 
 
1506
  def render_system_status():
1507
  """Render system status tab"""
1508
  st.header("System Status & Monitoring")
 
1526
  with col1:
1527
  st.write(f"**Environment:** {env_info['environment']}")
1528
  st.write(f"**Base Directory:** {env_info['base_dir']}")
1529
+ st.write(f"**Working Directory:** {env_info.get('current_working_directory', 'N/A')}")
1530
 
1531
  with col2:
1532
  st.write(f"**Data Directory:** {env_info['data_dir']}")
 
1550
  else:
1551
  st.error("🔴 System Status: Unhealthy")
1552
 
1553
+ # Basic health display
1554
  col1, col2, col3 = st.columns(3)
 
1555
  with col1:
1556
  st.subheader("🤖 Model Health")
1557
  model_health = health_data.get('model_health', {})
 
1558
  for key, value in model_health.items():
1559
  if key not in ['test_prediction', 'model_path', 'data_path', 'environment']:
1560
  display_key = key.replace('_', ' ').title()
 
1564
  else:
1565
  st.write(f"**{display_key}:** {value}")
1566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1567
  except Exception as e:
1568
  st.error(f"Failed to get health status: {e}")
 
1569
  else:
1570
  st.error("🔴 API Service is not available")
1571
 
1572
+ # Add the validation sections as specified in the document
1573
+ st.divider()
1574
+ render_validation_statistics_section()
1575
+ st.divider()
1576
+ render_validation_quality_report()
1577
+
1578
  # Model information
1579
  st.subheader("🎯 Model Information")
 
1580
  metadata = load_json_file(path_manager.get_metadata_path(), {})
1581
  if metadata:
1582
  col1, col2 = st.columns(2)
 
1583
  with col1:
1584
  for key in ['model_version', 'test_accuracy', 'test_f1', 'model_type']:
1585
  if key in metadata:
 
1589
  st.metric(display_key, f"{value:.4f}")
1590
  else:
1591
  st.metric(display_key, str(value))
 
1592
  with col2:
1593
  for key in ['train_size', 'timestamp', 'environment']:
1594
  if key in metadata:
 
1596
  value = metadata[key]
1597
  if key == 'timestamp':
1598
  try:
1599
+ dt = datetime.fromisoformat(value.replace('Z', '+00:00'))
 
1600
  value = dt.strftime('%Y-%m-%d %H:%M:%S')
1601
  except:
1602
  pass
1603
  st.write(f"**{display_key}:** {value}")
 
1604
  else:
1605
  st.warning("No model metadata available")
1606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1607
 
1608
  # Auto-refresh logic
1609
  if st.session_state.auto_refresh:
 
1611
  if time_since_refresh > timedelta(seconds=app_manager.config['refresh_interval']):
1612
  st.session_state.last_refresh = datetime.now()
1613
  st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1614
 
1615
  # Run main application
1616
  if __name__ == "__main__":