#!/usr/bin/env python3
"""
Comprehensive Testing Framework for Ultra-Advanced Food Recognition
====================================================================

Testing suite for evaluating the ensemble model's performance,
accuracy, and robustness.

Evaluates:
- Model accuracy across different food categories
- Ensemble agreement and confidence calibration
- Image quality robustness
- Hallucination detection effectiveness
- Speed and memory usage
- Cross-cultural food recognition

Author: AI Assistant
Version: 1.0.0 - Comprehensive Testing Suite
"""

import json
import os
import statistics
import time
from typing import Any, Dict, List, Tuple

import numpy as np
from PIL import Image, ImageDraw, ImageEnhance

# Import our model
from app import UltraAdvancedFoodRecognizer, FOOD_CATEGORIES, select_device
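
# Note: the names imported from app are assumed to follow the interface used
# below: select_device() returns a device string, FOOD_CATEGORIES is an
# iterable of category names, and UltraAdvancedFoodRecognizer(device) exposes
# detect_food_advanced(img) and analyze_food(img, custom_categories=None).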

class FoodRecognitionTester:
    """Comprehensive testing framework for the food recognition model."""

    def __init__(self):
        self.device = select_device()
        print(f"Initializing test framework on {self.device.upper()}")
        self.recognizer = UltraAdvancedFoodRecognizer(self.device)
        self.test_results = {}

    def create_synthetic_test_images(self) -> List[Tuple[Image.Image, str, str]]:
        """Create synthetic test images for basic functionality testing."""
        test_images = []

        # Simple colored rectangles representing different foods
        test_cases = [
            ("apple", (220, 20, 60), "fruits"),           # Red apple
            ("banana", (255, 255, 0), "fruits"),          # Yellow banana
            ("broccoli", (34, 139, 34), "vegetables"),    # Green broccoli
            ("carrot", (255, 140, 0), "vegetables"),      # Orange carrot
            ("bread", (222, 184, 135), "grains_carbs"),   # Brown bread
            ("pizza", (255, 69, 0), "prepared_dishes"),   # Reddish pizza
        ]

        for food_name, color, category in test_cases:
            # Create a 224x224 image filled with the specified color
            img = Image.new('RGB', (224, 224), color)

            # Add some texture (simple per-pixel noise)
            draw = ImageDraw.Draw(img)
            for _ in range(50):
                x = np.random.randint(0, 224)
                y = np.random.randint(0, 224)
                noise_color = tuple(
                    max(0, min(255, c + np.random.randint(-30, 30))) for c in color
                )
                draw.point((x, y), fill=noise_color)

            test_images.append((img, food_name, category))

        return test_images
    def test_basic_functionality(self) -> Dict[str, Any]:
        """Test basic model functionality."""
        print("Testing basic functionality...")

        test_images = self.create_synthetic_test_images()
        results = {
            "total_tests": len(test_images),
            "passed": 0,
            "failed": 0,
            "details": []
        }

        for img, expected_food, expected_category in test_images:
            try:
                start_time = time.time()

                # Test food detection
                is_food, food_confidence, _ = self.recognizer.detect_food_advanced(img)

                # Test food analysis
                analysis = self.recognizer.analyze_food(img)

                processing_time = time.time() - start_time

                test_result = {
                    "expected_food": expected_food,
                    "expected_category": expected_category,
                    "detected_food": analysis["primary_label"],
                    "confidence": analysis["confidence"],
                    "is_food_detected": is_food,
                    "food_detection_confidence": food_confidence,
                    "processing_time_ms": round(processing_time * 1000, 2),
                    "status": "passed" if is_food and analysis["confidence"] > 0.1 else "failed"
                }

                if test_result["status"] == "passed":
                    results["passed"] += 1
                else:
                    results["failed"] += 1

                results["details"].append(test_result)

            except Exception as e:
                results["failed"] += 1
                results["details"].append({
                    "expected_food": expected_food,
                    "error": str(e),
                    "status": "error"
                })

        return results
    def test_ensemble_agreement(self) -> Dict[str, Any]:
        """Test ensemble model agreement and consistency."""
        print("Testing ensemble agreement...")

        test_images = self.create_synthetic_test_images()
        agreement_scores = []
        confidence_consistency = []

        for img, food_name, _ in test_images:
            try:
                analysis = self.recognizer.analyze_food(img)
                ensemble_details = analysis.get("ensemble_details", [])

                if len(ensemble_details) > 1:
                    # Label agreement: fraction of ensemble members voting for
                    # the most common label
                    labels = [pred["label"] for pred in ensemble_details]
                    label_counts = {}
                    for label in labels:
                        label_counts[label] = label_counts.get(label, 0) + 1

                    max_agreement = max(label_counts.values())
                    agreement_ratio = max_agreement / len(labels)
                    agreement_scores.append(agreement_ratio)

                    # Confidence consistency: 1 minus the clamped std dev of
                    # the members' confidences
                    confidences = [pred["confidence"] for pred in ensemble_details]
                    conf_std = np.std(confidences)
                    confidence_consistency.append(1.0 - min(conf_std, 1.0))

            except Exception as e:
                print(f"Error testing {food_name}: {e}")

        return {
            "average_agreement": statistics.mean(agreement_scores) if agreement_scores else 0,
            "agreement_std": statistics.stdev(agreement_scores) if len(agreement_scores) > 1 else 0,
            "confidence_consistency": statistics.mean(confidence_consistency) if confidence_consistency else 0,
            "tests_run": len(agreement_scores)
        }
    def test_image_quality_robustness(self) -> Dict[str, Any]:
        """Test model performance on various image qualities."""
        print("Testing image quality robustness...")

        # Base test image: solid red, standing in for an apple
        base_img = Image.new('RGB', (224, 224), (220, 20, 60))
        quality_tests = []

        # Test different brightness levels
        for brightness in [0.5, 0.8, 1.0, 1.2, 1.5]:
            bright_img = ImageEnhance.Brightness(base_img).enhance(brightness)
            try:
                analysis = self.recognizer.analyze_food(bright_img)
                quality_tests.append({
                    "test_type": "brightness",
                    "factor": brightness,
                    "confidence": analysis["confidence"],
                    "quality_score": analysis["visual_features"].get("estimated_quality", 0),
                    "hallucination_risk": analysis.get("confidence_analysis", {}).get("hallucination_risk", "unknown")
                })
            except Exception as e:
                quality_tests.append({
                    "test_type": "brightness",
                    "factor": brightness,
                    "error": str(e)
                })

        # Simulate blur by reducing sharpness
        for sharpness in [0.3, 0.5, 0.8, 1.0, 1.5]:
            sharp_img = ImageEnhance.Sharpness(base_img).enhance(sharpness)
            try:
                analysis = self.recognizer.analyze_food(sharp_img)
                quality_tests.append({
                    "test_type": "sharpness",
                    "factor": sharpness,
                    "confidence": analysis["confidence"],
                    "quality_score": analysis["visual_features"].get("estimated_quality", 0),
                    "hallucination_risk": analysis.get("confidence_analysis", {}).get("hallucination_risk", "unknown")
                })
            except Exception as e:
                quality_tests.append({
                    "test_type": "sharpness",
                    "factor": sharpness,
                    "error": str(e)
                })

        return {
            "total_quality_tests": len(quality_tests),
            "quality_test_details": quality_tests,
            "robustness_score": sum(1 for test in quality_tests if test.get("confidence", 0) > 0.3) / len(quality_tests)
        }
    def test_performance_benchmarks(self) -> Dict[str, Any]:
        """Test model performance and speed."""
        print("Testing performance benchmarks...")

        test_images = self.create_synthetic_test_images()
        processing_times = []
        memory_usage = []

        import psutil  # local import: only needed for the memory benchmark

        process = psutil.Process(os.getpid())

        for img, _, _ in test_images:
            # Measure memory before inference
            mem_before = process.memory_info().rss / 1024 / 1024  # MB

            # Time the inference
            start_time = time.time()
            try:
                self.recognizer.analyze_food(img)
                processing_time = time.time() - start_time
                processing_times.append(processing_time * 1000)  # convert to ms

                # Measure memory after inference
                mem_after = process.memory_info().rss / 1024 / 1024  # MB
                memory_usage.append(mem_after - mem_before)

            except Exception as e:
                print(f"Performance test error: {e}")

        return {
            "average_processing_time_ms": statistics.mean(processing_times) if processing_times else 0,
            "min_processing_time_ms": min(processing_times) if processing_times else 0,
            "max_processing_time_ms": max(processing_times) if processing_times else 0,
            "processing_time_std": statistics.stdev(processing_times) if len(processing_times) > 1 else 0,
            "average_memory_delta_mb": statistics.mean(memory_usage) if memory_usage else 0,
            "total_tests": len(processing_times)
        }
    def test_category_coverage(self) -> Dict[str, Any]:
        """Test coverage across food categories."""
        print("Testing category coverage...")

        category_stats = {}

        for category in FOOD_CATEGORIES:
            # Run a simple smoke test for each category on a generic blue image
            img = Image.new('RGB', (224, 224), (100, 150, 200))
            try:
                analysis = self.recognizer.analyze_food(img, custom_categories=[category])
                category_stats[category] = {
                    "confidence": analysis["confidence"],
                    "detected": analysis["primary_label"],
                    "status": "tested"
                }
            except Exception as e:
                category_stats[category] = {
                    "error": str(e),
                    "status": "error"
                }

        successful_tests = sum(1 for stat in category_stats.values() if stat["status"] == "tested")

        return {
            "total_categories": len(FOOD_CATEGORIES),
            "successfully_tested": successful_tests,
            "coverage_percentage": (successful_tests / len(FOOD_CATEGORIES)) * 100,
            "category_details": category_stats
        }
    def run_comprehensive_test_suite(self) -> Dict[str, Any]:
        """Run the complete test suite."""
        print("Starting comprehensive test suite...")
        print("=" * 60)

        start_time = time.time()

        # Run all tests
        test_results = {
            "test_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "device": self.device,
            "model_config": {
                "clip_model": self.recognizer.config.clip_model,
                "total_categories": len(FOOD_CATEGORIES),
                "models_loaded": self.recognizer.models_loaded
            }
        }

        # 1. Basic functionality
        test_results["basic_functionality"] = self.test_basic_functionality()

        # 2. Ensemble agreement
        test_results["ensemble_agreement"] = self.test_ensemble_agreement()

        # 3. Image quality robustness
        test_results["quality_robustness"] = self.test_image_quality_robustness()

        # 4. Performance benchmarks
        test_results["performance"] = self.test_performance_benchmarks()

        # 5. Category coverage
        test_results["category_coverage"] = self.test_category_coverage()

        total_time = time.time() - start_time
        test_results["total_test_time_seconds"] = round(total_time, 2)

        # Overall score: unweighted mean of the four normalized sub-scores
        basic_score = test_results["basic_functionality"]["passed"] / max(test_results["basic_functionality"]["total_tests"], 1)
        ensemble_score = test_results["ensemble_agreement"]["average_agreement"]
        quality_score = test_results["quality_robustness"]["robustness_score"]
        coverage_score = test_results["category_coverage"]["coverage_percentage"] / 100

        overall_score = (basic_score + ensemble_score + quality_score + coverage_score) / 4
        test_results["overall_score"] = round(overall_score * 100, 2)

        print("=" * 60)
        print(f"Test suite completed in {total_time:.2f} seconds")
        print(f"Overall Score: {test_results['overall_score']}%")
        print("=" * 60)

        return test_results

def main():
    """Run the testing framework."""
    tester = FoodRecognitionTester()
    results = tester.run_comprehensive_test_suite()

    # Save results
    with open("test_results.json", "w") as f:
        json.dump(results, f, indent=2)

    print("Test results saved to test_results.json")

    # Print summary
    print("\nTEST SUMMARY:")
    print(f"Overall Score: {results['overall_score']}%")
    print(f"Basic Tests: {results['basic_functionality']['passed']}/{results['basic_functionality']['total_tests']} passed")
    print(f"Ensemble Agreement: {results['ensemble_agreement']['average_agreement']:.2%}")
    print(f"Quality Robustness: {results['quality_robustness']['robustness_score']:.2%}")
    print(f"Category Coverage: {results['category_coverage']['coverage_percentage']:.1f}%")
    print(f"Avg Processing Time: {results['performance']['average_processing_time_ms']:.1f}ms")


if __name__ == "__main__":
    main()
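
# Usage sketch (the file name below is an assumption; adjust to this Space's layout):
#   python test_model.py
#
# To run a single check interactively instead of the full suite:
#   tester = FoodRecognitionTester()
#   print(json.dumps(tester.test_basic_functionality(), indent=2))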