"""
GPU Monitoring Example for SPARKNET

Demonstrates GPU management and monitoring capabilities.
"""
|
|
import sys
import time
from pathlib import Path

# Make the project root importable so `src.*` resolves when this example
# is run directly from the examples/ directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from loguru import logger  # noqa: E402 (must follow the sys.path tweak)

from src.utils.gpu_manager import get_gpu_manager  # noqa: E402
from src.utils.logging import setup_logging  # noqa: E402
|
|
|
|
def _log_banner(title, *, leading_newline=True):
    """Log a section banner: a 70-char rule, the title, another rule.

    Args:
        title: Section heading to display between the rules.
        leading_newline: Prefix the first rule with a newline to separate
            the section from preceding output (the opening banner omits it).
    """
    rule = "=" * 70
    logger.info(("\n" + rule) if leading_newline else rule)
    logger.info(title)
    logger.info(rule)


def main():
    """Run GPU monitoring example.

    Walks through the GPU manager's capabilities in sequence: overall
    status, per-device details, best-GPU selection, the allocation
    context manager, and a summary of detected devices.
    """
    setup_logging(log_level="INFO")

    _log_banner("SPARKNET GPU Monitoring Example", leading_newline=False)

    gpu_manager = get_gpu_manager()

    _log_banner("All GPUs Status")
    # monitor() returns a pre-formatted status table; print() preserves its
    # layout instead of routing it through the logger line by line.
    print(gpu_manager.monitor())

    _log_banner("Detailed GPU Information")
    for info in gpu_manager.get_all_gpu_info():
        if "error" in info:
            # Skip devices the manager failed to query.
            continue
        logger.info(f"\nGPU {info['gpu_id']}: {info['name']}")
        # memory_* fields appear to be in bytes (divided by 1024**3 for GB)
        # — TODO confirm against gpu_manager's contract.
        logger.info(f"  Total Memory: {info['memory_total'] / 1024**3:.2f} GB")
        logger.info(f"  Used Memory: {info['memory_used'] / 1024**3:.2f} GB")
        logger.info(f"  Free Memory: {info['memory_free'] / 1024**3:.2f} GB")
        logger.info(f"  Memory Usage: {info['memory_percent']:.1f}%")
        logger.info(f"  GPU Utilization: {info['gpu_utilization']}%")
        logger.info(f"  Memory Util: {info['memory_utilization']}%")
        logger.info(f"  Temperature: {info['temperature']}°C")

    _log_banner("GPU Selection")
    min_memory = 2.0
    best_gpu = gpu_manager.select_best_gpu(min_memory_gb=min_memory)
    if best_gpu is not None:
        logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}")
        gpu_info = gpu_manager.get_gpu_info(best_gpu)
        # Guard against an error payload, mirroring the detail loop above
        # (the original indexed unconditionally and could raise KeyError).
        if "error" not in gpu_info:
            logger.info(f"Free memory: {gpu_info['memory_free'] / 1024**3:.2f} GB")
    else:
        logger.warning(f"\nNo GPU found with {min_memory} GB free memory")

    _log_banner("GPU Context Manager Test")
    try:
        with gpu_manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            logger.info(f"\nUsing GPU {gpu_id} in context")
            logger.info("This would be where you load and run your model")
            # Simulate a short workload while holding the GPU.
            time.sleep(1)
        logger.info("GPU context released and cache cleared")
    except RuntimeError as e:
        logger.error(f"Could not allocate GPU: {e}")

    _log_banner("Available GPUs Summary")
    available = gpu_manager.available_gpus
    logger.info(f"\nTotal GPUs detected: {len(available)}")
    logger.info(f"GPU IDs: {available}")
    logger.info(f"Primary GPU: {gpu_manager.primary_gpu}")
    logger.info(f"Fallback GPUs: {gpu_manager.fallback_gpus}")

    _log_banner("GPU Monitoring Example Completed")
|
|
|
|
if __name__ == "__main__":
    # Run the example only when executed as a script, not when imported.
    main()
|
|