| """ |
| GPU Tools for SPARKNET |
| Tools for GPU monitoring and management |
| """ |
|
|
| from typing import Optional |
| from loguru import logger |
| from .base_tool import BaseTool, ToolResult |
| from ..utils.gpu_manager import get_gpu_manager |
|
|
|
|
class GPUMonitorTool(BaseTool):
    """Tool for monitoring GPU status, memory usage, and utilization."""

    def __init__(self):
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None)
        # Shared manager instance; assumed to expose get_gpu_info/get_all_gpu_info.
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Monitor GPU status.

        Args:
            gpu_id: Specific GPU ID, or None to report on all GPUs.

        Returns:
            ToolResult with a human-readable summary in ``output`` and the
            raw info dict(s) in ``metadata``.
        """
        try:
            if gpu_id is not None:
                # Single-GPU query: an "error" key in the info dict signals failure.
                info = self.gpu_manager.get_gpu_info(gpu_id)

                if "error" in info:
                    return ToolResult(
                        success=False,
                        output=None,
                        error=info["error"],
                    )

                return ToolResult(
                    success=True,
                    output=self._format_gpu_info(info),
                    metadata=info,
                )
            else:
                all_info = self.gpu_manager.get_all_gpu_info()

                # FIX: previously GPUs whose info contained an "error" were
                # silently dropped while still reporting success. Surface them
                # so the caller can see every device's state.
                output_lines = []
                for info in all_info:
                    if "error" in info:
                        output_lines.append(f"GPU (unavailable): {info['error']}")
                    else:
                        output_lines.append(self._format_gpu_info(info))

                # FIX: avoid returning success with an empty output string
                # when no GPU info was reported at all.
                output = "\n\n".join(output_lines) or "No GPU information available"

                return ToolResult(
                    success=True,
                    output=output,
                    metadata={"gpus": all_info},
                )

        except Exception as e:
            # Boundary handler: tool results carry the error instead of raising.
            logger.error(f"GPU monitoring error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {str(e)}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Format a single GPU info dict for display.

        Expects keys: gpu_id, name, memory_used/memory_total/memory_free
        (bytes — converted to GB here), memory_percent, gpu_utilization,
        temperature.
        """
        return (
            f"GPU {info['gpu_id']}: {info['name']}\n"
            f" Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB "
            f"({info['memory_percent']:.1f}% used)\n"
            f" Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n"
            f" GPU Utilization: {info['gpu_utilization']}%\n"
            f" Temperature: {info['temperature']}°C"
        )
|
|
|
|
class GPUSelectTool(BaseTool):
    """Tool that picks the best available GPU by free memory."""

    def __init__(self):
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        # Shared manager used for both selection and the follow-up info query.
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Choose the best GPU that satisfies the free-memory threshold.

        Args:
            min_memory_gb: Minimum free memory (in GB) a GPU must have.

        Returns:
            ToolResult whose metadata carries the selected GPU id and its
            info dict; a failed result if no GPU qualifies or lookup errors.
        """
        try:
            chosen = self.gpu_manager.select_best_gpu(min_memory_gb)

            # None means no device met the threshold — report as a tool failure.
            if chosen is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            details = self.gpu_manager.get_gpu_info(chosen)
            summary = (
                f"Selected GPU {chosen}: {details['name']}\n"
                f"Free Memory: {details['memory_free'] / 1024**3:.2f} GB"
            )

            return ToolResult(
                success=True,
                output=summary,
                metadata={
                    "gpu_id": chosen,
                    "gpu_info": details,
                },
            )

        except Exception as e:
            # Boundary handler: surface any unexpected error via the result.
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )
|