Improve Nvidia GPU stats (#14206)

* Add support for nvidia driver info
* Don't show temperature if detector isn't called coral
* Add encoder and decoder info for Nvidia GPUs
* Fix device info
* Implement GPU info for nvidia GPU
* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

---------

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
2024-10-07 20:15:31 -06:00
parent 23ce1e930d
commit 2541a345d0
6 changed files with 316 additions and 76 deletions
--- a/frigate/api/app.py
+++ b/frigate/api/app.py
@@ -28,7 +28,12 @@ from frigate.util.builtin import (
    get_tz_modifiers,
    update_yaml_from_url,
 )
-from frigate.util.services import ffprobe_stream, restart_frigate, vainfo_hwaccel
+from frigate.util.services import (
+    ffprobe_stream,
+    get_nvidia_driver_info,
+    restart_frigate,
+    vainfo_hwaccel,
+)
 from frigate.version import VERSION

 logger = logging.getLogger(__name__)
@@ -382,6 +387,11 @@ def vainfo():
    )


+@router.get("/nvinfo")
+def nvinfo():
+    return JSONResponse(content=get_nvidia_driver_info())
+
+
@router.get("/logs/{service}", tags=[Tags.logs])
 def logs(
    service: str = Path(enum=["frigate", "nginx", "go2rtc"]),
--- a/frigate/util/services.py
+++ b/frigate/util/services.py
@@ -339,7 +339,10 @@ def get_intel_gpu_stats() -> dict[str, str]:

 def try_get_info(f, h, default="N/A"):
    try:
-        v = f(h)
+        if h:
+            v = f(h)
+        else:
+            v = f()
    except nvml.NVMLError_NotSupported:
        v = default
    return v
@@ -356,6 +359,8 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
            enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
            dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
+            pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
+
            if util != "N/A":
                gpu_util = util.gpu
            else:
@@ -382,6 +387,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
                "mem": gpu_mem_util,
                "enc": enc_util,
                "dec": dec_util,
+                "pstate": pstate or "unknown",
            }
    except Exception:
        pass
@@ -432,6 +438,31 @@ def vainfo_hwaccel(device_name: Optional[str] = None) -> sp.CompletedProcess:
    return sp.run(ffprobe_cmd, capture_output=True)


+def get_nvidia_driver_info() -> dict[str, any]:
+    """Get general hardware info for nvidia GPU."""
+    results = {}
+    try:
+        nvml.nvmlInit()
+        deviceCount = nvml.nvmlDeviceGetCount()
+        for i in range(deviceCount):
+            handle = nvml.nvmlDeviceGetHandleByIndex(i)
+            driver = try_get_info(nvml.nvmlSystemGetDriverVersion, None, default=None)
+            cuda_compute = try_get_info(
+                nvml.nvmlDeviceGetCudaComputeCapability, handle, default=None
+            )
+            vbios = try_get_info(nvml.nvmlDeviceGetVbiosVersion, handle, default=None)
+            results[i] = {
+                "name": nvml.nvmlDeviceGetName(handle),
+                "driver": driver or "unknown",
+                "cuda_compute": cuda_compute or "unknown",
+                "vbios": vbios or "unknown",
+            }
+    except Exception:
+        pass
+    finally:
+        return results
+
+
 def auto_detect_hwaccel() -> str:
    """Detect hwaccel args by default."""
    try: