Improve Nvidia GPU stats (#14206)

* Add support for nvidia driver info

* Don't show temperature if detector isn't called coral

* Add encoder and decoder info for Nvidia GPUs

* Fix device info

* Implement GPU info for nvidia GPU

* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

---------

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
This commit is contained in:
Nicolas Mowen
2024-10-07 20:15:31 -06:00
committed by GitHub
parent 23ce1e930d
commit 2541a345d0
6 changed files with 316 additions and 76 deletions

View File

@@ -28,7 +28,12 @@ from frigate.util.builtin import (
get_tz_modifiers,
update_yaml_from_url,
)
from frigate.util.services import ffprobe_stream, restart_frigate, vainfo_hwaccel
from frigate.util.services import (
ffprobe_stream,
get_nvidia_driver_info,
restart_frigate,
vainfo_hwaccel,
)
from frigate.version import VERSION
logger = logging.getLogger(__name__)
@@ -382,6 +387,11 @@ def vainfo():
)
@router.get("/nvinfo")
def nvinfo():
    """Serve the output of ``get_nvidia_driver_info()`` as a JSON response."""
    driver_info = get_nvidia_driver_info()
    return JSONResponse(content=driver_info)
@router.get("/logs/{service}", tags=[Tags.logs])
def logs(
service: str = Path(enum=["frigate", "nginx", "go2rtc"]),

View File

@@ -339,7 +339,10 @@ def get_intel_gpu_stats() -> dict[str, str]:
def try_get_info(f, h, default="N/A"):
    """Run an NVML query, falling back to ``default`` when it is unsupported.

    Args:
        f: The NVML query callable to invoke.
        h: Handle passed as the single argument to ``f``. Pass ``None`` for
            queries that take no arguments, in which case ``f`` is called
            with no arguments at all.
        default: Value returned when the query raises
            ``nvml.NVMLError_NotSupported``.

    Returns:
        Whatever ``f`` returns, or ``default`` if the query is not supported.
        Any other exception raised by ``f`` propagates to the caller.
    """
    try:
        # Test for None explicitly: a truthiness check would route a falsy
        # (but supplied) handle to the zero-argument call and fail with a
        # TypeError instead of performing the query.
        if h is not None:
            return f(h)

        return f()
    except nvml.NVMLError_NotSupported:
        return default
@@ -356,6 +359,8 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
if util != "N/A":
gpu_util = util.gpu
else:
@@ -382,6 +387,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
"mem": gpu_mem_util,
"enc": enc_util,
"dec": dec_util,
"pstate": pstate or "unknown",
}
except Exception:
pass
@@ -432,6 +438,31 @@ def vainfo_hwaccel(device_name: Optional[str] = None) -> sp.CompletedProcess:
return sp.run(ffprobe_cmd, capture_output=True)
def get_nvidia_driver_info() -> dict[int, dict]:
    """Get general hardware info for nvidia GPU.

    Returns:
        A mapping of device index to a dict with the keys ``name``,
        ``driver``, ``cuda_compute`` and ``vbios``. Values that NVML reports
        as unsupported (or that come back empty) are replaced with
        ``"unknown"``. If anything fails while querying NVML the result is
        an empty dict; this function is best-effort and does not raise for
        ordinary errors.
    """
    import logging

    results: dict[int, dict] = {}

    try:
        nvml.nvmlInit()

        # The driver version is queried without a device handle, so it is the
        # same for every device: fetch it once instead of once per GPU.
        driver = try_get_info(nvml.nvmlSystemGetDriverVersion, None, default=None)

        for i in range(nvml.nvmlDeviceGetCount()):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            cuda_compute = try_get_info(
                nvml.nvmlDeviceGetCudaComputeCapability, handle, default=None
            )
            vbios = try_get_info(nvml.nvmlDeviceGetVbiosVersion, handle, default=None)
            results[i] = {
                "name": nvml.nvmlDeviceGetName(handle),
                "driver": driver or "unknown",
                "cuda_compute": cuda_compute or "unknown",
                "vbios": vbios or "unknown",
            }
    except Exception:
        # Deliberately best-effort: any failure yields an empty result.
        # Leave a debug trace rather than discarding the error silently.
        logging.getLogger(__name__).debug(
            "Unable to read NVIDIA driver info", exc_info=True
        )

    # Returned after the try block instead of from a ``finally`` clause: a
    # ``return`` inside ``finally`` would also swallow KeyboardInterrupt and
    # SystemExit.
    return results
def auto_detect_hwaccel() -> str:
"""Detect hwaccel args by default."""
try: