dashboards/nvidia-gpu-metrics.json

2163 lines
54 KiB
JSON

{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 10,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The official product name of the GPU. This is an alphanumeric string. For all products.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 0
},
"id": 23,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"text": {},
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"instant": true,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"title": "Name",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [
{
"options": {
"": {
"text": ""
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
}
]
},
"unit": "prefix:P"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 2,
"x": 4,
"y": 0
},
"id": 22,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"text": {},
"textMode": "value",
"wideLayout": true
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_pstate{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "P-State",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 6,
"y": 0
},
"id": 6,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "GPU Utilization %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 9,
"y": 0
},
"id": 21,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Power Draw %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 12,
"y": 0
},
"id": 4,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Fan Speed %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Core GPU temperature. in degrees C.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 15,
"y": 0
},
"id": 16,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Temperature",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent"
},
{
"color": "orange",
"value": 0.7
},
{
"color": "red",
"value": 0.9
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 0
},
"id": 11,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Memory Utilization %",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 0,
"y": 3
},
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"text": {},
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"instant": true,
"interval": "",
"legendFormat": "{{driver_version}}",
"refId": "A"
}
],
"title": "Driver Version",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The BIOS of the GPU board.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 3,
"y": 3
},
"id": 34,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"text": {},
"textMode": "name",
"wideLayout": true
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"instant": true,
"interval": "",
"legendFormat": "{{vbios_version}}",
"refId": "A"
}
],
"title": "Vbios Version",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [
{
"options": {
"0": {
"text": "Not Active"
},
"1": {
"text": "Active"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 5
},
"id": 32,
"options": {
"displayMode": "gradient",
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"maxVizHeight": 300,
"minVizHeight": 16,
"minVizWidth": 8,
"namePlacement": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"sizing": "auto",
"text": {},
"valueMode": "color"
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}",
"instant": false,
"interval": "",
"legendFormat": "Idle",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "HW Thermal Slowdown",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "SW Power Cap",
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "App Clocks Setting",
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "HW Power Brake",
"refId": "E"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "SW Thermal Slowdown",
"refId": "F"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}",
"hide": false,
"interval": "",
"legendFormat": "Sync Boost",
"refId": "G"
}
],
"title": "Throttle Reasons",
"type": "bargauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 6,
"y": 5
},
"id": 20,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Clock Speed %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of memory clock / Maximum frequency of memory clock",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 9,
"y": 5
},
"id": 33,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Clock Speed %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Total memory allocated by active contexts / Total installed GPU memory.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 12,
"y": 5
},
"id": 25,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Allocation %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green"
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 3,
"x": 15,
"y": 5
},
"id": 7,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto",
"text": {}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Memory Utilization %",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent"
},
{
"color": "orange",
"value": 0.7
},
{
"color": "red",
"value": 0.9
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 5
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "GPU Utilization %",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Total memory allocated by active contexts.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 10
},
"id": 17,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Memory Allocation",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Core GPU temperature. in degrees C.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent"
},
{
"color": "orange",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 10
},
"id": 15,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 12,
"y": 10
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Power Draw",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"max": 1,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent"
},
{
"color": "orange",
"value": 0.7
},
{
"color": "red",
"value": 0.9
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 10
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Fan Speed %",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of graphics (shader) clock.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 15
},
"id": 12,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}",
"format": "time_series",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Graphics Clock Speed",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of video encoder/decoder clock.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 15
},
"id": 19,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}",
"format": "time_series",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Video Clock Speed",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of SM (Streaming Multiprocessor) clock.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 12,
"y": 15
},
"id": 24,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}",
"format": "time_series",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "SM Clock Speed",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current frequency of memory clock.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 15
},
"id": 18,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "11.6.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}",
"format": "time_series",
"interval": "",
"legendFormat": "{{uuid}}",
"refId": "A"
}
],
"title": "Memory Clock Speed",
"type": "timeseries"
}
],
"preload": false,
"refresh": "10s",
"schemaVersion": 41,
"tags": [
"nvidia",
"nvidia-smi",
"nvidia_gpu_exporter",
"prometheus"
],
"templating": {
"list": [
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(nvidia_smi_index, job)",
"includeAll": false,
"label": "Job",
"name": "job",
"options": [],
"query": {
"query": "label_values(nvidia_smi_index,job)",
"refId": "Prometheus-job-Variable-Query"
},
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(nvidia_smi_index{job=\"$job\"},instance)",
"includeAll": false,
"label": "Host",
"name": "node",
"options": [],
"query": {
"query": "label_values(nvidia_smi_index{job=\"$job\"},instance)",
"refId": "Prometheus-node-Variable-Query"
},
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)",
"includeAll": false,
"label": "GPU",
"name": "gpu",
"options": [],
"query": {
"query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "Prometheus",
"value": "prometheus"
},
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Nvidia GPU Metrics",
"uid": "vlvPlrgnk",
"version": 1
}