{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": 10, "links": [], "panels": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The official product name of the GPU. This is an alphanumeric string. For all products.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 4, "x": 0, "y": 0 }, "id": 23, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "name", "wideLayout": true }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", "instant": true, "interval": "", "legendFormat": "{{name}}", "refId": "A" } ], "title": "Name", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "mappings": [ { "options": { "": { "text": "" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] }, "unit": "prefix:P" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 4, "y": 0 }, "id": 22, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "value", "wideLayout": true }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "P-State", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 6, "y": 0 }, "id": 6, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "GPU Utilization %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 9, "y": 0 }, "id": 21, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "Power Draw %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 0 }, "id": 4, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "Fan Speed %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Core GPU temperature. in degrees C.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 80 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 15, "y": 0 }, "id": 16, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Temperature", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line+area" } }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 18, "y": 0 }, "id": 11, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Memory Utilization %", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 2, "w": 3, "x": 0, "y": 3 }, "id": 14, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "name", "wideLayout": true }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", "instant": true, "interval": "", "legendFormat": "{{driver_version}}", "refId": "A" } ], "title": "Driver Version", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The BIOS of the GPU board.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 2, "w": 3, "x": 3, "y": 3 }, "id": 34, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "name", "wideLayout": true }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", "instant": true, "interval": "", "legendFormat": "{{vbios_version}}", "refId": "A" } ], "title": "Vbios Version", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 0, "mappings": [ { "options": { "0": { "text": "Not Active" }, "1": { "text": "Active" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 5 }, "id": 32, "options": { "displayMode": "gradient", "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "text": {}, "valueMode": "color" }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", "instant": false, "interval": "", "legendFormat": "Idle", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "HW Thermal Slowdown", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "SW Power Cap", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "App Clocks Setting", "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "HW Power Brake", "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "SW Thermal Slowdown", "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", "hide": false, "interval": "", "legendFormat": "Sync Boost", "refId": "G" } ], "title": "Throttle Reasons", "type": "bargauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 6, "y": 5 }, "id": 20, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "GPU Clock Speed %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of memory clock / Maximum frequency of memory clock", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 9, "y": 5 }, "id": 33, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "Memory Clock Speed %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total memory allocated by active contexts / Total installed GPU memory.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 5 }, "id": 25, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "Memory Allocation %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "green" }, { "color": "#EAB839", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 15, "y": 5 }, "id": 7, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "last" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true, "sizing": "auto", "text": {} }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "Memory Utilization %", "type": "gauge" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line+area" } }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 18, "y": 5 }, "id": 10, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "", "refId": "A" } ], "title": "GPU Utilization %", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Total memory allocated by active contexts.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 10 }, "id": 17, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Memory Allocation", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Core GPU temperature. in degrees C.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line+area" } }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" }, { "color": "orange", "value": 70 }, { "color": "red", "value": 80 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 6, "y": 10 }, "id": 15, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Temperature", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 12, "y": 10 }, "id": 8, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Power Draw", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line+area" } }, "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent" }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 18, "y": 10 }, "id": 9, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Fan Speed %", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of graphics (shader) clock.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "hertz" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 0, "y": 15 }, "id": 12, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", "format": "time_series", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Graphics Clock Speed", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of video encoder/decoder clock.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "hertz" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 6, "y": 15 }, "id": 19, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", "format": "time_series", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Video Clock Speed", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of SM (Streaming Multiprocessor) clock.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "hertz" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 12, "y": 15 }, "id": 24, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", "format": "time_series", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "SM Clock Speed", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "description": "Current frequency of memory clock.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" }, { "color": "red", "value": 80 } ] }, "unit": "hertz" }, "overrides": [] }, "gridPos": { "h": 5, "w": 6, "x": 18, "y": 15 }, "id": 18, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } }, "pluginVersion": "11.6.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "exemplar": true, "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", "format": "time_series", "interval": "", "legendFormat": "{{uuid}}", "refId": "A" } ], "title": "Memory Clock Speed", "type": "timeseries" } ], "preload": false, "refresh": "10s", "schemaVersion": 41, "tags": [ "nvidia", "nvidia-smi", "nvidia_gpu_exporter", "prometheus" ], "templating": { "list": [ { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(nvidia_smi_index, job)", "includeAll": false, "label": "Job", "name": "job", "options": [], "query": { "query": "label_values(nvidia_smi_index,job)", "refId": "Prometheus-job-Variable-Query" }, "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(nvidia_smi_index{job=\"$job\"},instance)", "includeAll": false, "label": "Host", "name": "node", "options": [], "query": { "query": "label_values(nvidia_smi_index{job=\"$job\"},instance)", "refId": "Prometheus-node-Variable-Query" }, "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", "includeAll": false, "label": "GPU", "name": "gpu", "options": [], "query": { "query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" }, { "current": { "text": "Prometheus", "value": "prometheus" }, "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "refresh": 1, "regex": "", "type": "datasource" } ] }, "time": { "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Nvidia GPU Metrics", "uid": "vlvPlrgnk", "version": 1 }