diff --git a/nvidia-gpu-metrics.json b/nvidia-gpu-metrics.json new file mode 100644 index 0000000..91454ad --- /dev/null +++ b/nvidia-gpu-metrics.json @@ -0,0 +1,2163 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The official product name of the GPU. This is an alphanumeric string. For all products.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Name", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "": { + "text": "" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "prefix:P" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + "y": 0 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "P-State", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Draw %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent" + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{driver_version}}", + "refId": "A" + } + ], + "title": "Driver Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The BIOS of the GPU board.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{vbios_version}}", + "refId": "A" + } + ], + "title": "Vbios Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "Not Active" + }, + "1": { + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 32, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "Idle", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Thermal Slowdown", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Power Cap", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "App Clocks Setting", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Power Brake", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Thermal Slowdown", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "Sync Boost", + "refId": "G" + } + ], + "title": "Throttle Reasons", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of memory clock / Maximum frequency of memory clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 33, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total memory allocated by active contexts / Total installed GPU memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 25, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Allocation %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent" + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total memory allocated by active contexts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Allocation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent" + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Power Draw", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent" + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of graphics (shader) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Graphics Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of video encoder/decoder clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 15 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Video Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of SM (Streaming Multiprocessor) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 15 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "SM Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of memory clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 15 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Clock Speed", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "nvidia", + "nvidia-smi", + "nvidia_gpu_exporter", + "prometheus" + ], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index, job)", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index,job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index{job=\"$job\"},instance)", + "includeAll": false, + "label": "Host", + "name": "node", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index{job=\"$job\"},instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "includeAll": false, + "label": "GPU", + "name": "gpu", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "Prometheus", + "value": "prometheus" + }, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Nvidia GPU Metrics", + "uid": "vlvPlrgnk", + "version": 1 +} \ No newline at end of file