managedClusters
Name | Type | Description |
---|---|---|
cluster_autoscaler_cluster_safe_to_autoscale | Metric | Determines whether or not cluster autoscaler will take action on the cluster |
cluster_autoscaler_unschedulable_pods_count | Metric | Number of pods that are currently unschedulable in the cluster |
kube_node_status_allocatable_cpu_cores | Metric | Total number of available cpu cores in a managed cluster |
kube_node_status_allocatable_memory_bytes | Metric | Total amount of available memory in a managed cluster |
kube_node_status_condition | Metric | Statuses for various node conditions |
kube_pod_status_phase | Metric | Number of pods by phase |
kube_pod_status_ready | Metric | Number of pods in Ready state |
node_cpu_usage_percentage | Metric | Aggregated average CPU utilization measured in percentage across the cluster |
node_disk_usage_percentage | Metric | Disk space used in percent by device |
node_memory_rss_percentage | Metric | Container RSS memory used in percent |
node_memory_working_set_percentage | Metric | Container working set memory used in percent |
Dashboards:
Click a tab to view the dashboard template
{
"__inputs": [],
"__elements": {},
"__requires": [
{
"type": "panel",
"id": "bargauge",
"name": "Bar gauge",
"version": ""
},
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.5.12"
},
{
"type": "datasource",
"id": "grafana-azure-monitor-datasource",
"name": "Azure Monitor",
"version": "1.0.0"
}
],
"title": "Managed clusters",
"editable": true,
"links": [],
"liveNow": false,
"panels": [
{
"title": "cluster_autoscaler_cluster_safe_to_autoscale",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-red",
"value": 0
},
{
"color": "dark-green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'cluster_autoscaler_cluster_safe_to_autoscale'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "cluster_autoscaler_unschedulable_pods_count",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'cluster_autoscaler_unschedulable_pods_count'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "kube_node_status_allocatable_cpu_cores",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-red",
"value": 0
},
{
"color": "dark-green",
"value": 2
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'kube_node_status_allocatable_cpu_cores'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "kube_node_status_allocatable_memory_bytes",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-red",
"value": 0
},
{
"color": "dark-green",
"value": 2147483648
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'kube_node_status_allocatable_memory_bytes'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "kube_node_status_condition",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'kube_node_status_condition'\r\n| summarize metric = avg(Total) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "kube_pod_status_phase",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 6,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'kube_pod_status_phase'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "kube_pod_status_ready",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-red",
"value": 0
},
{
"color": "dark-green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 7,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'kube_pod_status_ready'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "node_cpu_usage_percentage",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 95
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 8,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'node_cpu_usage_percentage'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "node_disk_usage_percentage",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 9,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'node_disk_usage_percentage'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "node_memory_rss_percentage",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 90
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 10,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'node_memory_rss_percentage'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
},
{
"title": "node_memory_working_set_percentage",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 100
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 40
},
"id": 11,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.ContainerService/managedClusters'\r\n| where MetricName has 'node_memory_working_set_percentage'\r\n| summarize metric = avg(Average) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
}
],
"refresh": "",
"schemaVersion": 38,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "ds",
"options": [],
"query": "grafana-azure-monitor-datasource",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Subscription",
"multi": false,
"name": "sub",
"options": [],
"query": {
"azureLogAnalytics": {
"query": "",
"resources": []
},
"queryType": "Azure Subscriptions",
"refId": "A"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"version": null
}
cluster_autoscaler_cluster_safe_to_autoscale - Metric Alert
Determines whether or not cluster autoscaler will take action on the cluster
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | cluster_autoscaler_cluster_safe_to_autoscale |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | LessThan |
severity | 3 |
threshold | 1 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Determines whether or not cluster autoscaler will take action on the cluster",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "LessThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "1",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "cluster_autoscaler_cluster_safe_to_autoscale",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Determines whether or not cluster autoscaler will take action on the cluster'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'LessThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 1
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'cluster_autoscaler_cluster_safe_to_autoscale'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "49fd8bac-d061-459d-8d80-a048c4c8ba56",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters cluster_autoscaler_cluster_safe_to_autoscale Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters cluster_autoscaler_cluster_safe_to_autoscale Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "1"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "cluster_autoscaler_cluster_safe_to_autoscale"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "LessThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-cluster_autoscaler_cluster_safe_to_autoscale-threshold-Override_'), field('tags._amba-cluster_autoscaler_cluster_safe_to_autoscale-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-cluster_autoscaler_cluster_safe_to_autoscale')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters cluster_autoscaler_cluster_safe_to_autoscale",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "cluster_autoscaler_cluster_safe_to_autoscale",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "cluster_autoscaler_cluster_safe_to_autoscale",
"operator": "LessThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-cluster_autoscaler_cluster_safe_to_autoscale-threshold-Override_'), field('tags._amba-cluster_autoscaler_cluster_safe_to_autoscale-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
cluster_autoscaler_unschedulable_pods_count - Metric Alert
Number of pods that are currently unschedulable in the cluster
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | cluster_autoscaler_unschedulable_pods_count |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 0 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Number of pods that are currently unschedulable in the cluster",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "0",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "cluster_autoscaler_unschedulable_pods_count",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Number of pods that are currently unschedulable in the cluster'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 0
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'cluster_autoscaler_unschedulable_pods_count'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "301ca0e3-fc88-4285-be99-a2a587c412f5",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters cluster_autoscaler_unschedulable_pods_count Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters cluster_autoscaler_unschedulable_pods_count Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "0"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "cluster_autoscaler_unschedulable_pods_count"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-cluster_autoscaler_unschedulable_pods_count-threshold-Override_'), field('tags._amba-cluster_autoscaler_unschedulable_pods_count-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-cluster_autoscaler_unschedulable_pods_count')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters cluster_autoscaler_unschedulable_pods_count",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "cluster_autoscaler_unschedulable_pods_count",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "cluster_autoscaler_unschedulable_pods_count",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-cluster_autoscaler_unschedulable_pods_count-threshold-Override_'), field('tags._amba-cluster_autoscaler_unschedulable_pods_count-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
kube_node_status_allocatable_cpu_cores - Metric Alert
Total number of available cpu cores in a managed cluster
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | kube_node_status_allocatable_cpu_cores |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | LessThan |
severity | 3 |
threshold | 2 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Total number of available cpu cores in a managed cluster",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "LessThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "2",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "kube_node_status_allocatable_cpu_cores",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Total number of available cpu cores in a managed cluster'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'LessThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 2
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'kube_node_status_allocatable_cpu_cores'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "64a872f9-5ec6-4121-acad-edd12f4c3466",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters kube_node_status_allocatable_cpu_cores Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters kube_node_status_allocatable_cpu_cores Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "2"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "kube_node_status_allocatable_cpu_cores"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "LessThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-kube_node_status_allocatable_cpu_cores-threshold-Override_'), field('tags._amba-kube_node_status_allocatable_cpu_cores-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-kube_node_status_allocatable_cpu_cores')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters kube_node_status_allocatable_cpu_cores",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "kube_node_status_allocatable_cpu_cores",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "kube_node_status_allocatable_cpu_cores",
"operator": "LessThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-kube_node_status_allocatable_cpu_cores-threshold-Override_'), field('tags._amba-kube_node_status_allocatable_cpu_cores-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
kube_node_status_allocatable_memory_bytes - Metric Alert
Total amount of available memory in a managed cluster
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | kube_node_status_allocatable_memory_bytes |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | LessThan |
severity | 3 |
threshold | 2.147483648e+09 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Total amount of available memory in a managed cluster",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "LessThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "2147483648",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "kube_node_status_allocatable_memory_bytes",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Total amount of available memory in a managed cluster'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'LessThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 2147483648
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'kube_node_status_allocatable_memory_bytes'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "14fa63fb-7da9-4d2e-9de9-203c4a3e0401",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters kube_node_status_allocatable_memory_bytes Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters kube_node_status_allocatable_memory_bytes Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "2147483648"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "kube_node_status_allocatable_memory_bytes"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "LessThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-kube_node_status_allocatable_memory_bytes-threshold-Override_'), field('tags._amba-kube_node_status_allocatable_memory_bytes-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-kube_node_status_allocatable_memory_bytes')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters kube_node_status_allocatable_memory_bytes",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "kube_node_status_allocatable_memory_bytes",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "kube_node_status_allocatable_memory_bytes",
"operator": "LessThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-kube_node_status_allocatable_memory_bytes-threshold-Override_'), field('tags._amba-kube_node_status_allocatable_memory_bytes-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
kube_node_status_condition - Metric Alert
Statuses for various node conditions
Properties:
criterionType | StaticThresholdCriterion |
dimensions |
|
evaluationFrequency | PT1M |
metricName | kube_node_status_condition |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 0 |
timeAggregation | Total |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Statuses for various node conditions",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "0",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Total",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "kube_node_status_condition",
"dimensions": [{"operator": "include", "name": "status2", "values": ["notready", "unknown"]}],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Statuses for various node conditions'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 0
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Total'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'kube_node_status_condition'
dimensions: [
{
name: 'status2'
operator: 'include'
values: ['notready','unknown']
}]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "ebbbbb92-5208-4d52-b407-6bea8e4473b9",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters kube_node_status_condition Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters kube_node_status_condition Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "0"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "kube_node_status_condition"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Total"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-kube_node_status_condition-threshold-Override_'), field('tags._amba-kube_node_status_condition-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-kube_node_status_condition')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters kube_node_status_condition",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "kube_node_status_condition",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "kube_node_status_condition",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Total",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-kube_node_status_condition-threshold-Override_'), field('tags._amba-kube_node_status_condition-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
kube_pod_status_phase - Metric Alert
Number of pods by phase
Properties:
criterionType | StaticThresholdCriterion |
dimensions |
|
evaluationFrequency | PT1M |
metricName | kube_pod_status_phase |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 0 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Number of pods by phase",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "0",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "kube_pod_status_phase",
"dimensions": [{"operator": "include", "name": "phase", "values": ["failed"]}],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Number of pods by phase'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 0
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'kube_pod_status_phase'
dimensions: [
{
name: 'phase'
operator: 'include'
values: ['failed']
}]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "e38e6e11-ac88-4014-98c8-8e64f70b832a",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters kube_pod_status_phase Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters kube_pod_status_phase Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "0"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "kube_pod_status_phase"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-kube_pod_status_phase-threshold-Override_'), field('tags._amba-kube_pod_status_phase-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-kube_pod_status_phase')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters kube_pod_status_phase",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "kube_pod_status_phase",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "kube_pod_status_phase",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-kube_pod_status_phase-threshold-Override_'), field('tags._amba-kube_pod_status_phase-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
kube_pod_status_ready - Metric Alert
Number of pods in Ready state
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | kube_pod_status_ready |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | LessThan |
severity | 2 |
threshold | 1 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Number of pods in Ready state",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 2,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "LessThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "1",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "kube_pod_status_ready",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Number of pods in Ready state'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 2
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'LessThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 1
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'kube_pod_status_ready'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "cebf00b7-7294-4cf7-bc50-108f999d0c67",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters kube_pod_status_ready Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters kube_pod_status_ready Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "2"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "1"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "kube_pod_status_ready"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "LessThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-kube_pod_status_ready-threshold-Override_'), field('tags._amba-kube_pod_status_ready-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-kube_pod_status_ready')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters kube_pod_status_ready",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "kube_pod_status_ready",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "kube_pod_status_ready",
"operator": "LessThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-kube_pod_status_ready-threshold-Override_'), field('tags._amba-kube_pod_status_ready-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
node_cpu_usage_percentage - Metric Alert
Aggregated average CPU utilization measured in percentage across the cluster
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT5M |
metricName | node_cpu_usage_percentage |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 95 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Aggregated average CPU utilization measured in percentage across the cluster",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "95",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "node_cpu_usage_percentage",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Aggregated average CPU utilization measured in percentage across the cluster'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 95
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT5M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'node_cpu_usage_percentage'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "1303e91d-bc80-4ec2-937e-1d179fc32b43",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters node_cpu_usage_percentage Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters node_cpu_usage_percentage Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT5M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "95"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "node_cpu_usage_percentage"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-node_cpu_usage_percentage-threshold-Override_'), field('tags._amba-node_cpu_usage_percentage-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-node_cpu_usage_percentage')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters node_cpu_usage_percentage",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "node_cpu_usage_percentage",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "node_cpu_usage_percentage",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-node_cpu_usage_percentage-threshold-Override_'), field('tags._amba-node_cpu_usage_percentage-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
node_disk_usage_percentage - Metric Alert
Disk space used in percent by device
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | node_disk_usage_percentage |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 2 |
threshold | 80 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Disk space used in percent by device",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 2,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "80",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "node_disk_usage_percentage",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Disk space used in percent by device'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 2
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 80
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'node_disk_usage_percentage'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "aa3c5697-5cca-4c16-a37e-94f1d580701e",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters node_disk_usage_percentage Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters node_disk_usage_percentage Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "2"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "80"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "node_disk_usage_percentage"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-node_disk_usage_percentage-threshold-Override_'), field('tags._amba-node_disk_usage_percentage-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-node_disk_usage_percentage')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters node_disk_usage_percentage",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "node_disk_usage_percentage",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "node_disk_usage_percentage",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-node_disk_usage_percentage-threshold-Override_'), field('tags._amba-node_disk_usage_percentage-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
node_memory_rss_percentage - Metric Alert
Container RSS memory used in percent
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | node_memory_rss_percentage |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 90 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Container RSS memory used in percent",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "90",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "node_memory_rss_percentage",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Container RSS memory used in percent'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 90
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'node_memory_rss_percentage'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "e89e023e-117b-4db4-bfb2-853849e273f5",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters node_memory_rss_percentage Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters node_memory_rss_percentage Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "90"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "node_memory_rss_percentage"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-node_memory_rss_percentage-threshold-Override_'), field('tags._amba-node_memory_rss_percentage-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-node_memory_rss_percentage')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters node_memory_rss_percentage",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "node_memory_rss_percentage",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "node_memory_rss_percentage",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-node_memory_rss_percentage-threshold-Override_'), field('tags._amba-node_memory_rss_percentage-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}
node_memory_working_set_percentage - Metric Alert
Container working set memory used in percent
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT5M |
metricName | node_memory_working_set_percentage |
metricNamespace | Microsoft.ContainerService/managedClusters |
operator | GreaterThan |
severity | 3 |
threshold | 100 |
timeAggregation | Average |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Container working set memory used in percent",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "100",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Average",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "node_memory_working_set_percentage",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2023-07-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Container working set memory used in percent'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 100
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Average'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT5M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'node_memory_working_set_percentage'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2023-07-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "9ae2dfbf-d69b-4802-8497-8b7836bef5e9",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy ContainerService managedClusters node_memory_working_set_percentage Alert",
"description": "Policy to Audit/Deploy ContainerService managedClusters node_memory_working_set_percentage Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "ContainerService",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT5M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "100"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.ContainerService/managedClusters"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "node_memory_working_set_percentage"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.ContainerService/managedClusters/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Average"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-node_memory_working_set_percentage-threshold-Override_'), field('tags._amba-node_memory_working_set_percentage-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-node_memory_working_set_percentage')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for ContainerService managedClusters node_memory_working_set_percentage",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "node_memory_working_set_percentage",
"metricNamespace": "Microsoft.ContainerService/managedClusters",
"metricName": "node_memory_working_set_percentage",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Average",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-node_memory_working_set_percentage-threshold-Override_'), field('tags._amba-node_memory_working_set_percentage-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}