workspaces
Name | Type | Description |
---|---|---|
Failed Runs | Metric | Number of runs failed for this workspace. Count is updated when a run fails. |
Dashboards:
Click a tab to view the dashboard template
{
"__inputs": [],
"__elements": {},
"__requires": [
{
"type": "panel",
"id": "bargauge",
"name": "Bar gauge",
"version": ""
},
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.5.12"
},
{
"type": "datasource",
"id": "grafana-azure-monitor-datasource",
"name": "Azure Monitor",
"version": "1.0.0"
}
],
"title": "Workspaces",
"editable": true,
"links": [],
"liveNow": false,
"panels": [
{
"title": "Failed Runs",
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#808080",
"value": null
},
{
"color": "dark-green",
"value": 0
},
{
"color": "dark-red",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": true
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "9.5.12",
"targets": [
{
"azureLogAnalytics": {
"query": "AzureMetrics\r\n| where _ResourceId has 'Microsoft.MachineLearningServices/workspaces'\r\n| where MetricName has 'Failed Runs'\r\n| summarize metric = avg(Total) by _ResourceId, Resource",
"resources": [
"/subscriptions/$sub"
]
},
"azureMonitor": {
"allowedTimeGrainsMs": [],
"timeGrain": "auto"
},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"queryType": "Azure Log Analytics",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"_ResourceId": true
},
"indexByName": {},
"renameByName": {}
}
}
],
"type": "bargauge"
}
],
"refresh": "",
"schemaVersion": 38,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "ds",
"options": [],
"query": "grafana-azure-monitor-datasource",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {},
"datasource": {
"type": "grafana-azure-monitor-datasource",
"uid": "${ds}"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Subscription",
"multi": false,
"name": "sub",
"options": [],
"query": {
"azureLogAnalytics": {
"query": "",
"resources": []
},
"queryType": "Azure Subscriptions",
"refId": "A"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"version": null
}
Failed Runs - Metric Alert
Number of runs failed for this workspace. Count is updated when a run fails.
Properties:
criterionType | StaticThresholdCriterion |
evaluationFrequency | PT1M |
metricName | Failed Runs |
metricNamespace | Microsoft.MachineLearningServices/workspaces |
operator | GreaterThan |
severity | 3 |
threshold | 0 |
timeAggregation | Total |
windowSize | PT5M |
References:
Templates:
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"alertName": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Name of the alert"
}
},
"alertDescription": {
"type": "string",
"defaultValue": "Number of runs failed for this workspace. Count is updated when a run fails.",
"metadata": {
"description": "Description of alert"
}
},
"targetResourceId": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "List of Azure resource Ids seperated by a comma. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name"
}
},
"targetResourceRegion": {
"type": "string",
"metadata": {
"description": "Azure region in which target resources to be monitored are in (without spaces). For example: EastUS"
}
},
"targetResourceType": {
"type": "string",
"minLength": 1,
"metadata": {
"description": "Resource type of target resources to be monitored."
}
},
"isEnabled": {
"type": "bool",
"defaultValue": true,
"metadata": {
"description": "Specifies whether the alert is enabled"
}
},
"alertSeverity": {
"type": "int",
"defaultValue": 3,
"allowedValues": [
0,
1,
2,
3,
4
],
"metadata": {
"description": "Severity of alert {0,1,2,3,4}"
}
},
"operator": {
"type": "string",
"defaultValue": "GreaterThan",
"allowedValues": [
"Equals",
"GreaterThan",
"GreaterThanOrEqual",
"LessThan",
"LessThanOrEqual"
],
"metadata": {
"description": "Operator comparing the current value with the threshold value."
}
},
"threshold": {
"type": "string",
"defaultValue": "0",
"metadata": {
"description": "The threshold value at which the alert is activated."
}
},
"timeAggregation": {
"type": "string",
"defaultValue": "Total",
"allowedValues": [
"Average",
"Minimum",
"Maximum",
"Total",
"Count"
],
"metadata": {
"description": "How the data that is collected should be combined over time."
}
},
"windowSize": {
"type": "string",
"defaultValue": "PT5M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"PT24H",
"PT1D"
],
"metadata": {
"description": "Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format."
}
},
"evaluationFrequency": {
"type": "string",
"defaultValue": "PT1M",
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"metadata": {
"description": "how often the metric alert is evaluated represented in ISO 8601 duration format"
}
},
"currentDateTimeUtcNow": {
"type": "string",
"defaultValue": "[utcNow()]",
"metadata": {
"description": "The current date and time using the utcNow function. Used for deployment name uniqueness"
}
},
"telemetryOptOut": {
"type": "string",
"defaultValue": "No",
"allowedValues": [
"Yes",
"No"
],
"metadata": {
"description": "The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry."
}
}
},
"variables": {
"pidDeploymentName": "[take(concat('pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-', uniqueString(resourceGroup().id, parameters('alertName'), parameters('currentDateTimeUtcNow'))), 64)]",
"varTargetResourceId": "[split(parameters('targetResourceId'), ',')]"
},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[parameters('alertName')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "[parameters('alertDescription')]",
"scopes": "[variables('varTargetResourceId')]",
"targetResourceType": "[parameters('targetResourceType')]",
"targetResourceRegion": "[parameters('targetResourceRegion')]",
"severity": "[parameters('alertSeverity')]",
"enabled": "[parameters('isEnabled')]",
"evaluationFrequency": "[parameters('evaluationFrequency')]",
"windowSize": "[parameters('windowSize')]",
"criteria": {
"odata.type": "Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria",
"allOf": [
{
"name": "1st criterion",
"metricName": "Failed Runs",
"dimensions": [],
"operator": "[parameters('operator')]",
"threshold": "[parameters('threshold')]",
"timeAggregation": "[parameters('timeAggregation')]",
"criterionType": "StaticThresholdCriterion"
}
]
}
}
},
{
"condition": "[equals(parameters('telemetryOptOut'), 'No')]",
"apiVersion": "2020-06-01",
"name": "[variables('pidDeploymentName')]",
"type": "Microsoft.Resources/deployments",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": []
}
}
}
]
}
@description('Name of the alert')
@minLength(1)
param alertName string
@description('Description of alert')
param alertDescription string = 'Number of runs failed for this workspace. Count is updated when a run fails.'
@description('Array of Azure resource Ids. For example - /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroup/resource-group-name/Microsoft.compute/virtualMachines/vm-name')
@minLength(1)
param targetResourceId array
@description('Azure region in which target resources to be monitored are in (without spaces). For example: EastUS')
param targetResourceRegion string
@description('Resource type of target resources to be monitored.')
@minLength(1)
param targetResourceType string
@description('Specifies whether the alert is enabled')
param isEnabled bool = true
@description('Severity of alert {0,1,2,3,4}')
@allowed([
0
1
2
3
4
])
param alertSeverity int = 3
@description('Operator comparing the current value with the threshold value.')
@allowed([
'Equals'
'GreaterThan'
'GreaterThanOrEqual'
'LessThan'
'LessThanOrEqual'
])
param operator string = 'GreaterThan'
@description('The threshold value at which the alert is activated.')
param threshold int = 0
@description('How the data that is collected should be combined over time.')
@allowed([
'Average'
'Minimum'
'Maximum'
'Total'
'Count'
])
param timeAggregation string = 'Total'
@description('Period of time used to monitor alert activity based on the threshold. Must be between one minute and one day. ISO 8601 duration format.')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
'PT6H'
'PT12H'
'PT24H'
'P1D'
])
param windowSize string = 'PT5M'
@description('how often the metric alert is evaluated represented in ISO 8601 duration format')
@allowed([
'PT1M'
'PT5M'
'PT15M'
'PT30M'
'PT1H'
])
param evaluationFrequency string = 'PT1M'
@description('"The current date and time using the utcNow function. Used for deployment name uniqueness')
param currentDateTimeUtcNow string = utcNow()
@description('The customer usage identifier used for telemetry purposes. The default value of False enables telemetry. The value of True disables telemetry.')
@allowed([
'Yes'
'No'
])
param telemetryOptOut string = 'No'
resource metricAlert 'Microsoft.Insights/metricAlerts@2018-03-01' = {
name: alertName
location: 'global'
tags: {
_deployed_by_amba: 'true'
}
properties: {
description: alertDescription
scopes: targetResourceId
targetResourceType: targetResourceType
targetResourceRegion: targetResourceRegion
severity: alertSeverity
enabled: isEnabled
evaluationFrequency: evaluationFrequency
windowSize: windowSize
criteria: {
'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria'
allOf: [
{
name: '1st criterion'
metricName: 'Failed Runs'
dimensions: [[]]
operator: operator
threshold: threshold
timeAggregation: timeAggregation
criterionType: 'StaticThresholdCriterion'
}
]
}
}
}
var ambaTelemetryPidName = 'pid-8bb7cf8a-bcf7-4264-abcb-703ace2fc84d-${uniqueString(resourceGroup().id, alertName, currentDateTimeUtcNow)}'
resource ambaTelemetryPid 'Microsoft.Resources/deployments@2020-06-01' = if (telemetryOptOut == 'No') {
name: ambaTelemetryPidName
tags: {
_deployed_by_amba: 'true'
}
properties: {
mode: 'Incremental'
template: {
'$schema': 'https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#'
contentVersion: '1.0.0.0'
resources: []
}
}
}
{
"type": "Microsoft.Authorization/policyDefinitions",
"apiVersion": "2021-06-01",
"name": "c897902c-40a5-497b-a0ce-86c3eda7c61d",
"properties": {
"policyType": "Custom",
"mode": "All",
"displayName": "Deploy MachineLearningServices workspaces Failed Runs Alert",
"description": "Policy to Audit/Deploy MachineLearningServices workspaces Failed Runs Alert",
"metadata": {
"version": "1.0.0-preview",
"category": "MachineLearningServices",
"preview": true,
"source": "https://github.com/Azure/azure-monitor-baseline-alerts/",
"alzCloudEnvironments": [
"AzureCloud"
],
"_deployed_by_amba": "True"
},
"parameters": {
"severity": {
"type": "String",
"metadata": {
"displayName": "Severity",
"description": "Severity of the Alert"
},
"allowedValues": [
"0",
"1",
"2",
"3",
"4"
],
"defaultValue": "3"
},
"windowSize": {
"type": "String",
"metadata": {
"displayName": "Window Size",
"description": "Window size for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H",
"PT6H",
"PT12H",
"P1D"
],
"defaultValue": "PT5M"
},
"evaluationFrequency": {
"type": "String",
"metadata": {
"displayName": "Evaluation Frequency",
"description": "Evaluation frequency for the alert"
},
"allowedValues": [
"PT1M",
"PT5M",
"PT15M",
"PT30M",
"PT1H"
],
"defaultValue": "PT1M"
},
"autoMitigate": {
"type": "String",
"metadata": {
"displayName": "Auto Mitigate",
"description": "Auto Mitigate for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"enabled": {
"type": "String",
"metadata": {
"displayName": "Alert State",
"description": "Alert state for the alert"
},
"allowedValues": [
"true",
"false"
],
"defaultValue": "true"
},
"threshold": {
"type": "String",
"metadata": {
"displayName": "Threshold",
"description": "Threshold for the alert"
},
"defaultValue": "0"
},
"effect": {
"type": "String",
"metadata": {
"displayName": "Effect",
"description": "Effect of the policy"
},
"allowedValues": [
"deployIfNotExists",
"disabled"
],
"defaultValue": "deployIfNotExists"
},
"MonitorDisableTagName": {
"type": "String",
"metadata": {
"displayName": "Monitoring disabled tag name",
"description": "Tag name used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": "MonitorDisable"
},
"MonitorDisableTagValues": {
"type": "Array",
"metadata": {
"displayName": "Monitoring disabled tag values(s)",
"description": "Tag value(s) used to disable monitoring at the resource level. Set to true if monitoring should be disabled."
},
"defaultValue": [
"true",
"Test",
"Dev",
"Sandbox"
]
}
},
"policyRule": {
"if": {
"allOf": [
{
"field": "type",
"equals": "Microsoft.MachineLearningServices/workspaces"
},
{
"field": "[[concat('tags[', parameters('MonitorDisableTagName'), ']')]",
"notIn": "[[parameters('MonitorDisableTagValues')]"
}
]
},
"then": {
"effect": "[[parameters('effect')]",
"details": {
"roleDefinitionIds": [
"/providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
],
"type": "Microsoft.Insights/metricAlerts",
"existenceCondition": {
"allOf": [
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricNamespace",
"equals": "Microsoft.MachineLearningServices/workspaces"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].metricName",
"equals": "Failed Runs"
},
{
"field": "Microsoft.Insights/metricalerts/scopes[*]",
"equals": "[[concat(subscription().id, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.MachineLearningServices/workspaces/', field('fullName'))]"
},
{
"field": "Microsoft.Insights/metricAlerts/enabled",
"equals": "[[parameters('enabled')]"
},
{
"field": "Microsoft.Insights/metricAlerts/evaluationFrequency",
"equals": "[[parameters('evaluationFrequency')]"
},
{
"field": "Microsoft.Insights/metricAlerts/windowSize",
"equals": "[[parameters('windowSize')]"
},
{
"field": "Microsoft.Insights/metricalerts/severity",
"equals": "[[parameters('severity')]"
},
{
"field": "Microsoft.Insights/metricAlerts/autoMitigate",
"equals": "[[parameters('autoMitigate')]"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft-Azure-Monitor-SingleResourceMultipleMetricCriteria.allOf[*].timeAggregation",
"equals": "Total"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.operator",
"equals": "GreaterThan"
},
{
"field": "Microsoft.Insights/metricAlerts/criteria.Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria.allOf[*].StaticThresholdCriterion.threshold",
"equals": "[[if(contains(field('tags'), '_amba-Failed Runs-threshold-Override_'), field('tags._amba-Failed Runs-threshold-Override_'), parameters('threshold'))]"
}
]
},
"deployment": {
"properties": {
"mode": "incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"resourceName": {
"type": "String",
"metadata": {
"displayName": "resourceName",
"description": "Name of the resource"
}
},
"resourceId": {
"type": "String",
"metadata": {
"displayName": "resourceId",
"description": "Resource ID of the resource emitting the metric that will be used for the comparison"
}
},
"severity": {
"type": "String"
},
"windowSize": {
"type": "String"
},
"evaluationFrequency": {
"type": "String"
},
"autoMitigate": {
"type": "String"
},
"enabled": {
"type": "String"
},
"threshold": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.Insights/metricAlerts",
"apiVersion": "2018-03-01",
"name": "[[concat(parameters('resourceName'), '-Failed Runs')]",
"location": "global",
"tags": {
"_deployed_by_amba": true
},
"properties": {
"description": "Metric Alert for MachineLearningServices workspaces Failed Runs",
"severity": "[[parameters('severity')]",
"enabled": "[[parameters('enabled')]",
"scopes": [
"[[parameters('resourceId')]"
],
"evaluationFrequency": "[[parameters('evaluationFrequency')]",
"windowSize": "[[parameters('windowSize')]",
"criteria": {
"allOf": [
{
"name": "Failed Runs",
"metricNamespace": "Microsoft.MachineLearningServices/workspaces",
"metricName": "Failed Runs",
"operator": "GreaterThan",
"threshold": "[[parameters('threshold')]",
"timeAggregation": "Total",
"criterionType": "StaticThresholdCriterion"
}
],
"odata.type": "Microsoft.Azure.Monitor.SingleResourceMultipleMetricCriteria"
},
"autoMitigate": "[[parameters('autoMitigate')]",
"parameters": {
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[parameters('threshold')]"
}
}
}
}
]
},
"parameters": {
"resourceName": {
"value": "[[field('name')]"
},
"resourceId": {
"value": "[[field('id')]"
},
"severity": {
"value": "[[parameters('severity')]"
},
"windowSize": {
"value": "[[parameters('windowSize')]"
},
"evaluationFrequency": {
"value": "[[parameters('evaluationFrequency')]"
},
"autoMitigate": {
"value": "[[parameters('autoMitigate')]"
},
"enabled": {
"value": "[[parameters('enabled')]"
},
"threshold": {
"value": "[[if(contains(field('tags'), '_amba-Failed Runs-threshold-Override_'), field('tags._amba-Failed Runs-threshold-Override_'), parameters('threshold'))]"
}
}
}
}
}
}
}
}
}