Monday, June 14, 2021

Datadog event - function to capture notebook exception

This function can be called inside any `except Exception` block, for example:

try:
## any code logic

except Exception as e:
context_str = dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()
DatabricksNBRuntimeError(context_str)

Functions are:

from datadog import initialize, api

# Datadog credentials live in the "datadog" Databricks secret scope.
api_key = dbutils.secrets.get(scope="datadog", key="datadog_api_key")
app_key = dbutils.secrets.get(scope="datadog", key="datadog_app_key")

# Pass the secrets through as-is: wrapping them in f-strings was redundant,
# and would silently turn a None into the string "None".
options = {
    "api_key": api_key,
    "app_key": app_key,
}

initialize(**options)
import sys
import requests
import traceback
import json

def datadog_event(notebook_path, job_name, user):
    """Send a Datadog event describing the exception currently being handled.

    Must be called from inside an ``except`` block: the error details are
    read from ``sys.exc_info()``.

    Args:
        notebook_path: Path of the Databricks notebook that failed.
        job_name: Job name, or None for interactive (non-job) runs.
        user: User that triggered the run, or None if unknown.
    """
    exc_type, exc_value, exc_tb = sys.exc_info()
    # Join the formatted frames into one plain-text traceback. str() of the
    # list would escape the newlines ("\\n"), so the "\n\tat" delimiters
    # searched for below could never match.
    traceback_error = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
    # Prefer the short message between "Exception"/"Error" and the first
    # JVM-style frame marker "\n\tat"; fall back to the full traceback.
    # NOTE(review): find_between is defined elsewhere in this notebook —
    # assumed to return '' when no match is found.
    error_case_list = [
        find_between(traceback_error, "Exception", "\n\tat"),
        find_between(traceback_error, "Error", "\n\tat"),
        traceback_error,
    ]
    error_message = next(error for error in error_case_list if error != '')

    error_dict = {
        "notebook_metadata": {
            "job_name": job_name,
            "notebook_path": notebook_path,
            "user": user,
        },
        "error": error_message,
    }
    text = json.dumps(error_dict)

    title = datadog_identify_string
    if job_name:
        title = datadog_identify_string + ': job( ' + job_name + ' )'

    print(title, text)
    # The Datadog Events API only accepts lowercase "normal" or "low" for
    # priority; 'Low' is rejected/ignored.
    api.Event.create(title=title, text=text, tags=datadog_tags, priority='low')
def exit_notebook(job_name, user):
    """Exit the notebook with a JSON "FAIL" payload describing the error.

    Must be called from inside an ``except`` block: the exception is taken
    from ``sys.exc_info()``. (The original body referenced an undefined name
    ``e``, which raised NameError instead of reporting the real failure.)

    Args:
        job_name: Job name, or None for interactive runs.
        user: User that triggered the run, or None if unknown.
    """
    _, exc_value, _ = sys.exc_info()
    error_text = str(exc_value)
    # Prefer the short message between "Exception"/"Error" and the first
    # "\n\tat" frame marker; otherwise cap the raw message at 200 chars.
    error_case_list = [
        find_between(error_text, "Exception", "\n\tat"),
        find_between(error_text, "Error", "\n\tat"),
        error_text[:200],
    ]
    error_message = next(error for error in error_case_list if error != '')
    output = {
        "status": "FAIL",
        "notebook_metadata": {"job_name": job_name, "user": user},
        "message": error_message,
    }

    dbutils.notebook.exit(json.dumps(output))
class DatabricksNBRuntimeError(Exception):
    """Report a notebook failure to Datadog, then exit the notebook.

    Instantiate inside an ``except`` block with the notebook context JSON:

        context_str = dbutils.notebook.entry_point.getDbutils() \\
            .notebook().getContext().toJson()
        DatabricksNBRuntimeError(context_str)

    Sends a Datadog event for the active exception and terminates the run
    via ``dbutils.notebook.exit``.
    """

    def __init__(self, context_str, **kwargs):
        # The original assigned the undefined name ``args`` (NameError);
        # initialize the Exception base properly instead, which also sets
        # ``self.args`` to (context_str,).
        super().__init__(context_str)
        self.strerror = context_str

        notebook_metadata = json.loads(context_str)

        # jobName/user are absent for interactive (non-job) runs.
        tags = notebook_metadata['tags']
        notebook_path = notebook_metadata['extraContext']['notebook_path']
        job_name = tags.get('jobName')
        user = tags.get('user')

        datadog_event(notebook_path, job_name, user)
        exit_notebook(job_name, user)

Argo workflow - Prometheus metrics

https://argoproj.github.io/argo-workflows/metrics/#metric-spec


prometheus:
  # Metric name (will be prepended with "argo_workflows_").
  # Metric names can only contain alphanumeric characters, _, and :.
  - name: metric_failed_with_output_result
    # Metric definitions must include a name and a help doc string
    # (a Prometheus requirement). Metrics with the same name must always
    # use the exact same help string; different metrics with the same
    # name but a different help string will cause an error.
    help: "test run"
    # Labels are optional. Avoid cardinality explosion.
    labels:
      - key: workflow_name
        value: "{{ workflow.name }}"
      - key: workflow_namespace
        value: "{{ workflow.namespace }}"
      - key: status
        # different workflows may fail with the same error,
        # so add the unique workflow name as a prefix
        value: "{{ workflow.name }}_{{ outputs.result }}"
    # Emit the metric conditionally. Works the same as a normal "when".
    when: "{{ status }} == Failed"
    # The metric type. Available are "gauge", "histogram", and "counter".
    gauge:
      # Sets the gauge to the workflow duration (this is not a counter
      # increment, despite the gauge type comment in earlier drafts).
      value: "{{duration}}"