n8n Monitoring and Observability: Building Self-Healing Workflows
Production n8n workflows need robust monitoring and observability. This guide covers building health checks, metrics collection, alerting, and self-healing patterns directly into your workflows.
Workflow Health Monitoring
Execution Health Tracker
// Function node for workflow health tracking
const workflowId = $workflow.id;
const executionId = $executionId;
const startTime = $json.startTime || Date.now();
// Store execution metrics
const execution = {
workflowId,
executionId,
startTime,
status: 'running',
metadata: {
triggerType: $json.triggerType || 'unknown',
inputSize: JSON.stringify($json).length,
retryCount: $json.retryCount || 0
}
};
// Use workflow static data for tracking
const staticData = $getWorkflowStaticData('global');
if (!staticData.executions) {
staticData.executions = {};
}
staticData.executions[executionId] = execution;
// Clean old executions (keep last 100)
const executionIds = Object.keys(staticData.executions);
if (executionIds.length > 100) {
const toDelete = executionIds.slice(0, executionIds.length - 100);
toDelete.forEach(id => delete staticData.executions[id]);
}
return [{
json: {
...$json,
_monitoring: {
executionId,
startTime,
trackingEnabled: true
}
}
}];
Health Check Endpoint Workflow
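The health check below reads counters such as totalExecutions, successfulExecutions, failedExecutions, and executionDurations, which the tracker above does not populate on its own. One way to fill them is a completion-recording Function node at the end of the monitored workflow; a minimal sketch follows, assuming the _monitoring metadata added by the tracker is still on the item and that failed items carry an error field.
// Function node to record execution completion (sketch; assumes the tracker ran earlier)
const staticData = $getWorkflowStaticData('global');
const monitoring = $json._monitoring || {};
const startTime = monitoring.startTime || Date.now();
const success = $json.error === undefined; // assumption: upstream failures set an `error` field
const duration = Date.now() - startTime;
// Seed the system start time used for the uptime metric
staticData.startTime = staticData.startTime || Date.now();
// Update the per-execution record created by the tracker
const record = staticData.executions?.[monitoring.executionId];
if (record) {
  record.status = success ? 'success' : 'failed';
  record.duration = duration;
}
// Update the aggregate counters read by the health check and metrics export
staticData.totalExecutions = (staticData.totalExecutions || 0) + 1;
if (success) {
  staticData.successfulExecutions = (staticData.successfulExecutions || 0) + 1;
} else {
  staticData.failedExecutions = (staticData.failedExecutions || 0) + 1;
}
staticData.lastExecutionTime = Date.now();
staticData.executionDurations = (staticData.executionDurations || []).concat(duration).slice(-1000);
staticData.averageExecutionTime =
  staticData.executionDurations.reduce((a, b) => a + b, 0) / staticData.executionDurations.length;
return [{ json: { ...$json, _monitoring: { ...monitoring, duration, success } } }];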
// Webhook trigger workflow for health checks
// Returns workflow system health status
// Function node to check system health
const staticData = $getWorkflowStaticData('global');
const now = Date.now();
// Calculate metrics
const metrics = {
uptime: now - (staticData.startTime || now),
totalExecutions: staticData.totalExecutions || 0,
successfulExecutions: staticData.successfulExecutions || 0,
failedExecutions: staticData.failedExecutions || 0,
averageExecutionTime: staticData.averageExecutionTime || 0,
lastExecutionTime: staticData.lastExecutionTime || null
};
// Check recent failure rate
const recentExecutions = Object.values(staticData.executions || {})
.filter(e => now - e.startTime < 3600000); // Last hour
const recentFailures = recentExecutions.filter(e => e.status === 'failed').length;
const failureRate = recentExecutions.length > 0
? recentFailures / recentExecutions.length
: 0;
// Determine health status
let status = 'healthy';
let issues = [];
if (failureRate > 0.5) {
status = 'unhealthy';
issues.push(`High failure rate: ${(failureRate * 100).toFixed(1)}%`);
} else if (failureRate > 0.2) {
status = 'degraded';
issues.push(`Elevated failure rate: ${(failureRate * 100).toFixed(1)}%`);
}
// Check for stale executions
const staleExecutions = recentExecutions.filter(
e => e.status === 'running' && now - e.startTime > 300000 // 5 min timeout
);
if (staleExecutions.length > 0) {
status = 'degraded';
issues.push(`${staleExecutions.length} stale executions detected`);
}
return [{
json: {
status,
timestamp: new Date().toISOString(),
metrics,
recentActivity: {
executionsLastHour: recentExecutions.length,
failureRate: (failureRate * 100).toFixed(1) + '%',
staleExecutions: staleExecutions.length
},
issues
}
}];
Metrics Collection
Prometheus Metrics Export
// Function node to export Prometheus metrics
const staticData = $getWorkflowStaticData('global');
const now = Date.now();
// Collect metrics
const metrics = [];
// Workflow execution counter
metrics.push(`# HELP n8n_workflow_executions_total Total workflow executions`);
metrics.push(`# TYPE n8n_workflow_executions_total counter`);
metrics.push(`n8n_workflow_executions_total{status="success"} ${staticData.successfulExecutions || 0}`);
metrics.push(`n8n_workflow_executions_total{status="failed"} ${staticData.failedExecutions || 0}`);
// Execution duration histogram
metrics.push(`# HELP n8n_workflow_execution_duration_seconds Workflow execution duration`);
metrics.push(`# TYPE n8n_workflow_execution_duration_seconds histogram`);
const durationBuckets = [0.1, 0.5, 1, 2.5, 5, 10, 30, 60, 120];
const durations = (staticData.executionDurations || []).slice(-1000);
for (const bucket of durationBuckets) {
const count = durations.filter(d => d <= bucket * 1000).length;
metrics.push(`n8n_workflow_execution_duration_seconds_bucket{le="${bucket}"} ${count}`);
}
metrics.push(`n8n_workflow_execution_duration_seconds_bucket{le="+Inf"} ${durations.length}`);
metrics.push(`n8n_workflow_execution_duration_seconds_sum ${durations.reduce((a, b) => a + b, 0) / 1000}`);
metrics.push(`n8n_workflow_execution_duration_seconds_count ${durations.length}`);
// Active executions gauge
const activeExecutions = Object.values(staticData.executions || {})
.filter(e => e.status === 'running').length;
metrics.push(`# HELP n8n_workflow_active_executions Currently running executions`);
metrics.push(`# TYPE n8n_workflow_active_executions gauge`);
metrics.push(`n8n_workflow_active_executions ${activeExecutions}`);
// Queue size (if applicable)
metrics.push(`# HELP n8n_workflow_queue_size Items waiting in queue`);
metrics.push(`# TYPE n8n_workflow_queue_size gauge`);
metrics.push(`n8n_workflow_queue_size ${staticData.queueSize || 0}`);
return [{
json: {
contentType: 'text/plain',
body: metrics.join('\n')
}
}];
Custom Metrics Collection
// Function node for detailed metrics collection
const metricsCollector = {
// Record execution start
recordStart: (executionId, workflowName) => {
const staticData = $getWorkflowStaticData('global');
if (!staticData.metrics) {
staticData.metrics = {
executions: {},
byWorkflow: {},
byHour: {}
};
}
staticData.metrics.executions[executionId] = {
workflow: workflowName,
startTime: Date.now(),
nodeTimings: {}
};
},
// Record node execution
recordNode: (executionId, nodeName, duration, success) => {
const staticData = $getWorkflowStaticData('global');
const execution = staticData.metrics?.executions?.[executionId];
if (execution) {
execution.nodeTimings[nodeName] = {
duration,
success,
timestamp: Date.now()
};
}
},
// Record execution end
recordEnd: (executionId, success) => {
const staticData = $getWorkflowStaticData('global');
const execution = staticData.metrics?.executions?.[executionId];
if (!execution) return;
const duration = Date.now() - execution.startTime;
// Update workflow metrics
const workflow = execution.workflow;
if (!staticData.metrics.byWorkflow[workflow]) {
staticData.metrics.byWorkflow[workflow] = {
total: 0,
successful: 0,
failed: 0,
totalDuration: 0
};
}
const wfMetrics = staticData.metrics.byWorkflow[workflow];
wfMetrics.total++;
if (success) {
wfMetrics.successful++;
} else {
wfMetrics.failed++;
}
wfMetrics.totalDuration += duration;
wfMetrics.avgDuration = wfMetrics.totalDuration / wfMetrics.total;
// Update hourly metrics
const hour = new Date().toISOString().slice(0, 13);
if (!staticData.metrics.byHour[hour]) {
staticData.metrics.byHour[hour] = { total: 0, successful: 0, failed: 0 };
}
staticData.metrics.byHour[hour].total++;
if (success) {
staticData.metrics.byHour[hour].successful++;
} else {
staticData.metrics.byHour[hour].failed++;
}
// Clean up old hourly data (keep 24 hours)
const hours = Object.keys(staticData.metrics.byHour).sort();
if (hours.length > 24) {
hours.slice(0, hours.length - 24).forEach(h => {
delete staticData.metrics.byHour[h];
});
}
// Clean up execution record
delete staticData.metrics.executions[executionId];
},
// Get metrics summary
getSummary: () => {
const staticData = $getWorkflowStaticData('global');
return {
byWorkflow: staticData.metrics?.byWorkflow || {},
byHour: staticData.metrics?.byHour || {},
activeExecutions: Object.keys(staticData.metrics?.executions || {}).length
};
}
};
// Usage
const executionId = $executionId;
const workflowName = $workflow.name;
metricsCollector.recordStart(executionId, workflowName);
return [{
json: {
...$json,
_metricsContext: {
executionId,
workflowName,
startTime: Date.now()
}
}
}];
Alerting Integration
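The router below expects alert items carrying fields such as severity, title, workflow, executionId, alertId, and error; these names are assumptions used throughout this section rather than a fixed schema. A sketch of producing such an alert from the health check output:
// Function node to build an alert from the health check output (sketch)
const health = $json;
// Only raise an alert when the system is not healthy
if (health.status === 'healthy') {
  return [];
}
return [{
  json: {
    alertId: `health-${$workflow.id}-${Date.now()}`,
    severity: health.status === 'unhealthy' ? 'critical' : 'medium',
    title: `Workflow system ${health.status}`,
    workflow: $workflow.name,
    executionId: $execution.id,
    status: health.status,
    error: (health.issues || []).join('; '),
    metrics: health.recentActivity
  }
}];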
Multi-Channel Alert System
// Function node for alert routing
const alert = $json;
// Alert severity to channel mapping
const routingConfig = {
critical: {
channels: ['pagerduty', 'slack', 'email'],
slackChannel: '#incidents',
escalateAfter: 300 // 5 minutes
},
high: {
channels: ['slack', 'email'],
slackChannel: '#alerts',
escalateAfter: 1800 // 30 minutes
},
medium: {
channels: ['slack'],
slackChannel: '#alerts-low',
escalateAfter: null
},
low: {
channels: ['slack'],
slackChannel: '#alerts-info',
escalateAfter: null
}
};
const severity = alert.severity || 'medium';
const config = routingConfig[severity] || routingConfig.medium;
// Generate alerts for each channel
const alerts = [];
for (const channel of config.channels) {
alerts.push({
channel,
alert: {
...alert,
routedAt: new Date().toISOString(),
config: {
slackChannel: config.slackChannel,
escalateAfter: config.escalateAfter
}
}
});
}
return alerts.map(a => ({ json: a }));
PagerDuty Integration
// Function node for PagerDuty alert
const alert = $json.alert;
const pagerdutyPayload = {
routing_key: $env.PAGERDUTY_ROUTING_KEY,
event_action: alert.resolved ? 'resolve' : 'trigger',
dedup_key: alert.alertId,
payload: {
summary: alert.title,
severity: mapSeverity(alert.severity),
source: alert.source || 'n8n-workflow',
timestamp: new Date().toISOString(),
custom_details: {
workflow: alert.workflow,
error: alert.error,
executionId: alert.executionId,
metrics: alert.metrics
}
},
links: [
{
href: `${$env.N8N_URL}/execution/${alert.executionId}`,
text: 'View Execution'
}
]
};
function mapSeverity(severity) {
const mapping = {
'critical': 'critical',
'high': 'error',
'medium': 'warning',
'low': 'info'
};
return mapping[severity] || 'warning';
}
return [{
json: pagerdutyPayload
}];
Slack Alert Formatting
// Function node for Slack alert formatting
const alert = $json.alert;
const severityConfig = {
critical: { emoji: '🔴', color: '#FF0000' },
high: { emoji: '🟠', color: '#FF8C00' },
medium: { emoji: '🟡', color: '#FFD700' },
low: { emoji: '🟢', color: '#32CD32' }
};
const config = severityConfig[alert.severity] || severityConfig.medium;
const message = {
channel: alert.config?.slackChannel || '#alerts',
attachments: [
{
color: config.color,
blocks: [
{
type: 'header',
text: {
type: 'plain_text',
text: `${config.emoji} ${alert.severity.toUpperCase()}: ${alert.title}`,
emoji: true
}
},
{
type: 'section',
fields: [
{
type: 'mrkdwn',
text: `*Workflow:*\n${alert.workflow}`
},
{
type: 'mrkdwn',
text: `*Time:*\n<!date^${Math.floor(Date.now()/1000)}^{date_short_pretty} {time}|now>`
},
{
type: 'mrkdwn',
text: `*Execution ID:*\n${alert.executionId}`
},
{
type: 'mrkdwn',
text: `*Status:*\n${alert.status}`
}
]
}
]
}
]
};
// Add error details if present
if (alert.error) {
message.attachments[0].blocks.push({
type: 'section',
text: {
type: 'mrkdwn',
text: `*Error:*\n\`\`\`${String(alert.error).substring(0, 500)}\`\`\``
}
});
}
// Add action buttons
message.attachments[0].blocks.push({
type: 'actions',
elements: [
{
type: 'button',
text: { type: 'plain_text', text: '📋 View Execution' },
url: `${$env.N8N_URL}/execution/${alert.executionId}`
},
{
type: 'button',
text: { type: 'plain_text', text: '✅ Acknowledge' },
action_id: 'ack_alert',
value: alert.alertId
},
{
type: 'button',
text: { type: 'plain_text', text: '🔇 Silence 1h' },
action_id: 'silence_alert',
value: JSON.stringify({ alertId: alert.alertId, duration: 3600 })
}
]
});
return [{ json: message }];
Self-Healing Patterns
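The retry logic below reads retryCount, lastError, and originalData from the incoming item. A sketch of shaping a failed item into that form, assuming it runs on a node's error branch or with Continue On Fail enabled, where n8n typically places the error details on the item:
// Function node to shape a failed item for the retry logic (sketch)
const failed = $json;
return [{
  json: {
    retryCount: failed.retryCount || 0,
    lastError: failed.error?.message || (failed.error ? String(failed.error) : 'unknown error'),
    originalData: failed.originalData || failed
  }
}];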
Automatic Retry with Backoff
// Function node for intelligent retry
const maxRetries = 5;
const baseDelay = 1000; // 1 second
const currentRetry = $json.retryCount || 0;
const lastError = $json.lastError;
// Check if we should retry
if (currentRetry >= maxRetries) {
// Max retries exceeded - escalate
return [{
json: {
action: 'escalate',
reason: 'Max retries exceeded',
totalRetries: currentRetry,
lastError,
originalData: $json.originalData
}
}];
}
// Calculate backoff delay with jitter
const exponentialDelay = baseDelay * Math.pow(2, currentRetry);
const jitter = Math.random() * 1000;
const delay = Math.min(exponentialDelay + jitter, 60000); // Max 1 minute
// Determine if error is retryable
const retryableErrors = [
'ECONNRESET',
'ETIMEDOUT',
'ECONNREFUSED',
'rate_limit',
'429',
'503',
'504'
];
const errorText = typeof lastError === 'string'
? lastError
: `${lastError?.code || ''} ${lastError?.message || ''}`;
const isRetryable = retryableErrors.some(e => errorText.includes(e));
if (!isRetryable) {
return [{
json: {
action: 'fail',
reason: 'Non-retryable error',
error: lastError,
originalData: $json.originalData
}
}];
}
// Schedule retry
return [{
json: {
action: 'retry',
retryCount: currentRetry + 1,
delay,
scheduledFor: new Date(Date.now() + delay).toISOString(),
originalData: $json.originalData || $json
}
}];
Circuit Breaker Implementation
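A circuit breaker needs two parts: a gate that decides whether a call may proceed, and a step that writes the call's outcome back into shared state. The gate is shown below; because functions attached to items do not survive n8n's item serialization, the outcome is best recorded by a separate Function node placed after the guarded call. A sketch of that recording step, matching the gate's thresholds and assuming the guarded call passes the service name through and sets an error field when it fails:
// Function node run after the guarded call to update circuit breaker state (sketch)
const serviceName = $json.service || 'default';
const staticData = $getWorkflowStaticData('global');
const breaker = staticData.circuitBreakers?.[serviceName];
if (breaker) {
  const success = $json.error === undefined; // assumption: failures set an `error` field
  if (success) {
    breaker.failures = 0;
    if (breaker.state === 'half-open') {
      breaker.successes++;
      if (breaker.successes >= 3) breaker.state = 'closed'; // successThreshold
    }
  } else {
    breaker.failures++;
    breaker.lastFailure = Date.now();
    if (breaker.failures >= 5) { // failureThreshold
      breaker.state = 'open';
      breaker.openedAt = Date.now();
    }
  }
}
return [{ json: $json }];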
// Function node for circuit breaker
const serviceName = $json.serviceName || 'default';
const staticData = $getWorkflowStaticData('global');
// Initialize circuit breaker state
if (!staticData.circuitBreakers) {
staticData.circuitBreakers = {};
}
if (!staticData.circuitBreakers[serviceName]) {
staticData.circuitBreakers[serviceName] = {
state: 'closed', // closed, open, half-open
failures: 0,
successes: 0,
lastFailure: null,
openedAt: null
};
}
const breaker = staticData.circuitBreakers[serviceName];
const config = {
failureThreshold: 5,
successThreshold: 3,
timeout: 60000 // 1 minute
};
// Check circuit state
const now = Date.now();
if (breaker.state === 'open') {
// Check if timeout has passed
if (now - breaker.openedAt > config.timeout) {
breaker.state = 'half-open';
breaker.successes = 0;
console.log(`Circuit breaker for ${serviceName} entering half-open state`);
} else {
// Circuit is open - fail fast
return [{
json: {
proceed: false,
reason: 'Circuit breaker open',
service: serviceName,
retryAfter: new Date(breaker.openedAt + config.timeout).toISOString()
}
}];
}
}
// Record result helper
const recordResult = (success) => {
if (success) {
breaker.failures = 0;
if (breaker.state === 'half-open') {
breaker.successes++;
if (breaker.successes >= config.successThreshold) {
breaker.state = 'closed';
console.log(`Circuit breaker for ${serviceName} closed`);
}
}
} else {
breaker.failures++;
breaker.lastFailure = now;
if (breaker.failures >= config.failureThreshold) {
breaker.state = 'open';
breaker.openedAt = now;
console.log(`Circuit breaker for ${serviceName} opened`);
}
}
};
return [{
json: {
proceed: true,
service: serviceName,
circuitState: breaker.state,
_recordResult: recordResult.toString() // informational only: a serialized function cannot be called later, so record the outcome in a follow-up node (see the sketch above)
}
}];
Automatic Failover
// Function node for service failover
const primaryEndpoint = $env.PRIMARY_API_URL;
const failoverEndpoint = $env.FAILOVER_API_URL;
const staticData = $getWorkflowStaticData('global');
// Track endpoint health
if (!staticData.endpointHealth) {
staticData.endpointHealth = {
primary: { healthy: true, lastCheck: Date.now(), failures: 0 },
failover: { healthy: true, lastCheck: Date.now(), failures: 0 }
};
}
const health = staticData.endpointHealth;
// Determine which endpoint to use
let selectedEndpoint;
let endpointType;
if (health.primary.healthy) {
selectedEndpoint = primaryEndpoint;
endpointType = 'primary';
} else if (health.failover.healthy) {
selectedEndpoint = failoverEndpoint;
endpointType = 'failover';
console.log('Using failover endpoint');
} else {
// Both unhealthy - try primary anyway with fresh start
health.primary.failures = 0;
selectedEndpoint = primaryEndpoint;
endpointType = 'primary';
console.log('Both endpoints unhealthy, resetting to primary');
}
// Health check function
const recordEndpointResult = (type, success) => {
const endpoint = staticData.endpointHealth[type];
if (success) {
endpoint.failures = 0;
endpoint.healthy = true;
} else {
endpoint.failures++;
if (endpoint.failures >= 3) {
endpoint.healthy = false;
console.log(`Endpoint ${type} marked unhealthy after ${endpoint.failures} failures`);
}
}
endpoint.lastCheck = Date.now();
};
return [{
json: {
endpoint: selectedEndpoint,
endpointType,
originalPayload: $json,
// Note: functions do not survive item serialization; a follow-up node should update staticData.endpointHealth directly to record the call result
_recordResult: (success) => recordEndpointResult(endpointType, success)
}
}];
Dashboard and Reporting
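The hourly counters kept by the custom metrics collector also lend themselves to simple time-series charts. A sketch of a Function node that flattens metrics.byHour into a chart-friendly series:
// Function node to build an hourly time series from collected metrics (sketch)
const staticData = $getWorkflowStaticData('global');
const byHour = staticData.metrics?.byHour || {};
const series = Object.keys(byHour).sort().map(hour => ({
  hour, // "YYYY-MM-DDTHH" buckets as written by the collector
  total: byHour[hour].total,
  failed: byHour[hour].failed,
  successRate: byHour[hour].total > 0
    ? +(byHour[hour].successful / byHour[hour].total * 100).toFixed(1)
    : 100
}));
return [{ json: { series } }];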
Workflow Status Dashboard Data
// Function node to generate dashboard data
const staticData = $getWorkflowStaticData('global');
// Get all workflows status
const workflows = [
'data-sync',
'order-processing',
'notification-sender',
'report-generator'
];
const dashboardData = {
timestamp: new Date().toISOString(),
systemHealth: 'healthy',
workflows: [],
summary: {
totalExecutions24h: 0,
successRate24h: 0,
avgExecutionTime: 0,
activeAlerts: 0
}
};
const now = Date.now();
const twentyFourHoursAgo = now - 86400000;
for (const workflow of workflows) {
const metrics = staticData.metrics?.byWorkflow?.[workflow] || {
total: 0,
successful: 0,
failed: 0,
avgDuration: 0
};
const successRate = metrics.total > 0
? (metrics.successful / metrics.total * 100).toFixed(1)
: 100;
let health = 'healthy';
if (successRate < 95) health = 'degraded';
if (successRate < 80) health = 'unhealthy';
dashboardData.workflows.push({
name: workflow,
health,
metrics: {
total: metrics.total,
successful: metrics.successful,
failed: metrics.failed,
successRate: parseFloat(successRate),
avgDuration: Math.round(metrics.avgDuration)
},
lastExecution: metrics.lastExecution || null
});
dashboardData.summary.totalExecutions24h += metrics.total;
}
// Calculate overall success rate
const totalSuccessful = dashboardData.workflows
.reduce((sum, w) => sum + w.metrics.successful, 0);
dashboardData.summary.successRate24h = dashboardData.summary.totalExecutions24h > 0
? (totalSuccessful / dashboardData.summary.totalExecutions24h * 100).toFixed(1)
: 100;
// Determine system health
const unhealthyCount = dashboardData.workflows
.filter(w => w.health === 'unhealthy').length;
const degradedCount = dashboardData.workflows
.filter(w => w.health === 'degraded').length;
if (unhealthyCount > 0) {
dashboardData.systemHealth = 'unhealthy';
} else if (degradedCount > 0) {
dashboardData.systemHealth = 'degraded';
}
return [{ json: dashboardData }];
Best Practices Summary
- Comprehensive Health Checks - Monitor workflow execution, duration, and error rates (see the polling sketch after this list)
- Metrics Collection - Export metrics to Prometheus or similar for visualization
- Multi-Channel Alerting - Route alerts based on severity to appropriate channels
- Self-Healing Patterns - Implement retries, circuit breakers, and failover
- Dashboard Visibility - Provide real-time visibility into workflow health
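As noted in the first point, the health check endpoint pays off when something outside n8n polls it regularly. A minimal external poller sketch (the webhook URL and the notification hook are assumptions; requires Node 18+ for the built-in fetch):
// poll-health.js: external health poller (sketch; run it on a cron schedule)
const HEALTH_URL = process.env.N8N_HEALTH_URL || 'https://n8n.example.com/webhook/health';
async function checkHealth() {
  try {
    const res = await fetch(HEALTH_URL);
    const health = await res.json();
    if (health.status !== 'healthy') {
      // Replace with your own notification mechanism (PagerDuty, Slack, ...)
      console.error(`n8n health ${health.status}:`, health.issues);
      process.exitCode = 1;
    } else {
      console.log('n8n healthy', health.recentActivity);
    }
  } catch (err) {
    console.error('Health endpoint unreachable:', err.message);
    process.exitCode = 1;
  }
}
checkHealth();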
By implementing these monitoring patterns, you can build reliable, observable n8n workflows that automatically recover from failures.