n8n Monitoring and Observability: Building Self-Healing Workflows
Production n8n workflows require robust monitoring and observability. This guide covers health checks, metrics collection, alerting, and self-healing patterns, with Function node examples you can adapt to your own workflows.
Workflow Health Monitoring
Execution Health Tracker
// Function node for workflow health tracking
const workflowId = $workflow.id;
const executionId = $executionId;
const startTime = $json.startTime || Date.now();
// Store execution metrics
const execution = {
workflowId,
executionId,
startTime,
status: 'running',
metadata: {
triggerType: $json.triggerType || 'unknown',
inputSize: JSON.stringify($json).length,
retryCount: $json.retryCount || 0
}
};
// Use workflow static data for tracking
const staticData = $getWorkflowStaticData('global');
if (!staticData.executions) {
staticData.executions = {};
}
staticData.executions[executionId] = execution;
// Clean old executions (keep last 100)
const executionIds = Object.keys(staticData.executions);
if (executionIds.length > 100) {
const toDelete = executionIds.slice(0, executionIds.length - 100);
toDelete.forEach(id => delete staticData.executions[id]);
}
return [{
json: {
...$json,
_monitoring: {
executionId,
startTime,
trackingEnabled: true
}
}
}];
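The tracker marks executions as running, but nothing above moves them to a terminal state, and the counters and duration list consumed by the health check and Prometheus export below are never populated. A minimal closing node could look like the following sketch; the _failed flag convention is an assumption, so adapt it to however your error branch tags items:
// Function node at the end of the workflow to close out tracking
const staticData = $getWorkflowStaticData('global');
const executionId = $json._monitoring?.executionId;
const execution = staticData.executions?.[executionId];
// Assumed convention: an upstream error branch sets $json._failed = true
const failed = $json._failed === true;
if (execution) {
  execution.status = failed ? 'failed' : 'completed';
  execution.endTime = Date.now();
  // Maintain the aggregate counters read by the health check endpoint
  staticData.totalExecutions = (staticData.totalExecutions || 0) + 1;
  if (failed) {
    staticData.failedExecutions = (staticData.failedExecutions || 0) + 1;
  } else {
    staticData.successfulExecutions = (staticData.successfulExecutions || 0) + 1;
  }
  staticData.lastExecutionTime = execution.endTime;
  // Maintain the duration list (in ms) consumed by the Prometheus export
  const duration = execution.endTime - execution.startTime;
  staticData.executionDurations = [
    ...(staticData.executionDurations || []),
    duration
  ].slice(-1000);
  staticData.averageExecutionTime =
    staticData.executionDurations.reduce((a, b) => a + b, 0) /
    staticData.executionDurations.length;
}
return [{ json: $json }];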
Health Check Endpoint Workflow
// Webhook trigger workflow for health checks
// Returns workflow system health status
// Function node to check system health
const staticData = $getWorkflowStaticData('global');
const now = Date.now();
// Record system start time on first run so uptime is meaningful
if (!staticData.startTime) {
staticData.startTime = now;
}
// Calculate metrics
const metrics = {
uptime: now - (staticData.startTime || now),
totalExecutions: staticData.totalExecutions || 0,
successfulExecutions: staticData.successfulExecutions || 0,
failedExecutions: staticData.failedExecutions || 0,
averageExecutionTime: staticData.averageExecutionTime || 0,
lastExecutionTime: staticData.lastExecutionTime || null
};
// Check recent failure rate
const recentExecutions = Object.values(staticData.executions || {})
.filter(e => now - e.startTime < 3600000); // Last hour
const recentFailures = recentExecutions.filter(e => e.status === 'failed').length;
const failureRate = recentExecutions.length > 0
? recentFailures / recentExecutions.length
: 0;
// Determine health status
let status = 'healthy';
let issues = [];
if (failureRate > 0.5) {
status = 'unhealthy';
issues.push(`High failure rate: ${(failureRate * 100).toFixed(1)}%`);
} else if (failureRate > 0.2) {
status = 'degraded';
issues.push(`Elevated failure rate: ${(failureRate * 100).toFixed(1)}%`);
}
// Check for stale executions
const staleExecutions = recentExecutions.filter(
e => e.status === 'running' && now - e.startTime > 300000 // 5 min timeout
);
if (staleExecutions.length > 0) {
status = 'degraded';
issues.push(`${staleExecutions.length} stale executions detected`);
}
return [{
json: {
status,
timestamp: new Date().toISOString(),
metrics,
recentActivity: {
executionsLastHour: recentExecutions.length,
failureRate: (failureRate * 100).toFixed(1) + '%',
staleExecutions: staleExecutions.length
},
issues
}
}];
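Pair the Webhook trigger with a Respond to Webhook node so callers receive this JSON. On the consumer side, a small external probe (the URL is a placeholder) can poll the endpoint and raise its own alarm on anything other than healthy:
// Standalone probe run outside n8n (Node 18+ for global fetch)
const HEALTH_URL = 'https://n8n.example.com/webhook/health';
async function probe() {
  try {
    const res = await fetch(HEALTH_URL);
    const body = await res.json();
    if (body.status !== 'healthy') {
      console.error(`Health check ${body.status}:`, body.issues);
      process.exitCode = 1;
    } else {
      console.log('Health check OK', body.recentActivity);
    }
  } catch (err) {
    console.error('Health endpoint unreachable:', err.message);
    process.exitCode = 2;
  }
}
probe();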
Metrics Collection
Prometheus Metrics Export
// Function node to export Prometheus metrics
const staticData = $getWorkflowStaticData('global');
const now = Date.now();
// Collect metrics
const metrics = [];
// Workflow execution counter
metrics.push(`# HELP n8n_workflow_executions_total Total workflow executions`);
metrics.push(`# TYPE n8n_workflow_executions_total counter`);
metrics.push(`n8n_workflow_executions_total{status="success"} ${staticData.successfulExecutions || 0}`);
metrics.push(`n8n_workflow_executions_total{status="failed"} ${staticData.failedExecutions || 0}`);
// Execution duration histogram
metrics.push(`# HELP n8n_workflow_execution_duration_seconds Workflow execution duration`);
metrics.push(`# TYPE n8n_workflow_execution_duration_seconds histogram`);
const durationBuckets = [0.1, 0.5, 1, 2.5, 5, 10, 30, 60, 120];
const durations = (staticData.executionDurations || []).slice(-1000);
for (const bucket of durationBuckets) {
const count = durations.filter(d => d <= bucket * 1000).length;
metrics.push(`n8n_workflow_execution_duration_seconds_bucket{le="${bucket}"} ${count}`);
}
metrics.push(`n8n_workflow_execution_duration_seconds_bucket{le="+Inf"} ${durations.length}`);
metrics.push(`n8n_workflow_execution_duration_seconds_sum ${durations.reduce((a, b) => a + b, 0) / 1000}`);
metrics.push(`n8n_workflow_execution_duration_seconds_count ${durations.length}`);
// Active executions gauge
const activeExecutions = Object.values(staticData.executions || {})
.filter(e => e.status === 'running').length;
metrics.push(`# HELP n8n_workflow_active_executions Currently running executions`);
metrics.push(`# TYPE n8n_workflow_active_executions gauge`);
metrics.push(`n8n_workflow_active_executions ${activeExecutions}`);
// Queue size (if applicable)
metrics.push(`# HELP n8n_workflow_queue_size Items waiting in queue`);
metrics.push(`# TYPE n8n_workflow_queue_size gauge`);
metrics.push(`n8n_workflow_queue_size ${staticData.queueSize || 0}`);
return [{
json: {
// Configure the Respond to Webhook node to return this body as text
contentType: 'text/plain',
body: metrics.join('\n')
}
}];
Custom Metrics Collection
// Function node for detailed metrics collection
const metricsCollector = {
// Record execution start
recordStart: (executionId, workflowName) => {
const staticData = $getWorkflowStaticData('global');
if (!staticData.metrics) {
staticData.metrics = {
executions: {},
byWorkflow: {},
byHour: {}
};
}
staticData.metrics.executions[executionId] = {
workflow: workflowName,
startTime: Date.now(),
nodeTimings: {}
};
},
// Record node execution
recordNode: (executionId, nodeName, duration, success) => {
const staticData = $getWorkflowStaticData('global');
const execution = staticData.metrics?.executions?.[executionId];
if (execution) {
execution.nodeTimings[nodeName] = {
duration,
success,
timestamp: Date.now()
};
}
},
// Record execution end
recordEnd: (executionId, success) => {
const staticData = $getWorkflowStaticData('global');
const execution = staticData.metrics?.executions?.[executionId];
if (!execution) return;
const duration = Date.now() - execution.startTime;
// Update workflow metrics
const workflow = execution.workflow;
if (!staticData.metrics.byWorkflow[workflow]) {
staticData.metrics.byWorkflow[workflow] = {
total: 0,
successful: 0,
failed: 0,
totalDuration: 0
};
}
const wfMetrics = staticData.metrics.byWorkflow[workflow];
wfMetrics.total++;
if (success) {
wfMetrics.successful++;
} else {
wfMetrics.failed++;
}
wfMetrics.totalDuration += duration;
wfMetrics.avgDuration = wfMetrics.totalDuration / wfMetrics.total;
// Update hourly metrics
const hour = new Date().toISOString().slice(0, 13);
if (!staticData.metrics.byHour[hour]) {
staticData.metrics.byHour[hour] = { total: 0, successful: 0, failed: 0 };
}
staticData.metrics.byHour[hour].total++;
if (success) {
staticData.metrics.byHour[hour].successful++;
} else {
staticData.metrics.byHour[hour].failed++;
}
// Clean up old hourly data (keep 24 hours)
const hours = Object.keys(staticData.metrics.byHour).sort();
if (hours.length > 24) {
hours.slice(0, hours.length - 24).forEach(h => {
delete staticData.metrics.byHour[h];
});
}
// Clean up execution record
delete staticData.metrics.executions[executionId];
},
// Get metrics summary
getSummary: () => {
const staticData = $getWorkflowStaticData('global');
return {
byWorkflow: staticData.metrics?.byWorkflow || {},
byHour: staticData.metrics?.byHour || {},
activeExecutions: Object.keys(staticData.metrics?.executions || {}).length
};
}
};
// Usage
const executionId = $executionId;
const workflowName = $workflow.name;
metricsCollector.recordStart(executionId, workflowName);
return [{
json: {
...$json,
_metricsContext: {
executionId,
workflowName,
startTime: Date.now()
}
}
}];
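Only recordStart is exercised above. Because metricsCollector is local to the node that defines it, a later Function node has to re-apply the same bookkeeping against static data. A trimmed sketch of the closing node (hourly rollups omitted), reusing the _metricsContext attached above and the assumed _failed flag:
// Function node at the end of the workflow
const staticData = $getWorkflowStaticData('global');
const ctx = $json._metricsContext || {};
const execution = staticData.metrics?.executions?.[ctx.executionId];
// Assumed convention: upstream error handling sets $json._failed = true
const success = $json._failed !== true;
if (execution) {
  const duration = Date.now() - execution.startTime;
  const byWorkflow = staticData.metrics.byWorkflow;
  if (!byWorkflow[ctx.workflowName]) {
    byWorkflow[ctx.workflowName] = {
      total: 0, successful: 0, failed: 0, totalDuration: 0
    };
  }
  const wf = byWorkflow[ctx.workflowName];
  wf.total++;
  if (success) { wf.successful++; } else { wf.failed++; }
  wf.totalDuration += duration;
  wf.avgDuration = wf.totalDuration / wf.total;
  // Drop the in-flight record so it no longer counts as active
  delete staticData.metrics.executions[ctx.executionId];
}
return [{ json: $json }];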
Alerting Integration
Multi-Channel Alert System
// Function node for alert routing
const alert = $json;
// Alert severity to channel mapping
const routingConfig = {
critical: {
channels: ['pagerduty', 'slack', 'email'],
slackChannel: '#incidents',
escalateAfter: 300 // 5 minutes
},
high: {
channels: ['slack', 'email'],
slackChannel: '#alerts',
escalateAfter: 1800 // 30 minutes
},
medium: {
channels: ['slack'],
slackChannel: '#alerts-low',
escalateAfter: null
},
low: {
channels: ['slack'],
slackChannel: '#alerts-info',
escalateAfter: null
}
};
const severity = alert.severity || 'medium';
const config = routingConfig[severity] || routingConfig.medium;
// Generate alerts for each channel
const alerts = [];
for (const channel of config.channels) {
alerts.push({
channel,
alert: {
...alert,
routedAt: new Date().toISOString(),
config: {
slackChannel: config.slackChannel,
escalateAfter: config.escalateAfter
}
}
});
}
return alerts.map(a => ({ json: a }));
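The router expects incoming items to carry at least a severity and a title, plus the workflow context the downstream formatters read. As a sketch, an error-handling branch might emit something like this (the error shape is an assumption):
// Example alert emitted by an error-handling branch
return [{
  json: {
    alertId: `${$workflow.id}-${Date.now()}`,
    severity: 'high',
    title: 'Order sync failed',
    workflow: $workflow.name,
    executionId: $executionId,
    status: 'failed',
    source: 'n8n-workflow',
    // Assumed error shape; adapt to what your error trigger provides
    error: $json.error?.message || 'Unknown error'
  }
}];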
PagerDuty Integration
// Function node for PagerDuty alert
const alert = $json.alert;
const pagerdutyPayload = {
routing_key: $env.PAGERDUTY_ROUTING_KEY,
event_action: alert.resolved ? 'resolve' : 'trigger',
dedup_key: alert.alertId,
payload: {
summary: alert.title,
severity: mapSeverity(alert.severity),
source: alert.source || 'n8n-workflow',
timestamp: new Date().toISOString(),
custom_details: {
workflow: alert.workflow,
error: alert.error,
executionId: alert.executionId,
metrics: alert.metrics
}
},
links: [
{
href: `${$env.N8N_URL}/execution/${alert.executionId}`,
text: 'View Execution'
}
]
};
function mapSeverity(severity) {
const mapping = {
'critical': 'critical',
'high': 'error',
'medium': 'warning',
'low': 'info'
};
return mapping[severity] || 'warning';
}
return [{
json: pagerdutyPayload
}];
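Send the payload with an HTTP Request node as a JSON POST to PagerDuty's Events API v2 endpoint, https://events.pagerduty.com/v2/enqueue. For illustration, the plain Node.js equivalent of that request:
// Plain Node.js equivalent of the HTTP Request node (illustration only)
async function sendToPagerDuty(payload) {
  const res = await fetch('https://events.pagerduty.com/v2/enqueue', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
  if (!res.ok) {
    throw new Error(`PagerDuty rejected event: ${res.status}`);
  }
  return res.json();
}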
Slack Alert Formatting
// Function node for Slack alert formatting
const alert = $json.alert;
const severityConfig = {
critical: { emoji: '🔴', color: '#FF0000' },
high: { emoji: '🟠', color: '#FF8C00' },
medium: { emoji: '🟡', color: '#FFD700' },
low: { emoji: '🟢', color: '#32CD32' }
};
const config = severityConfig[alert.severity] || severityConfig.medium;
const message = {
channel: alert.config?.slackChannel || '#alerts',
attachments: [
{
color: config.color,
blocks: [
{
type: 'header',
text: {
type: 'plain_text',
text: `${config.emoji} ${(alert.severity || 'medium').toUpperCase()}: ${alert.title}`,
emoji: true
}
},
{
type: 'section',
fields: [
{
type: 'mrkdwn',
text: `*Workflow:*\n${alert.workflow}`
},
{
type: 'mrkdwn',
text: `*Time:*\n<!date^${Math.floor(Date.now()/1000)}^{date_short_pretty} {time}|now>`
},
{
type: 'mrkdwn',
text: `*Execution ID:*\n${alert.executionId}`
},
{
type: 'mrkdwn',
text: `*Status:*\n${alert.status}`
}
]
}
]
}
]
};
// Add error details if present
if (alert.error) {
message.attachments[0].blocks.push({
type: 'section',
text: {
type: 'mrkdwn',
text: `*Error:*\n\`\`\`${alert.error.substring(0, 500)}\`\`\``
}
});
}
// Add action buttons
message.attachments[0].blocks.push({
type: 'actions',
elements: [
{
type: 'button',
text: { type: 'plain_text', text: '📋 View Execution' },
url: `${$env.N8N_URL}/execution/${alert.executionId}`
},
{
type: 'button',
text: { type: 'plain_text', text: '✅ Acknowledge' },
action_id: 'ack_alert',
value: alert.alertId
},
{
type: 'button',
text: { type: 'plain_text', text: '🔇 Silence 1h' },
action_id: 'silence_alert',
value: JSON.stringify({ alertId: alert.alertId, duration: 3600 })
}
]
});
return [{ json: message }];
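The message nests Block Kit blocks inside a legacy attachment so Slack renders the colored sidebar. Post it with the Slack node, or, shown here as a plain Node.js sketch, via chat.postMessage (the bot token environment variable is an assumption):
// Plain Node.js equivalent of posting the formatted message
async function postToSlack(message) {
  const res = await fetch('https://slack.com/api/chat.postMessage', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json; charset=utf-8',
      // Assumed env var holding a bot token with chat:write scope
      'Authorization': `Bearer ${process.env.SLACK_BOT_TOKEN}`
    },
    body: JSON.stringify(message)
  });
  const data = await res.json();
  if (!data.ok) {
    throw new Error(`Slack API error: ${data.error}`);
  }
  return data;
}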
Self-Healing Patterns
Automatic Retry with Backoff
// Function node for intelligent retry
const maxRetries = 5;
const baseDelay = 1000; // 1 second
const currentRetry = $json.retryCount || 0;
const lastError = $json.lastError;
// Check if we should retry
if (currentRetry >= maxRetries) {
// Max retries exceeded - escalate
return [{
json: {
action: 'escalate',
reason: 'Max retries exceeded',
totalRetries: currentRetry,
lastError,
originalData: $json.originalData
}
}];
}
// Calculate backoff delay with jitter
const exponentialDelay = baseDelay * Math.pow(2, currentRetry);
const jitter = Math.random() * 1000;
const delay = Math.min(exponentialDelay + jitter, 60000); // Max 1 minute
// Determine if error is retryable
const retryableErrors = [
'ECONNRESET',
'ETIMEDOUT',
'ECONNREFUSED',
'rate_limit',
'429',
'503',
'504'
];
const errorText = typeof lastError === 'string'
? lastError
: JSON.stringify(lastError || '');
const isRetryable = retryableErrors.some(
e => errorText.includes(e) || lastError?.code === e
);
if (!isRetryable) {
return [{
json: {
action: 'fail',
reason: 'Non-retryable error',
error: lastError,
originalData: $json.originalData
}
}];
}
// Schedule retry
return [{
json: {
action: 'retry',
retryCount: currentRetry + 1,
delay,
scheduledFor: new Date(Date.now() + delay).toISOString(),
originalData: $json.originalData || $json
}
}];
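Route the three action values with a Switch node: retry items pass through a Wait node driven by the delay field and loop back to the original call, fail items go to a dead-letter branch, and escalate items can feed the alert router from the previous section. A sketch of that last hop:
// Function node turning an escalation into an alert for the router
const escalation = $json;
return [{
  json: {
    alertId: `retry-exhausted-${$executionId}`,
    severity: 'critical',
    title: `Retries exhausted after ${escalation.totalRetries} attempts`,
    workflow: $workflow.name,
    executionId: $executionId,
    status: 'failed',
    error: typeof escalation.lastError === 'string'
      ? escalation.lastError
      : JSON.stringify(escalation.lastError)
  }
}];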
Circuit Breaker Implementation
// Function node for circuit breaker
const serviceName = $json.serviceName || 'default';
const staticData = $getWorkflowStaticData('global');
// Initialize circuit breaker state
if (!staticData.circuitBreakers) {
staticData.circuitBreakers = {};
}
if (!staticData.circuitBreakers[serviceName]) {
staticData.circuitBreakers[serviceName] = {
state: 'closed', // closed, open, half-open
failures: 0,
successes: 0,
lastFailure: null,
openedAt: null
};
}
const breaker = staticData.circuitBreakers[serviceName];
const config = {
failureThreshold: 5,
successThreshold: 3,
timeout: 60000 // 1 minute
};
// Check circuit state
const now = Date.now();
if (breaker.state === 'open') {
// Check if timeout has passed
if (now - breaker.openedAt > config.timeout) {
breaker.state = 'half-open';
breaker.successes = 0;
console.log(`Circuit breaker for ${serviceName} entering half-open state`);
} else {
// Circuit is open - fail fast
return [{
json: {
proceed: false,
reason: 'Circuit breaker open',
service: serviceName,
retryAfter: new Date(breaker.openedAt + config.timeout).toISOString()
}
}];
}
}
// Record result helper
const recordResult = (success) => {
if (success) {
breaker.failures = 0;
if (breaker.state === 'half-open') {
breaker.successes++;
if (breaker.successes >= config.successThreshold) {
breaker.state = 'closed';
console.log(`Circuit breaker for ${serviceName} closed`);
}
}
} else {
breaker.failures++;
breaker.lastFailure = now;
if (breaker.failures >= config.failureThreshold) {
breaker.state = 'open';
breaker.openedAt = now;
console.log(`Circuit breaker for ${serviceName} opened`);
}
}
};
return [{
json: {
proceed: true,
service: serviceName,
circuitState: breaker.state
// Note: functions do not survive n8n's item serialization, so the
// recordResult helper cannot be passed along as data. Apply the same
// logic in a downstream Function node instead (see the sketch below).
}
}];
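Since the helper cannot travel with the item, a small Function node placed after the guarded service call (and wired to its error branch) updates the breaker instead. A minimal sketch, mirroring the thresholds configured above and the assumed _failed flag:
// Function node run after the guarded service call
const staticData = $getWorkflowStaticData('global');
const breaker = staticData.circuitBreakers?.[$json.service];
// Assumed convention: the error branch sets $json._failed = true
const success = $json._failed !== true;
if (breaker) {
  if (success) {
    breaker.failures = 0;
    // Thresholds mirror the config above (successThreshold: 3)
    if (breaker.state === 'half-open' && ++breaker.successes >= 3) {
      breaker.state = 'closed';
    }
  } else {
    breaker.lastFailure = Date.now();
    // failureThreshold: 5, as configured above
    if (++breaker.failures >= 5) {
      breaker.state = 'open';
      breaker.openedAt = Date.now();
    }
  }
}
return [{ json: $json }];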
Automatic Failover
// Function node for service failover
const primaryEndpoint = $env.PRIMARY_API_URL;
const failoverEndpoint = $env.FAILOVER_API_URL;
const staticData = $getWorkflowStaticData('global');
// Track endpoint health
if (!staticData.endpointHealth) {
staticData.endpointHealth = {
primary: { healthy: true, lastCheck: Date.now(), failures: 0 },
failover: { healthy: true, lastCheck: Date.now(), failures: 0 }
};
}
const health = staticData.endpointHealth;
// Determine which endpoint to use
let selectedEndpoint;
let endpointType;
if (health.primary.healthy) {
selectedEndpoint = primaryEndpoint;
endpointType = 'primary';
} else if (health.failover.healthy) {
selectedEndpoint = failoverEndpoint;
endpointType = 'failover';
console.log('Using failover endpoint');
} else {
// Both unhealthy - try primary anyway with fresh start
health.primary.failures = 0;
selectedEndpoint = primaryEndpoint;
endpointType = 'primary';
console.log('Both endpoints unhealthy, resetting to primary');
}
// Health check function
const recordEndpointResult = (type, success) => {
const endpoint = staticData.endpointHealth[type];
if (success) {
endpoint.failures = 0;
endpoint.healthy = true;
} else {
endpoint.failures++;
if (endpoint.failures >= 3) {
endpoint.healthy = false;
console.log(`Endpoint ${type} marked unhealthy after ${endpoint.failures} failures`);
}
}
endpoint.lastCheck = Date.now();
};
return [{
json: {
endpoint: selectedEndpoint,
endpointType,
originalPayload: $json
// Note: a function value would be dropped when this item is serialized.
// Record the outcome in a downstream Function node instead, using
// endpointType to find the right entry in staticData.endpointHealth.
}
}];
Dashboard and Reporting
Workflow Status Dashboard Data
// Function node to generate dashboard data
const staticData = $getWorkflowStaticData('global');
// Get all workflows status
const workflows = [
'data-sync',
'order-processing',
'notification-sender',
'report-generator'
];
const dashboardData = {
timestamp: new Date().toISOString(),
systemHealth: 'healthy',
workflows: [],
summary: {
totalExecutions24h: 0,
successRate24h: 0,
avgExecutionTime: 0,
activeAlerts: 0
}
};
// Note: byWorkflow metrics are cumulative since last reset; for a strict
// 24-hour window, aggregate staticData.metrics.byHour instead.
for (const workflow of workflows) {
const metrics = staticData.metrics?.byWorkflow?.[workflow] || {
total: 0,
successful: 0,
failed: 0,
avgDuration: 0
};
const successRate = metrics.total > 0
? Number((metrics.successful / metrics.total * 100).toFixed(1))
: 100;
let health = 'healthy';
if (successRate < 95) health = 'degraded';
if (successRate < 80) health = 'unhealthy';
dashboardData.workflows.push({
name: workflow,
health,
metrics: {
total: metrics.total,
successful: metrics.successful,
failed: metrics.failed,
successRate,
avgDuration: Math.round(metrics.avgDuration)
},
lastExecution: metrics.lastExecution || null
});
dashboardData.summary.totalExecutions24h += metrics.total;
}
// Calculate overall success rate
const totalSuccessful = dashboardData.workflows
.reduce((sum, w) => sum + w.metrics.successful, 0);
dashboardData.summary.successRate24h = dashboardData.summary.totalExecutions24h > 0
? Number((totalSuccessful / dashboardData.summary.totalExecutions24h * 100).toFixed(1))
: 100;
// Determine system health
const unhealthyCount = dashboardData.workflows
.filter(w => w.health === 'unhealthy').length;
const degradedCount = dashboardData.workflows
.filter(w => w.health === 'degraded').length;
if (unhealthyCount > 0) {
dashboardData.systemHealth = 'unhealthy';
} else if (degradedCount > 0) {
dashboardData.systemHealth = 'degraded';
}
return [{ json: dashboardData }];
Best Practices Summary
- Comprehensive Health Checks - Monitor workflow execution, duration, and error rates
- Metrics Collection - Export metrics to Prometheus or similar for visualization
- Multi-Channel Alerting - Route alerts based on severity to appropriate channels
- Self-Healing Patterns - Implement retries, circuit breakers, and failover
- Dashboard Visibility - Provide real-time visibility into workflow health
By implementing these monitoring patterns, you can build reliable, observable n8n workflows that automatically recover from failures.