Document processing automation combines OCR, AI extraction, and workflow orchestration to handle invoices, contracts, forms, and other business documents at scale. This guide shows you how to build intelligent document processing pipelines with n8n.
Document Classification System
Automatically classify incoming documents by type:
// Document Classification Node - JavaScript Code
// Signature table consumed by classifyDocument(): one entry per supported document type.
//   keywords - phrases whose occurrences are counted in the OCR text
//   patterns - regular expressions, each tested once against the OCR text
//   weight   - initialised to 0 for every type; classifyDocument() does not read it
const documentTypes = {
  invoice: {
    keywords: ['invoice', 'bill to', 'payment due', 'total amount', 'subtotal', 'tax'],
    patterns: [/invoice\s*(number|no|#)/i, /amount\s*due/i, /payment\s*terms/i],
    weight: 0,
  },

  contract: {
    keywords: ['agreement', 'parties', 'whereas', 'terms and conditions', 'hereby', 'witness'],
    patterns: [/party\s*of\s*the\s*(first|second)/i, /effective\s*date/i, /termination/i],
    weight: 0,
  },

  receipt: {
    keywords: ['receipt', 'paid', 'thank you', 'transaction', 'change', 'cashier'],
    patterns: [/receipt\s*(number|no|#)/i, /payment\s*received/i, /thank\s*you\s*for/i],
    weight: 0,
  },

  purchase_order: {
    keywords: ['purchase order', 'po number', 'ship to', 'vendor', 'quantity', 'unit price'],
    patterns: [/p\.?o\.?\s*(number|no|#)/i, /ship\s*to/i, /delivery\s*date/i],
    weight: 0,
  },

  application_form: {
    keywords: ['application', 'applicant', 'signature', 'date of birth', 'address', 'phone'],
    patterns: [/applicant\s*information/i, /please\s*(print|fill)/i, /signature\s*date/i],
    weight: 0,
  },
};
/**
 * Scores OCR text against every entry of `documentTypes` and reports the best match.
 *
 * Scoring: each keyword occurrence adds 2 points, each pattern that matches adds 5.
 * A type is only reported when its score reaches 5; otherwise the result is 'unknown'.
 *
 * @param {string} text - OCR output. Non-string input is scored as an empty document.
 * @returns {{documentType: string, confidence: number, scores: Object<string, number>, needsReview: boolean}}
 *   `confidence` is the winner's share of all points (0-100, rounded).
 *   `needsReview` is true when the type is unknown, confidence is below 60,
 *   or the winner leads the runner-up by fewer than 3 points.
 */
function classifyDocument(text) {
  // OCR may yield nothing at all; score that as empty text instead of throwing.
  const sourceText = typeof text === 'string' ? text : '';
  const normalizedText = sourceText.toLowerCase();
  const results = {};

  for (const [docType, config] of Object.entries(documentTypes)) {
    let score = 0;

    // Keyword matching. Keywords are literal phrases, so escape regex metacharacters
    // before compiling them (otherwise a keyword such as "p.o." would over-match).
    for (const keyword of config.keywords) {
      const literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const occurrences = normalizedText.match(new RegExp(literal, 'gi')) || [];
      score += occurrences.length * 2;
    }

    // Pattern matching: a pattern counts once no matter how often it occurs.
    for (const pattern of config.patterns) {
      if (pattern.test(sourceText)) {
        score += 5;
      }
    }

    results[docType] = score;
  }

  // Rank types by score; the fallbacks cover an empty or single-entry signature table.
  const sorted = Object.entries(results).sort((a, b) => b[1] - a[1]);
  const [topType, topScore] = sorted[0] ?? ['unknown', 0];
  const [, secondScore] = sorted[1] ?? ['unknown', 0];

  const totalScore = Object.values(results).reduce((a, b) => a + b, 0);
  const confidence = totalScore > 0 ? (topScore / totalScore) * 100 : 0;
  const documentType = topScore >= 5 ? topType : 'unknown';

  return {
    documentType,
    confidence: Math.round(confidence),
    scores: results,
    // An unclassified document must never skip review, even if its few points
    // all landed on one type (which would otherwise read as 100% confidence).
    needsReview: documentType === 'unknown' || confidence < 60 || topScore - secondScore < 3,
  };
}
// Get OCR text from previous node
const ocrText = $input.first().json.extractedText;
// Classify the document
const classification = classifyDocument(ocrText);
return [{
json: {
...classification,
originalText: ocrText,
timestamp: new Date().toISOString()
}
}];Invoice Data Extraction
Extract structured data from invoices using AI:
// Invoice Extraction Node - Function
/**
 * Asks an AI model to turn raw invoice text into a structured object.
 *
 * @param {string} text - Invoice text (e.g. OCR output) embedded in the prompt.
 * @param {{complete: Function}} aiModel - Client whose `complete({prompt, maxTokens, temperature})`
 *   is awaited and expected to resolve to the model's reply as a string.
 * @returns {Promise<*>} The parsed JSON reply.
 * @throws {Error} 'Failed to parse AI response' when the reply is not a string or contains
 *   no parseable JSON; the underlying parse error is attached as `cause`.
 */
async function extractInvoiceData(text, aiModel) {
  const extractionPrompt = `
Extract the following information from this invoice text. Return a JSON object with these fields:
- vendorName: Company name on the invoice
- vendorAddress: Full address of vendor
- invoiceNumber: Invoice or bill number
- invoiceDate: Date of invoice (YYYY-MM-DD format)
- dueDate: Payment due date (YYYY-MM-DD format)
- customerName: Bill to / customer name
- customerAddress: Customer address
- lineItems: Array of {description, quantity, unitPrice, total}
- subtotal: Subtotal amount (number)
- taxRate: Tax percentage if shown
- taxAmount: Tax amount (number)
- totalAmount: Total amount due (number)
- currency: Currency code (USD, EUR, etc.)
- paymentTerms: Payment terms if specified
- notes: Any special notes or comments
If a field is not found, use null. For amounts, extract numbers only.
Invoice Text:
${text}
`;

  // This would call your AI model (OpenAI, Claude, etc.)
  const response = await aiModel.complete({
    prompt: extractionPrompt,
    maxTokens: 2000,
    temperature: 0.1,
  });

  // Everything below relies on string methods; fail with the documented error
  // instead of an unrelated "response.match is not a function".
  if (typeof response !== 'string') {
    throw new Error('Failed to parse AI response');
  }

  try {
    return JSON.parse(response);
  } catch (directError) {
    // Models often wrap the JSON in prose or code fences: take the outermost {...} span.
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      throw new Error('Failed to parse AI response', { cause: directError });
    }
    try {
      return JSON.parse(jsonMatch[0]);
    } catch (embeddedError) {
      throw new Error('Failed to parse AI response', { cause: embeddedError });
    }
  }
}
// Validation and normalization
/**
 * Checks an extracted invoice for missing required fields and inconsistent amounts.
 *
 * Errors are raised for missing required fields; warnings for a malformed invoice
 * date, line items that do not add up to the subtotal, and subtotal + tax that
 * does not equal the total (tolerance 0.01).
 *
 * @param {object} data - Extracted invoice fields; fields that were not found are null.
 * @returns {{isValid: boolean, errors: string[], warnings: string[], confidenceScore: number}}
 */
function validateInvoiceData(data) {
  const invoice = data ?? {};
  const errors = [];
  const warnings = [];

  const isMissing = (value) => value === null || value === undefined || value === '';

  // Amounts can arrive as numbers or as numeric strings ("1,250.00").
  // null means "not available", which is different from an amount of 0.
  const toAmount = (value) => {
    if (isMissing(value)) return null;
    const parsed =
      typeof value === 'number' ? value : Number.parseFloat(String(value).replace(/,/g, ''));
    return Number.isFinite(parsed) ? parsed : null;
  };

  // Required fields. A total of 0 is a real value (e.g. a fully credited invoice),
  // so test for absence rather than falsiness.
  const requiredFields = ['invoiceNumber', 'invoiceDate', 'totalAmount', 'vendorName'];
  for (const field of requiredFields) {
    if (isMissing(invoice[field])) {
      errors.push(`Missing required field: ${field}`);
    }
  }

  // Validate dates
  if (invoice.invoiceDate && !/^\d{4}-\d{2}-\d{2}$/.test(invoice.invoiceDate)) {
    warnings.push('Invoice date format may be incorrect');
  }

  const subtotal = toAmount(invoice.subtotal);
  const taxAmount = toAmount(invoice.taxAmount);
  const totalAmount = toAmount(invoice.totalAmount);

  // Validate amounts. Without a subtotal there is nothing to compare against,
  // so the check is skipped instead of comparing with 0.
  if (subtotal !== null && Array.isArray(invoice.lineItems) && invoice.lineItems.length > 0) {
    const calculatedSubtotal = invoice.lineItems.reduce(
      (sum, item) => sum + (toAmount(item?.total) ?? 0),
      0
    );
    if (Math.abs(calculatedSubtotal - subtotal) > 0.01) {
      warnings.push('Line item totals do not match subtotal');
    }
  }

  // Validate total calculation; a missing tax amount is treated as no tax.
  if (subtotal !== null && totalAmount !== null) {
    const expectedTotal = subtotal + (taxAmount ?? 0);
    if (Math.abs(expectedTotal - totalAmount) > 0.01) {
      warnings.push('Subtotal + tax does not equal total');
    }
  }

  return {
    isValid: errors.length === 0,
    errors,
    warnings,
    confidenceScore: calculateConfidence(invoice, errors, warnings),
  };
}
/**
 * Turns validation findings into a 0-100 confidence score.
 *
 * Starts at 100 and subtracts 20 per error, 5 per warning and 3 for each
 * optional field (dueDate, customerName, paymentTerms) that is falsy.
 *
 * @param {object} data - Extracted invoice fields.
 * @param {Array} errors - Validation errors.
 * @param {Array} warnings - Validation warnings.
 * @returns {number} Score clamped to the range 0-100.
 */
function calculateConfidence(data, errors, warnings) {
  const absentOptionalCount = ['dueDate', 'customerName', 'paymentTerms'].filter(
    (name) => !data[name]
  ).length;

  const unclamped = 100 - errors.length * 20 - warnings.length * 5 - absentOptionalCount * 3;

  if (unclamped < 0) return 0;
  if (unclamped > 100) return 100;
  return unclamped;
}
// Main extraction workflow
const ocrText = $input.first().json.extractedText;
const extractedData = await extractInvoiceData(ocrText, $ai);
const validation = validateInvoiceData(extractedData);
return [{
json: {
invoiceData: extractedData,
validation,
processingTimestamp: new Date().toISOString(),
requiresManualReview: !validation.isValid || validation.confidenceScore < 80
}
}];Contract Analysis Pipeline
Analyze contracts for key terms and obligations:
// Contract Analysis Node
// Instruction text for the contract-analysis model call. analyzeContract() sends it
// followed by "\n\nContract:\n" and the contract text.
// It asks for eight numbered sections and a risk_level (low/medium/high) per section.
// NOTE(review): the prompt does not spell out the JSON key names, while
// extractCalendarEvents() and generateAlerts() read keys such as KEY_DATES,
// RISK_FACTORS and COMPLIANCE_REQUIREMENTS — confirm the model returns those names.
const analysisPrompt = `
Analyze this contract and extract:
1. PARTIES:
- List all parties involved with their roles
2. KEY DATES:
- Effective date
- Termination date
- Renewal dates
- Important deadlines
3. FINANCIAL TERMS:
- Payment amounts
- Payment schedule
- Penalties/fees
- Price adjustments
4. OBLIGATIONS:
- List key obligations for each party
- Deliverables and timelines
5. TERMINATION CLAUSES:
- Termination conditions
- Notice periods
- Exit procedures
6. RISK FACTORS:
- Liability limitations
- Indemnification clauses
- Insurance requirements
- Warranties/guarantees
7. COMPLIANCE REQUIREMENTS:
- Regulatory requirements mentioned
- Data protection obligations
- Audit rights
8. SPECIAL PROVISIONS:
- Non-compete clauses
- Confidentiality terms
- IP ownership
- Exclusivity
Return as structured JSON with risk_level (low/medium/high) for each section.
`;
/**
 * Sends the contract to the AI model and augments its JSON reply with an overall risk rating.
 *
 * Each top-level section of the reply may carry a `risk_level` of low/medium/high
 * (scored 1/2/3, case-insensitively). The overall score is the mean over the sections
 * that carry a recognised level; with no rated section the score is 0 and the level 'low'.
 *
 * @param {string} contractText - Contract text appended to `analysisPrompt`.
 * @returns {Promise<object>} The parsed analysis plus `overallRiskScore`,
 *   `overallRiskLevel` and `analyzedAt` (ISO timestamp).
 * @throws {SyntaxError} When the reply contains no parseable JSON.
 * @throws {TypeError} When the parsed reply is not a JSON object.
 */
async function analyzeContract(contractText) {
  const response = await $ai.complete({
    prompt: `${analysisPrompt}\n\nContract:\n${contractText}`,
    maxTokens: 4000,
    temperature: 0.2,
  });

  let analysis;
  try {
    analysis = JSON.parse(response);
  } catch (directError) {
    // Same tolerance as the invoice extractor: accept JSON wrapped in prose or code fences.
    const embedded = typeof response === 'string' ? response.match(/\{[\s\S]*\}/) : null;
    if (!embedded) {
      throw new SyntaxError('Failed to parse AI response', { cause: directError });
    }
    try {
      analysis = JSON.parse(embedded[0]);
    } catch (embeddedError) {
      throw new SyntaxError('Failed to parse AI response', { cause: embeddedError });
    }
  }

  if (analysis === null || typeof analysis !== 'object' || Array.isArray(analysis)) {
    throw new TypeError('AI response is not a JSON object');
  }

  // Calculate overall risk score
  const riskLevels = new Map([
    ['low', 1],
    ['medium', 2],
    ['high', 3],
  ]);
  let totalRisk = 0;
  let riskCount = 0;

  for (const section of Object.values(analysis)) {
    // Sections can be null or plain strings; only objects with a textual risk_level count.
    const rawLevel = section?.risk_level;
    if (typeof rawLevel !== 'string') continue;

    const points = riskLevels.get(rawLevel.trim().toLowerCase());
    // An unrecognised level must not be averaged in as 0: that would pull the score towards "low".
    if (points === undefined) continue;

    totalRisk += points;
    riskCount++;
  }

  const avgRisk = riskCount > 0 ? totalRisk / riskCount : 0;

  return {
    ...analysis,
    overallRiskScore: avgRisk,
    overallRiskLevel: avgRisk < 1.5 ? 'low' : avgRisk < 2.5 ? 'medium' : 'high',
    analyzedAt: new Date().toISOString(),
  };
}
// Extract key dates for calendar integration
/**
 * Builds calendar entries from the KEY_DATES section of a contract analysis.
 *
 * - termination_date: a reminder 30 days before, plus the deadline itself
 * - renewal_dates: a review entry 60 days before each renewal
 *
 * Values that cannot be parsed as dates (e.g. "TBD") are skipped instead of
 * aborting the whole run. `renewal_dates` may be an array or a single value.
 *
 * @param {object} analysis - Contract analysis; events are read from `analysis.KEY_DATES`.
 * @returns {Array<{title: string, date: string, description: string, type: string}>}
 *   Events with ISO-8601 `date` strings; empty when no usable dates exist.
 */
function extractCalendarEvents(analysis) {
  const events = [];
  const dates = analysis?.KEY_DATES;
  if (!dates) {
    return events;
  }

  // Returns a Date, or null for empty/unparseable input (toISOString throws on Invalid Date).
  const toValidDate = (value) => {
    if (!value) return null;
    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? null : parsed;
  };

  // Shift in UTC so the offset is exactly `days` calendar days regardless of the
  // server's time zone or a DST change in between.
  const daysBefore = (date, days) => {
    const shifted = new Date(date);
    shifted.setUTCDate(shifted.getUTCDate() - days);
    return shifted;
  };

  const termDate = toValidDate(dates.termination_date);
  if (termDate) {
    // Reminder 30 days before termination
    events.push({
      title: 'Contract Termination Reminder',
      date: daysBefore(termDate, 30).toISOString(),
      description: `Contract terminates on ${dates.termination_date}`,
      type: 'reminder',
    });
    events.push({
      title: 'Contract Termination',
      date: termDate.toISOString(),
      description: 'Contract termination date',
      type: 'deadline',
    });
  }

  // The model may return one renewal date as a bare string; iterating a string
  // would walk its characters, so wrap single values in an array.
  const rawRenewals = dates.renewal_dates;
  let renewalDates = [];
  if (Array.isArray(rawRenewals)) {
    renewalDates = rawRenewals;
  } else if (rawRenewals) {
    renewalDates = [rawRenewals];
  }

  for (const renewalDate of renewalDates) {
    const renDate = toValidDate(renewalDate);
    if (!renDate) continue;

    events.push({
      title: 'Contract Renewal Review',
      date: daysBefore(renDate, 60).toISOString(),
      description: `Review contract before renewal on ${renewalDate}`,
      type: 'review',
    });
  }

  return events;
}
const contractText = $input.first().json.extractedText;
const analysis = await analyzeContract(contractText);
const calendarEvents = extractCalendarEvents(analysis);
return [{
json: {
contractAnalysis: analysis,
calendarEvents,
alerts: generateAlerts(analysis)
}
}];
function generateAlerts(analysis) {
const alerts = [];
if (analysis.overallRiskLevel === 'high') {
alerts.push({
level: 'critical',
message: 'High-risk contract requires legal review',
action: 'route_to_legal'
});
}
if (analysis.RISK_FACTORS?.liability_limitations?.risk_level === 'high') {
alerts.push({
level: 'warning',
message: 'Significant liability limitations detected',
action: 'review_required'
});
}
if (analysis.COMPLIANCE_REQUIREMENTS?.data_protection_obligations) {
alerts.push({
level: 'info',
message: 'Data protection obligations identified',
action: 'notify_dpo'
});
}
return alerts;
}Form Data Extraction
Process application forms and surveys:
// Form Field Extraction Node
// Field definitions per form type. Each field has a `name` (key in the extracted data),
// a `label` and a `type`; application-form fields also state whether they are `required`,
// and `sensitive: true` marks values that maskSensitiveData() masks.
const formExtractionConfig = {
  application_form: {
    fields: [
      { name: 'applicantName', label: 'Full Name', type: 'text', required: true },
      { name: 'dateOfBirth', label: 'Date of Birth', type: 'date', required: true },
      { name: 'email', label: 'Email', type: 'email', required: true },
      { name: 'phone', label: 'Phone', type: 'phone', required: true },
      { name: 'address', label: 'Address', type: 'address', required: true },
      { name: 'ssn', label: 'SSN', type: 'ssn', required: false, sensitive: true },
      { name: 'signature', label: 'Signature', type: 'signature', required: true },
      { name: 'signatureDate', label: 'Date', type: 'date', required: true },
    ],
  },

  survey: {
    fields: [
      { name: 'respondentId', label: 'ID', type: 'text' },
      { name: 'responses', label: 'Responses', type: 'array' },
    ],
  },
};
/**
 * Asks the AI model to extract the configured fields of a form from its text.
 *
 * @param {string} text - Document text embedded in the prompt.
 * @param {string} formType - Key of `formExtractionConfig` that selects the field list.
 * @returns {Promise<*>} The parsed JSON reply (expected: field names as keys).
 * @throws {Error} `Unknown form type: …` when `formType` is not configured.
 * @throws {SyntaxError} When the reply contains no parseable JSON.
 */
async function extractFormFields(text, formType) {
  // Own-property lookup: inherited keys such as "constructor" are not form types.
  const config = Object.hasOwn(formExtractionConfig, formType)
    ? formExtractionConfig[formType]
    : undefined;
  if (!config) {
    throw new Error(`Unknown form type: ${formType}`);
  }

  const fieldDescriptions = config.fields
    .map((f) => `- ${f.name}: ${f.label} (${f.type}${f.required ? ', required' : ''})`)
    .join('\n');

  const prompt = `
Extract form field values from this document. Fields to extract:
${fieldDescriptions}
Rules:
- For dates, use YYYY-MM-DD format
- For phone numbers, use E.164 format (+1XXXXXXXXXX)
- For addresses, return as structured object with street, city, state, zip, country
- If a field is not found or illegible, use null
- For checkboxes, use true/false
- For signatures, return "present" or "missing"
Document text:
${text}
Return as JSON with field names as keys.
`;

  const response = await $ai.complete({
    prompt,
    maxTokens: 2000,
    temperature: 0.1,
  });

  try {
    return JSON.parse(response);
  } catch (directError) {
    // Accept replies that wrap the JSON in prose or code fences, as the invoice extractor does.
    const embedded = typeof response === 'string' ? response.match(/\{[\s\S]*\}/) : null;
    if (!embedded) {
      throw new SyntaxError('Failed to parse AI response', { cause: directError });
    }
    try {
      return JSON.parse(embedded[0]);
    } catch (embeddedError) {
      throw new SyntaxError('Failed to parse AI response', { cause: embeddedError });
    }
  }
}
/**
 * Validates extracted form values against the field definitions of a form type.
 *
 * Missing required fields and an undetected signature are errors; malformed
 * email, phone, date and SSN values are warnings. Blank optional fields are skipped.
 *
 * @param {object} data - Extracted values keyed by field name.
 * @param {string} formType - Key of `formExtractionConfig`.
 * @returns {{isComplete: boolean, errors: string[], warnings: string[], completionPercentage: number}}
 * @throws {Error} `Unknown form type: …` when `formType` is not configured.
 */
function validateFormData(data, formType) {
  const config = Object.hasOwn(formExtractionConfig, formType)
    ? formExtractionConfig[formType]
    : undefined;
  if (!config) {
    throw new Error(`Unknown form type: ${formType}`);
  }

  const formData = data ?? {};
  const errors = [];
  const warnings = [];
  const isBlank = (value) => value === null || value === undefined || value === '';

  for (const field of config.fields) {
    const value = formData[field.name];

    // A blank value is only a problem for required fields. Blank optional fields are
    // skipped entirely, otherwise `undefined` would be format-checked as the text "undefined".
    if (isBlank(value)) {
      if (field.required) {
        errors.push(`Missing required field: ${field.label}`);
      }
      continue;
    }

    // Type-specific validation
    switch (field.type) {
      case 'email':
        if (!/^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value)) {
          warnings.push(`Invalid email format: ${value}`);
        }
        break;
      case 'phone':
        if (!/^\+?[\d\s\-()]+$/.test(value)) {
          warnings.push(`Invalid phone format: ${value}`);
        }
        break;
      case 'date':
        if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) {
          warnings.push(`Invalid date format for ${field.label}: ${value}`);
        }
        break;
      case 'ssn':
        // Should be masked or valid format; the raw value is deliberately kept out of the message.
        if (!/^\d{3}-?\d{2}-?\d{4}$|^\*{3}-\*{2}-\d{4}$/.test(value)) {
          warnings.push('SSN format appears invalid');
        }
        break;
      case 'signature':
        if (value !== 'present') {
          errors.push('Signature not detected');
        }
        break;
    }
  }

  return {
    isComplete: errors.length === 0,
    errors,
    warnings,
    completionPercentage: calculateCompletion(formData, config.fields),
  };
}
/**
 * Percentage of fields that hold a value (anything other than null, undefined or '').
 *
 * @param {object} data - Extracted values keyed by field name; null/undefined counts as no values.
 * @param {Array<{name: string}>} fields - Field definitions to check.
 * @returns {number} Whole-number percentage, 0-100. An empty field list yields 100
 *   (nothing is missing) rather than NaN from a division by zero.
 */
function calculateCompletion(data, fields) {
  const totalFields = fields.length;
  if (totalFields === 0) {
    return 100;
  }

  const values = data ?? {};
  const completedFields = fields.filter((field) => {
    const value = values[field.name];
    return value !== null && value !== undefined && value !== '';
  }).length;

  return Math.round((completedFields / totalFields) * 100);
}
// Mask sensitive data before storage
/**
 * Returns a shallow copy of the form data with sensitive fields masked.
 *
 * SSN fields keep only their last four digits (`***-**-1234`); any other
 * sensitive field becomes `***REDACTED***`. The input object is not modified.
 *
 * @param {object} data - Extracted values keyed by field name.
 * @param {string} formType - Key of `formExtractionConfig`.
 * @returns {object} Masked copy of `data`.
 * @throws {Error} `Unknown form type: …` when `formType` is not configured.
 */
function maskSensitiveData(data, formType) {
  const config = formExtractionConfig[formType];
  if (!config) {
    throw new Error(`Unknown form type: ${formType}`);
  }

  const masked = { ...data };

  for (const field of config.fields) {
    if (!field.sensitive) continue;

    const value = masked[field.name];
    // Only truly absent values are left alone; a falsy value such as 0 is still data to hide.
    if (value === null || value === undefined || value === '') continue;

    if (field.type === 'ssn') {
      // The value may be a number or contain separators/whitespace. Work on its digits only,
      // so the tail never exposes anything but the last four digits.
      const digits = String(value).replace(/\D/g, '');
      masked[field.name] = digits.length >= 4 ? `***-**-${digits.slice(-4)}` : '***-**-****';
    } else {
      // Generic masking
      masked[field.name] = '***REDACTED***';
    }
  }

  return masked;
}
// Main processing
const { extractedText, documentType } = $input.first().json;
const extractedData = await extractFormFields(extractedText, documentType);
const validation = validateFormData(extractedData, documentType);
const maskedData = maskSensitiveData(extractedData, documentType);
return [{
json: {
formType: documentType,
extractedData: maskedData,
rawDataReference: `secure-vault://${generateSecureId()}`, // Store sensitive data separately
validation,
processedAt: new Date().toISOString()
}
}];
function generateSecureId() {
return 'doc_' + Math.random().toString(36).substr(2, 9);
}Document Routing Workflow
Route processed documents to appropriate destinations:
// Document Router Node
// Routing table. A rule applies when every entry of `conditions` holds for the
// document (see evaluateCondition); the actions of all matching rules are executed.
// Condition keys are dot-separated paths into the document; values are either
// literals (strict equality) or operator objects such as { $gte: 10000 }.
const routingRules = [
  // --- Invoices ---
  {
    name: 'High-Value Invoices',
    conditions: {
      documentType: 'invoice',
      'invoiceData.totalAmount': { $gte: 10000 },
    },
    actions: [
      { type: 'notify', channel: 'slack', recipients: ['#finance-approvals'] },
      { type: 'assign', queue: 'manager-approval' },
      { type: 'flag', priority: 'high' },
    ],
  },
  {
    name: 'Standard Invoices',
    conditions: {
      documentType: 'invoice',
      'validation.isValid': true,
      'invoiceData.totalAmount': { $lt: 10000 },
    },
    actions: [
      { type: 'process', workflow: 'auto-payment' },
      { type: 'store', destination: 'accounting-system' },
    ],
  },
  {
    name: 'Invalid Invoices',
    conditions: {
      documentType: 'invoice',
      'validation.isValid': false,
    },
    actions: [
      { type: 'assign', queue: 'manual-review' },
      { type: 'notify', channel: 'email', recipients: ['ap@company.com'] },
    ],
  },

  // --- Contracts ---
  {
    name: 'High-Risk Contracts',
    conditions: {
      documentType: 'contract',
      'contractAnalysis.overallRiskLevel': 'high',
    },
    actions: [
      { type: 'assign', queue: 'legal-review' },
      { type: 'notify', channel: 'email', recipients: ['legal@company.com'] },
      { type: 'flag', priority: 'urgent' },
    ],
  },
  {
    name: 'Standard Contracts',
    conditions: {
      documentType: 'contract',
      'contractAnalysis.overallRiskLevel': { $in: ['low', 'medium'] },
    },
    actions: [
      { type: 'store', destination: 'contract-repository' },
      { type: 'createCalendarEvents' },
      { type: 'notify', channel: 'slack', recipients: ['#contracts'] },
    ],
  },

  // --- Application forms ---
  {
    name: 'Complete Applications',
    conditions: {
      documentType: 'application_form',
      'validation.isComplete': true,
    },
    actions: [
      { type: 'process', workflow: 'application-processing' },
      { type: 'notify', channel: 'email', template: 'application-received' },
    ],
  },
  {
    name: 'Incomplete Applications',
    conditions: {
      documentType: 'application_form',
      'validation.isComplete': false,
    },
    actions: [
      { type: 'assign', queue: 'follow-up' },
      { type: 'notify', channel: 'email', template: 'application-incomplete' },
    ],
  },
];
/**
 * Tests whether a document satisfies every entry of a rule's condition object.
 *
 * Keys are dot-separated paths resolved with getNestedValue(). A literal value
 * requires strict equality; an object value is read as a set of operators:
 * $gte, $lte, $lt, $gt, $in and $ne.
 *
 * @param {object} doc - Document to test.
 * @param {object} condition - Map of path to literal or operator object.
 * @returns {boolean} True when all entries hold (an empty condition always holds).
 */
function evaluateCondition(doc, condition) {
  const rangeOperators = ['$gte', '$lte', '$lt', '$gt'];

  for (const [path, expectedValue] of Object.entries(condition)) {
    const actualValue = getNestedValue(doc, path);

    if (typeof expectedValue !== 'object' || expectedValue === null) {
      if (actualValue !== expectedValue) return false;
      continue;
    }

    // A missing value has no order. Without this guard null is coerced to 0 and a
    // document lacking the field would satisfy "$lt: 10000".
    const usesRange = rangeOperators.some((operator) => operator in expectedValue);
    const isComparable =
      actualValue !== null && actualValue !== undefined && !Number.isNaN(actualValue);
    if (usesRange && !isComparable) return false;

    // Handle operators
    if ('$gte' in expectedValue && actualValue < expectedValue.$gte) return false;
    if ('$lte' in expectedValue && actualValue > expectedValue.$lte) return false;
    if ('$lt' in expectedValue && actualValue >= expectedValue.$lt) return false;
    if ('$gt' in expectedValue && actualValue <= expectedValue.$gt) return false;
    if ('$in' in expectedValue) {
      const allowed = expectedValue.$in;
      if (!Array.isArray(allowed) || !allowed.includes(actualValue)) return false;
    }
    if ('$ne' in expectedValue && actualValue === expectedValue.$ne) return false;
  }

  return true;
}
/**
 * Resolves a dot-separated path inside an object.
 *
 * @param {*} obj - Root value to start from.
 * @param {string} path - Path such as 'validation.isValid'.
 * @returns {*} The value at the path, or null when a step is falsy or the key is undefined.
 */
function getNestedValue(obj, path) {
  let cursor = obj;

  for (const segment of path.split('.')) {
    // Once the walk falls off the object graph the result is null for good.
    if (!cursor || cursor[segment] === undefined) {
      return null;
    }
    cursor = cursor[segment];
  }

  return cursor;
}
/**
 * Collects every routing rule whose conditions the document satisfies.
 *
 * @param {object} document - Processed document to route.
 * @returns {Array<object>} Matching entries of `routingRules`, in table order.
 */
function findMatchingRules(document) {
  return routingRules.filter((candidate) => evaluateCondition(document, candidate.conditions));
}
/**
 * Runs routing actions one after another and collects one result per action.
 *
 * A failing action does not stop the run: its error is recorded as
 * `{ type, success: false, error }` and the next action is executed. Action types
 * without a handler are recorded the same way instead of being dropped silently.
 *
 * @param {object} document - Document the actions apply to.
 * @param {Array<{type: string}>} actions - Actions to run, in order.
 * @returns {Promise<Array<object>>} One result object per action, in the same order.
 */
async function executeActions(document, actions) {
  const handlers = new Map([
    ['notify', (action) => sendNotification(document, action)],
    ['assign', (action) => assignToQueue(document, action.queue)],
    ['process', (action) => triggerWorkflow(document, action.workflow)],
    ['store', (action) => storeDocument(document, action.destination)],
    ['flag', (action) => ({ type: 'flag', priority: action.priority, success: true })],
    ['createCalendarEvents', () => createCalendarEvents(document)],
  ]);

  const results = [];

  // Sequential on purpose: actions run in rule order, one at a time.
  for (const action of actions) {
    const handler = handlers.get(action.type);
    if (!handler) {
      results.push({
        type: action.type,
        success: false,
        error: `Unknown action type: ${action.type}`,
      });
      continue;
    }

    try {
      results.push(await handler(action));
    } catch (error) {
      // Integrations may throw non-Error values; still record something readable.
      results.push({
        type: action.type,
        success: false,
        error: error?.message ?? String(error),
      });
    }
  }

  return results;
}
// Mock implementations - replace with actual integrations
/** Mock notifier: reports success for the action's channel without sending anything. */
async function sendNotification(doc, action) {
  const { channel } = action;
  return { type: 'notify', channel, success: true };
}

/** Mock queue assignment: echoes the queue name with a success flag. */
async function assignToQueue(doc, queue) {
  const outcome = { type: 'assign', queue, success: true };
  return outcome;
}

/** Mock workflow trigger: echoes the workflow name with a success flag. */
async function triggerWorkflow(doc, workflow) {
  const outcome = { type: 'process', workflow, success: true };
  return outcome;
}

/** Mock storage: echoes the destination with a success flag. */
async function storeDocument(doc, destination) {
  const outcome = { type: 'store', destination, success: true };
  return outcome;
}

/** Mock calendar integration: reports how many events the document carries (0 if none). */
async function createCalendarEvents(doc) {
  const eventsCreated = doc.calendarEvents?.length || 0;
  return { type: 'createCalendarEvents', eventsCreated, success: true };
}
// Main routing logic
const document = $input.first().json;
const matchedRules = findMatchingRules(document);
if (matchedRules.length === 0) {
// Default handling
return [{
json: {
document,
routing: {
rules: [],
actions: [{ type: 'assign', queue: 'unclassified' }],
actionResults: [{ type: 'assign', queue: 'unclassified', success: true }]
},
processedAt: new Date().toISOString()
}
}];
}
// Execute all matching rule actions
const allActions = matchedRules.flatMap(rule => rule.actions);
const actionResults = await executeActions(document, allActions);
return [{
json: {
document,
routing: {
rules: matchedRules.map(r => r.name),
actions: allActions,
actionResults
},
processedAt: new Date().toISOString()
}
}];OCR Enhancement with AI
Improve OCR accuracy with AI post-processing:
// OCR Enhancement Node
/**
 * Sends raw OCR text to the AI model for correction and measures how much it changed.
 *
 * Confidence is 100 minus the edit distance expressed as a percentage of the
 * original length (floored at 0). Empty or whitespace-only input is returned
 * unchanged with confidence 0 and no model call.
 *
 * @param {string} rawOcrText - Text produced by the OCR step.
 * @param {{type?: string}} [documentMetadata] - Optional metadata; `type` is included in the prompt.
 * @returns {Promise<{originalText: string, correctedText: string, confidence: number,
 *   wordCount: number, correctionsApplied: boolean, correctionRatio: string}>}
 */
async function enhanceOCROutput(rawOcrText, documentMetadata) {
  const sourceText = typeof rawOcrText === 'string' ? rawOcrText : '';

  // Nothing to correct. This also avoids dividing by a length of 0 below.
  if (sourceText.trim() === '') {
    return {
      originalText: sourceText,
      correctedText: sourceText,
      confidence: 0,
      wordCount: 0,
      correctionsApplied: false,
      correctionRatio: '0.00%',
    };
  }

  // AI-powered correction prompt
  const correctionPrompt = `
You are an OCR correction assistant. Fix common OCR errors in this text while preserving the original meaning and structure.
Common issues to fix:
- Character substitutions (0/O, 1/l/I, rn/m)
- Word boundary errors
- Special character misreads
- Number/letter confusion in context
- Maintain original formatting (line breaks, spacing)
Document type: ${documentMetadata?.type || 'unknown'}
Original OCR text:
${sourceText}
Return the corrected text only, no explanations.
`;

  const correctedText = await $ai.complete({
    prompt: correctionPrompt,
    maxTokens: sourceText.length * 2,
    temperature: 0.1,
  });

  // Count words on trimmed text so leading/trailing whitespace does not add a phantom word.
  const trimmedCorrection = correctedText.trim();
  const wordCount = trimmedCorrection === '' ? 0 : trimmedCorrection.split(/\s+/).length;

  // Calculate confidence based on changes.
  // NOTE: edit distance costs time proportional to length x length; very large
  // documents may need chunking before they reach this point.
  const changesDetected = calculateLevenshteinDistance(sourceText, correctedText);
  const changePercent = (changesDetected / sourceText.length) * 100;

  return {
    originalText: sourceText,
    correctedText,
    confidence: Math.round(Math.max(0, 100 - changePercent)),
    wordCount,
    correctionsApplied: changesDetected > 0,
    correctionRatio: changePercent.toFixed(2) + '%',
  };
}
/**
 * Levenshtein edit distance: the minimum number of single-character insertions,
 * deletions and substitutions needed to turn `str1` into `str2`.
 *
 * Keeps only two rows of the dynamic-programming table, so memory grows with
 * the length of `str2` instead of with the product of both lengths.
 *
 * @param {string} str1 - Source string.
 * @param {string} str2 - Target string.
 * @returns {number} Edit distance (0 when the strings are equal).
 */
function calculateLevenshteinDistance(str1, str2) {
  if (str1 === str2) return 0;

  const m = str1.length;
  const n = str2.length;
  if (m === 0) return n;
  if (n === 0) return m;

  // previous[j] = distance between the first (i - 1) chars of str1 and the first j chars of str2.
  let previous = Array.from({ length: n + 1 }, (_, j) => j);
  let current = new Array(n + 1).fill(0);

  for (let i = 1; i <= m; i++) {
    current[0] = i;

    for (let j = 1; j <= n; j++) {
      if (str1[i - 1] === str2[j - 1]) {
        current[j] = previous[j - 1];
      } else {
        current[j] =
          1 +
          Math.min(
            previous[j - 1], // substitution
            previous[j], // deletion
            current[j - 1] // insertion
          );
      }
    }

    // Reuse the two buffers instead of allocating a new row per character.
    [previous, current] = [current, previous];
  }

  return previous[n];
}
// Table extraction enhancement
/**
 * Asks the AI model to find tabular data in OCR text.
 *
 * @param {string} ocrText - OCR text embedded in the prompt.
 * @returns {Promise<{tables: Array<object>}>} The parsed reply. `tables` is always an
 *   array, even when the model omits it, so callers can read `tables.length` safely.
 * @throws {SyntaxError} When the reply contains no parseable JSON.
 */
async function extractTablesFromOCR(ocrText) {
  const tablePrompt = `
Analyze this OCR text and identify any tabular data. Extract tables as structured JSON.
For each table found, return:
{
"tables": [
{
"name": "descriptive name",
"headers": ["col1", "col2", ...],
"rows": [
["val1", "val2", ...],
...
]
}
]
}
If no tables are found, return {"tables": []}.
OCR Text:
${ocrText}
`;

  const response = await $ai.complete({
    prompt: tablePrompt,
    maxTokens: 3000,
    temperature: 0.1,
  });

  let parsed;
  try {
    parsed = JSON.parse(response);
  } catch (directError) {
    // Accept replies that wrap the JSON in prose or code fences.
    const embedded = typeof response === 'string' ? response.match(/\{[\s\S]*\}/) : null;
    if (!embedded) {
      throw new SyntaxError('Failed to parse AI response', { cause: directError });
    }
    try {
      parsed = JSON.parse(embedded[0]);
    } catch (embeddedError) {
      throw new SyntaxError('Failed to parse AI response', { cause: embeddedError });
    }
  }

  // Normalise the shape: the caller dereferences `tables.tables.length` unconditionally.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return { tables: [] };
  }
  return Array.isArray(parsed.tables) ? parsed : { ...parsed, tables: [] };
}
// Main enhancement workflow
const rawOcr = $input.first().json.rawOcrOutput;
const metadata = $input.first().json.metadata;
const enhanced = await enhanceOCROutput(rawOcr, metadata);
const tables = await extractTablesFromOCR(enhanced.correctedText);
return [{
json: {
...enhanced,
extractedTables: tables.tables,
processingMetadata: {
originalLength: rawOcr.length,
enhancedLength: enhanced.correctedText.length,
tablesFound: tables.tables.length,
processedAt: new Date().toISOString()
}
}
}];Conclusion
Document processing automation with n8n and AI transforms manual, error-prone workflows into efficient, accurate systems. By combining OCR, intelligent classification, AI-powered extraction, and smart routing, you can process thousands of documents with minimal human intervention. Start with high-volume document types like invoices, then expand to contracts and forms as you refine your extraction models. Remember to implement validation layers and human review queues for edge cases to maintain accuracy at scale.