Document automation eliminates manual file handling and enables scalable business processes. This guide covers building robust file processing workflows with n8n for various document types and cloud storage systems.
File Handling Fundamentals
Binary Data Processing
// Function Node: File Processing Utilities
const fileBuffer = $input.first().binary.data;
const fileName = $input.first().binary.fileName;
const mimeType = $input.first().binary.mimeType;
// Determine the processing route based on file type (the actual handlers live in downstream nodes)
const processors = {
'application/pdf': 'pdf',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'excel',
'text/csv': 'csv',
'image/jpeg': 'image',
'image/png': 'image',
'application/json': 'json'
};
const processor = processors[mimeType];
if (!processor) {
throw new Error(`Unsupported file type: ${mimeType}`);
}
// File metadata
const metadata = {
fileName,
mimeType,
size: Buffer.from(fileBuffer, 'base64').length,
processedAt: new Date().toISOString()
};
return [{
json: {
metadata,
processorType: processor
},
binary: {
data: fileBuffer
}
}];
File Validation and Security
// Function Node: Secure File Validation
const file = $input.first();
const buffer = Buffer.from(file.binary.data, 'base64');
// Configuration
const config = {
maxSizeBytes: 50 * 1024 * 1024, // 50MB
allowedMimeTypes: [
'application/pdf',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/csv',
'image/jpeg',
'image/png'
],
blockedExtensions: ['.exe', '.bat', '.cmd', '.sh', '.ps1', '.vbs']
};
// Validation checks
const validations = [];
// Size check
if (buffer.length > config.maxSizeBytes) {
validations.push({
check: 'size',
passed: false,
message: `File exceeds maximum size of ${config.maxSizeBytes / 1024 / 1024}MB`
});
} else {
validations.push({ check: 'size', passed: true });
}
// MIME type check
if (!config.allowedMimeTypes.includes(file.binary.mimeType)) {
validations.push({
check: 'mimeType',
passed: false,
message: `MIME type ${file.binary.mimeType} is not allowed`
});
} else {
validations.push({ check: 'mimeType', passed: true });
}
// Extension check
const fileName = file.binary.fileName || '';
const dotIndex = fileName.lastIndexOf('.');
const extension = dotIndex >= 0 ? fileName.substring(dotIndex).toLowerCase() : '';
if (config.blockedExtensions.includes(extension)) {
validations.push({
check: 'extension',
passed: false,
message: `File extension ${extension} is blocked`
});
} else {
validations.push({ check: 'extension', passed: true });
}
// Magic bytes check for common file types
const magicBytes = {
'application/pdf': [0x25, 0x50, 0x44, 0x46], // %PDF
'image/jpeg': [0xFF, 0xD8, 0xFF],
'image/png': [0x89, 0x50, 0x4E, 0x47]
};
const expectedMagic = magicBytes[file.binary.mimeType];
if (expectedMagic) {
const fileMagic = [...buffer.slice(0, expectedMagic.length)];
const magicMatch = expectedMagic.every((byte, i) => byte === fileMagic[i]);
if (!magicMatch) {
validations.push({
check: 'magicBytes',
passed: false,
message: 'File content does not match declared MIME type'
});
} else {
validations.push({ check: 'magicBytes', passed: true });
}
}
// Overall result
const allPassed = validations.every(v => v.passed);
return [{
json: {
valid: allPassed,
validations,
fileName: file.binary.fileName,
mimeType: file.binary.mimeType,
sizeBytes: buffer.length
},
binary: allPassed ? { data: file.binary.data } : undefined
}];
PDF Processing Workflows
PDF Text Extraction
// Function Node: PDF Text Extraction (using pdf-parse)
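// Note: external modules such as pdf-parse (and puppeteer, xlsx, papaparse, sharp used below)
// must be installed on the n8n host and allowed via the NODE_FUNCTION_ALLOW_EXTERNAL
// environment variable on self-hosted instances before require() will work in a Function node.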
const pdfParse = require('pdf-parse');
const pdfBuffer = Buffer.from($input.first().binary.data, 'base64');
const options = {
max: 0, // Parse all pages (0 = no limit)
pagerender: function(pageData) {
return pageData.getTextContent().then(function(textContent) {
let text = '';
let lastY = null;
for (const item of textContent.items) {
// Add newline when Y position changes significantly
if (lastY !== null && Math.abs(lastY - item.transform[5]) > 5) {
text += '\n';
}
text += item.str + ' ';
lastY = item.transform[5];
}
return text;
});
}
};
const data = await pdfParse(pdfBuffer, options);
return [{
json: {
text: data.text,
numPages: data.numpages,
info: {
title: data.info?.Title,
author: data.info?.Author,
creator: data.info?.Creator,
creationDate: data.info?.CreationDate
},
metadata: data.metadata
}
}];
PDF Generation from Template
// Function Node: Generate PDF from HTML Template
const puppeteer = require('puppeteer');
const templateData = $input.first().json;
// HTML template with data binding
const htmlTemplate = `
<!DOCTYPE html>
<html>
<head>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
.header { border-bottom: 2px solid #333; padding-bottom: 20px; margin-bottom: 30px; }
.logo { max-height: 60px; }
h1 { color: #2563eb; }
table { width: 100%; border-collapse: collapse; margin: 20px 0; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background: #f5f5f5; }
.total { font-size: 18px; font-weight: bold; text-align: right; }
.footer { margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd; font-size: 12px; color: #666; }
</style>
</head>
<body>
<div class="header">
<h1>Invoice #${templateData.invoiceNumber}</h1>
<p>Date: ${templateData.date}</p>
<p>Due: ${templateData.dueDate}</p>
</div>
<div class="customer">
<h3>Bill To:</h3>
<p>${templateData.customer.name}</p>
<p>${templateData.customer.address}</p>
<p>${templateData.customer.email}</p>
</div>
<table>
<thead>
<tr>
<th>Description</th>
<th>Quantity</th>
<th>Unit Price</th>
<th>Total</th>
</tr>
</thead>
<tbody>
${templateData.items.map(item => `
<tr>
<td>${item.description}</td>
<td>${item.quantity}</td>
<td>$${item.unitPrice.toFixed(2)}</td>
<td>$${(item.quantity * item.unitPrice).toFixed(2)}</td>
</tr>
`).join('')}
</tbody>
</table>
<div class="total">
<p>Subtotal: $${templateData.subtotal.toFixed(2)}</p>
<p>Tax (${templateData.taxRate}%): $${templateData.tax.toFixed(2)}</p>
<p>Total: $${templateData.total.toFixed(2)}</p>
</div>
<div class="footer">
<p>Thank you for your business!</p>
<p>Payment terms: Net 30</p>
</div>
</body>
</html>
`;
// Generate PDF
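// In containerized n8n deployments Puppeteer often needs extra launch flags,
// e.g. args: ['--no-sandbox', '--disable-setuid-sandbox'] — adjust to your environment.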
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setContent(htmlTemplate);
const pdfBuffer = await page.pdf({
format: 'A4',
margin: { top: '20mm', right: '20mm', bottom: '20mm', left: '20mm' },
printBackground: true
});
await browser.close();
return [{
json: {
fileName: `invoice-${templateData.invoiceNumber}.pdf`,
generated: new Date().toISOString()
},
binary: {
data: pdfBuffer.toString('base64'),
mimeType: 'application/pdf',
fileName: `invoice-${templateData.invoiceNumber}.pdf`
}
}];
Spreadsheet Processing
Excel File Processing
// Function Node: Excel Processing with XLSX
const XLSX = require('xlsx');
const excelBuffer = Buffer.from($input.first().binary.data, 'base64');
const workbook = XLSX.read(excelBuffer, { type: 'buffer' });
// Get all sheet names
const sheetNames = workbook.SheetNames;
// Process each sheet
const sheets = {};
for (const sheetName of sheetNames) {
const worksheet = workbook.Sheets[sheetName];
// Convert to JSON
const jsonData = XLSX.utils.sheet_to_json(worksheet, {
header: 1, // Return each row as an array of cell values; the first row is used as headers below
defval: null, // Default value for empty cells
blankrows: false
});
// Get headers (first row)
const headers = jsonData[0] || [];
// Convert to array of objects
const rows = jsonData.slice(1).map(row => {
const obj = {};
headers.forEach((header, index) => {
if (header) {
obj[header] = row[index];
}
});
return obj;
});
sheets[sheetName] = {
headers,
rows,
rowCount: rows.length
};
}
return [{
json: {
fileName: $input.first().binary.fileName,
sheetNames,
sheets,
totalSheets: sheetNames.length
}
}];
Create Excel from Data
// Function Node: Generate Excel Report
const XLSX = require('xlsx');
const reportData = $input.first().json;
// Create workbook
const workbook = XLSX.utils.book_new();
// Create summary sheet
const summaryData = [
['Report Generated', new Date().toISOString()],
['Period', reportData.period],
['Total Records', reportData.totalRecords],
[],
['Metric', 'Value'],
...Object.entries(reportData.metrics)
];
const summarySheet = XLSX.utils.aoa_to_sheet(summaryData);
XLSX.utils.book_append_sheet(workbook, summarySheet, 'Summary');
// Create data sheet
const dataSheet = XLSX.utils.json_to_sheet(reportData.records);
// Apply column widths
const colWidths = Object.keys(reportData.records[0] || {}).map(key => ({
wch: Math.max(key.length, 15)
}));
dataSheet['!cols'] = colWidths;
XLSX.utils.book_append_sheet(workbook, dataSheet, 'Data');
// Generate buffer
const excelBuffer = XLSX.write(workbook, {
type: 'buffer',
bookType: 'xlsx'
});
return [{
json: {
fileName: `report-${reportData.period}.xlsx`,
sheets: ['Summary', 'Data']
},
binary: {
data: excelBuffer.toString('base64'),
mimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
fileName: `report-${reportData.period}.xlsx`
}
}];
CSV Processing
// Function Node: CSV Parser with Validation
const Papa = require('papaparse');
const csvContent = Buffer.from(
$input.first().binary.data, 'base64'
).toString('utf8');
// Parse configuration
const config = {
header: true,
skipEmptyLines: true,
transformHeader: (header) => header.trim().toLowerCase().replace(/\s+/g, '_'),
transform: (value) => value.trim()
};
const result = Papa.parse(csvContent, config);
// Validation
const errors = [];
const validRows = [];
// Define expected schema
const schema = {
required: ['email', 'name'],
email: {
pattern: /^[^\s@]+@[^\s@]+\.[^\s@]+$/,
message: 'Invalid email format'
},
phone: {
pattern: /^\+?[\d\s-()]{10,}$/,
message: 'Invalid phone format'
}
};
result.data.forEach((row, index) => {
const rowErrors = [];
// Check required fields
for (const field of schema.required) {
if (!row[field]) {
rowErrors.push(`Missing required field: ${field}`);
}
}
// Validate patterns
for (const [field, rules] of Object.entries(schema)) {
if (field === 'required') continue;
if (row[field] && rules.pattern && !rules.pattern.test(row[field])) {
rowErrors.push(`${field}: ${rules.message}`);
}
}
if (rowErrors.length > 0) {
errors.push({ row: index + 2, errors: rowErrors }); // +2 for header and 0-index
} else {
validRows.push(row);
}
});
return [{
json: {
totalRows: result.data.length,
validRows: validRows.length,
errorRows: errors.length,
errors: errors.slice(0, 100), // Limit error reporting
headers: result.meta.fields,
data: validRows
}
}];
Cloud Storage Integration
Multi-Cloud File Sync
// Function Node: Cloud Storage Router
const file = $input.first();
const config = $input.first().json.config;
// Determine target storage based on file type and size
function determineStorage(file, config) {
const size = Buffer.from(file.binary.data, 'base64').length;
const mimeType = file.binary.mimeType;
// Large files go to S3
if (size > 100 * 1024 * 1024) { // 100MB
return 's3';
}
// Images go to Cloudinary for CDN
if (mimeType.startsWith('image/')) {
return 'cloudinary';
}
// Documents go to Google Drive for collaboration
if (mimeType.includes('document') || mimeType.includes('spreadsheet')) {
return 'google_drive';
}
// Default to S3
return 's3';
}
const targetStorage = determineStorage(file, config);
// Generate storage path
const timestamp = Date.now();
const sanitizedName = file.binary.fileName
.replace(/[^a-zA-Z0-9.-]/g, '_')
.toLowerCase();
const storagePath = `uploads/${new Date().toISOString().slice(0, 7)}/${timestamp}-${sanitizedName}`;
return [{
json: {
targetStorage,
storagePath,
fileName: file.binary.fileName,
mimeType: file.binary.mimeType,
size: Buffer.from(file.binary.data, 'base64').length
},
binary: {
data: file.binary.data
}
}];
S3 Upload with Presigned URL
// Function Node: Generate S3 Presigned URL
const AWS = require('aws-sdk');
const s3 = new AWS.S3({
accessKeyId: $env.AWS_ACCESS_KEY_ID,
secretAccessKey: $env.AWS_SECRET_ACCESS_KEY,
region: $env.AWS_REGION
});
const fileInfo = $input.first().json;
// Generate presigned URL for upload
const uploadParams = {
Bucket: $env.S3_BUCKET,
Key: fileInfo.storagePath,
ContentType: fileInfo.mimeType,
Expires: 3600 // 1 hour
};
const uploadUrl = s3.getSignedUrl('putObject', uploadParams);
// Generate presigned URL for download
const downloadParams = {
Bucket: $env.S3_BUCKET,
Key: fileInfo.storagePath,
Expires: 86400 // 24 hours
};
const downloadUrl = s3.getSignedUrl('getObject', downloadParams);
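// The upload URL can then be consumed by any HTTP client (e.g. a later HTTP Request node
// or an external service) with a plain PUT — hypothetical client-side sketch:
//   await fetch(uploadUrl, {
//     method: 'PUT',
//     headers: { 'Content-Type': fileInfo.mimeType },
//     body: fileBuffer
//   });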
return [{
json: {
uploadUrl,
downloadUrl,
bucket: $env.S3_BUCKET,
key: fileInfo.storagePath,
expiresIn: {
upload: 3600,
download: 86400
}
}
}];
Document Conversion Workflows
Format Conversion Pipeline
// Function Node: Document Conversion Router
const file = $input.first();
const targetFormat = $input.first().json.targetFormat;
const conversionMap = {
// Source format -> supported targets
'application/pdf': ['image/png', 'text/plain', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['application/pdf', 'text/plain', 'text/html'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['text/csv', 'application/pdf', 'application/json'],
'text/csv': ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/json'],
'image/jpeg': ['image/png', 'image/webp', 'application/pdf'],
'image/png': ['image/jpeg', 'image/webp', 'application/pdf']
};
const sourceMime = file.binary.mimeType;
const supportedTargets = conversionMap[sourceMime] || [];
if (!supportedTargets.includes(targetFormat)) {
throw new Error(
`Cannot convert ${sourceMime} to ${targetFormat}. Supported: ${supportedTargets.join(', ')}`
);
}
// Determine conversion method
let conversionMethod;
if (sourceMime.startsWith('image/') && targetFormat.startsWith('image/')) {
conversionMethod = 'image_transform';
} else if (targetFormat === 'application/pdf') {
conversionMethod = 'to_pdf';
} else if (sourceMime === 'application/pdf') {
conversionMethod = 'from_pdf';
} else {
conversionMethod = 'document_convert';
}
return [{
json: {
sourceFormat: sourceMime,
targetFormat,
conversionMethod,
fileName: file.binary.fileName
},
binary: {
data: file.binary.data
}
}];
Image Processing
// Function Node: Image Processing with Sharp
const sharp = require('sharp');
const imageBuffer = Buffer.from($input.first().binary.data, 'base64');
const options = $input.first().json.options || {};
// Default options
const config = {
width: options.width || null,
height: options.height || null,
format: options.format || 'jpeg',
quality: options.quality || 80,
fit: options.fit || 'inside' // cover, contain, fill, inside, outside
};
let processor = sharp(imageBuffer);
// Resize if dimensions specified
if (config.width || config.height) {
processor = processor.resize({
width: config.width,
height: config.height,
fit: config.fit,
withoutEnlargement: true
});
}
// Format conversion
switch (config.format) {
case 'jpeg':
case 'jpg':
processor = processor.jpeg({ quality: config.quality });
break;
case 'png':
processor = processor.png({ compressionLevel: 9 });
break;
case 'webp':
processor = processor.webp({ quality: config.quality });
break;
case 'avif':
processor = processor.avif({ quality: config.quality });
break;
}
const outputBuffer = await processor.toBuffer();
const metadata = await sharp(outputBuffer).metadata();
// Generate new filename
const originalName = $input.first().binary.fileName;
const dotIndex = originalName.lastIndexOf('.');
const baseName = dotIndex > 0 ? originalName.substring(0, dotIndex) : originalName;
const newFileName = `${baseName}.${config.format}`;
return [{
json: {
originalSize: imageBuffer.length,
newSize: outputBuffer.length,
compressionRatio: ((1 - outputBuffer.length / imageBuffer.length) * 100).toFixed(1) + '%',
dimensions: {
width: metadata.width,
height: metadata.height
},
format: metadata.format
},
binary: {
data: outputBuffer.toString('base64'),
mimeType: `image/${config.format}`,
fileName: newFileName
}
}];
Batch Processing Workflows
Batch File Processor
// Function Node: Batch Processing Coordinator
const files = $input.all();
const batchConfig = {
maxConcurrent: 5,
retryAttempts: 3,
timeoutMs: 30000
};
// Group files by type for efficient processing
const fileGroups = {};
for (const file of files) {
const type = file.binary?.mimeType || 'unknown';
if (!fileGroups[type]) {
fileGroups[type] = [];
}
fileGroups[type].push(file);
}
// Create batch jobs
const batchJobs = [];
let jobId = 0;
for (const [type, groupFiles] of Object.entries(fileGroups)) {
// Split into chunks for concurrent processing
for (let i = 0; i < groupFiles.length; i += batchConfig.maxConcurrent) {
const chunk = groupFiles.slice(i, i + batchConfig.maxConcurrent);
batchJobs.push({
jobId: ++jobId,
fileType: type,
files: chunk.map(f => ({
fileName: f.binary?.fileName,
size: f.binary?.data ? Buffer.from(f.binary.data, 'base64').length : 0
})),
fileCount: chunk.length
});
}
}
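// Each job is emitted as its own n8n item below, so a downstream Split In Batches /
// Loop Over Items node (or a sub-workflow call per job) can process the chunks independently.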
return batchJobs.map(job => ({
json: {
...job,
config: batchConfig,
timestamp: new Date().toISOString()
}
}));
Error Handling and Logging
Comprehensive File Processing Logger
// Function Node: File Processing Logger
const result = $input.first().json;
const startTime = $input.first().json.startTime || null;
const logEntry = {
timestamp: new Date().toISOString(),
processingTime: startTime ? Date.now() - startTime : null,
operation: result.operation,
status: result.success ? 'success' : 'failed',
input: {
fileName: result.inputFile,
mimeType: result.inputMimeType,
size: result.inputSize
},
output: result.success ? {
fileName: result.outputFile,
mimeType: result.outputMimeType,
size: result.outputSize
} : null,
error: result.error || null,
metadata: {
workflowId: $workflow.id,
executionId: $execution.id,
nodeId: $node.id
}
};
// Determine log level
let level = 'info';
if (result.error) {
level = result.error.includes('timeout') ? 'warn' : 'error';
}
return [{
json: {
...logEntry,
level
}
}];
Best Practices Summary
- Always validate file types using magic bytes, not just extensions
- Implement size limits to prevent resource exhaustion
- Use streaming for large files to minimize memory usage (see the sketch after this list)
- Handle errors gracefully with proper logging
- Sanitize file names before storage
- Use presigned URLs for secure file access
- Batch process when handling multiple files
- Monitor processing times to identify bottlenecks
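A minimal sketch of the streaming recommendation, assuming a self-hosted instance where the large file is already on disk (for example after an SFTP download or Write Binary File step) and where the aws-sdk and fs modules are allowed; the localPath and storagePath input fields are assumptions about the incoming item, not part of the workflows above:
// Function Node: Stream a large file from disk to S3 (sketch — input field names are assumptions)
const fs = require('fs');
const AWS = require('aws-sdk');
const s3 = new AWS.S3({
accessKeyId: $env.AWS_ACCESS_KEY_ID,
secretAccessKey: $env.AWS_SECRET_ACCESS_KEY,
region: $env.AWS_REGION
});
const { localPath, storagePath } = $input.first().json;
// s3.upload() accepts a readable stream as Body and performs a managed multipart upload,
// so memory usage stays roughly constant regardless of file size
const result = await s3.upload({
Bucket: $env.S3_BUCKET,
Key: storagePath,
Body: fs.createReadStream(localPath)
}).promise();
return [{
json: {
uploaded: true,
key: result.Key,
location: result.Location
}
}];
The same idea applies in the other direction: prefer piping streams between source and destination over materializing entire files in a Buffer whenever the nodes involved allow it.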
Document automation with n8n enables powerful file processing pipelines while maintaining security and scalability. Combine these patterns based on your specific use case requirements.