Document automation eliminates manual file handling and enables scalable business processes. This guide covers building robust file processing workflows with n8n for various document types and cloud storage systems.
File Handling Fundamentals
Binary Data Processing
// Function Node: File Processing Utilities
const fileBuffer = $input.first().binary.data;
const fileName = $input.first().binary.fileName;
const mimeType = $input.first().binary.mimeType;
// Determine the processing route based on file type (the actual handlers live in downstream nodes)
const processors = {
'application/pdf': 'pdf',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'excel',
'text/csv': 'csv',
'image/jpeg': 'image',
'image/png': 'image',
'application/json': 'json'
};
const processor = processors[mimeType];
if (!processor) {
throw new Error(`Unsupported file type: ${mimeType}`);
}
// File metadata
const metadata = {
fileName,
mimeType,
size: Buffer.from(fileBuffer, 'base64').length,
processedAt: new Date().toISOString()
};
return [{
json: {
metadata,
processorType: processor
},
binary: {
data: fileBuffer
}
}];
File Validation and Security
// Function Node: Secure File Validation
const file = $input.first();
const buffer = Buffer.from(file.binary.data, 'base64');
// Configuration
const config = {
maxSizeBytes: 50 * 1024 * 1024, // 50MB
allowedMimeTypes: [
'application/pdf',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/csv',
'image/jpeg',
'image/png'
],
blockedExtensions: ['.exe', '.bat', '.cmd', '.sh', '.ps1', '.vbs']
};
// Validation checks
const validations = [];
// Size check
if (buffer.length > config.maxSizeBytes) {
validations.push({
check: 'size',
passed: false,
message: `File exceeds maximum size of ${config.maxSizeBytes / 1024 / 1024}MB`
});
} else {
validations.push({ check: 'size', passed: true });
}
// MIME type check
if (!config.allowedMimeTypes.includes(file.binary.mimeType)) {
validations.push({
check: 'mimeType',
passed: false,
message: `MIME type ${file.binary.mimeType} is not allowed`
});
} else {
validations.push({ check: 'mimeType', passed: true });
}
// Extension check
const fileName = file.binary.fileName || '';
const dotIndex = fileName.lastIndexOf('.');
const extension = dotIndex >= 0 ? fileName.substring(dotIndex).toLowerCase() : '';
if (config.blockedExtensions.includes(extension)) {
validations.push({
check: 'extension',
passed: false,
message: `File extension ${extension} is blocked`
});
} else {
validations.push({ check: 'extension', passed: true });
}
// Magic bytes check for common file types
const magicBytes = {
'application/pdf': [0x25, 0x50, 0x44, 0x46], // %PDF
'image/jpeg': [0xFF, 0xD8, 0xFF],
'image/png': [0x89, 0x50, 0x4E, 0x47]
};
const expectedMagic = magicBytes[file.binary.mimeType];
if (expectedMagic) {
const fileMagic = [...buffer.slice(0, expectedMagic.length)];
const magicMatch = expectedMagic.every((byte, i) => byte === fileMagic[i]);
if (!magicMatch) {
validations.push({
check: 'magicBytes',
passed: false,
message: 'File content does not match declared MIME type'
});
} else {
validations.push({ check: 'magicBytes', passed: true });
}
}
// Overall result
const allPassed = validations.every(v => v.passed);
return [{
json: {
valid: allPassed,
validations,
fileName: file.binary.fileName,
mimeType: file.binary.mimeType,
sizeBytes: buffer.length
},
binary: allPassed ? { data: file.binary.data } : undefined
}];
PDF Processing Workflows
PDF Text Extraction
// Function Node: PDF Text Extraction (using pdf-parse)
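// Note: external modules such as pdf-parse (and puppeteer, xlsx, papaparse, sharp used below)
// must be installed on the n8n host and allowed via the NODE_FUNCTION_ALLOW_EXTERNAL
// environment variable on self-hosted instances before require() will work in a Function node.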
const pdfParse = require('pdf-parse');
const pdfBuffer = Buffer.from($input.first().binary.data, 'base64');
const options = {
max: 0, // Parse all pages (0 = no limit)
pagerender: function(pageData) {
return pageData.getTextContent().then(function(textContent) {
let text = '';
let lastY = null;
for (const item of textContent.items) {
// Add newline when Y position changes significantly
if (lastY !== null && Math.abs(lastY - item.transform[5]) > 5) {
text += '\n';
}
text += item.str + ' ';
lastY = item.transform[5];
}
return text;
});
}
};
const data = await pdfParse(pdfBuffer, options);
return [{
json: {
text: data.text,
numPages: data.numpages,
info: {
title: data.info?.Title,
author: data.info?.Author,
creator: data.info?.Creator,
creationDate: data.info?.CreationDate
},
metadata: data.metadata
}
}];
PDF Generation from Template
// Function Node: Generate PDF from HTML Template
const puppeteer = require('puppeteer');
const templateData = $input.first().json;
// HTML template with data binding
const htmlTemplate = `
<!DOCTYPE html>
<html>
<head>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
.header { border-bottom: 2px solid #333; padding-bottom: 20px; margin-bottom: 30px; }
.logo { max-height: 60px; }
h1 { color: #2563eb; }
table { width: 100%; border-collapse: collapse; margin: 20px 0; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background: #f5f5f5; }
.total { font-size: 18px; font-weight: bold; text-align: right; }
.footer { margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd; font-size: 12px; color: #666; }
</style>
</head>
<body>
<div class="header">
<h1>Invoice #${templateData.invoiceNumber}</h1>
<p>Date: ${templateData.date}</p>
<p>Due: ${templateData.dueDate}</p>
</div>
<div class="customer">
<h3>Bill To:</h3>
<p>${templateData.customer.name}</p>
<p>${templateData.customer.address}</p>
<p>${templateData.customer.email}</p>
</div>
<table>
<thead>
<tr>
<th>Description</th>
<th>Quantity</th>
<th>Unit Price</th>
<th>Total</th>
</tr>
</thead>
<tbody>
${templateData.items.map(item => `
<tr>
<td>${item.description}</td>
<td>${item.quantity}</td>
<td>$${item.unitPrice.toFixed(2)}</td>
<td>$${(item.quantity * item.unitPrice).toFixed(2)}</td>
</tr>
`).join('')}
</tbody>
</table>
<div class="total">
<p>Subtotal: $${templateData.subtotal.toFixed(2)}</p>
<p>Tax (${templateData.taxRate}%): $${templateData.tax.toFixed(2)}</p>
<p>Total: $${templateData.total.toFixed(2)}</p>
</div>
<div class="footer">
<p>Thank you for your business!</p>
<p>Payment terms: Net 30</p>
</div>
</body>
</html>
`;
// Generate PDF
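// In containerized n8n deployments Puppeteer often needs extra launch flags,
// e.g. args: ['--no-sandbox', '--disable-setuid-sandbox'] — adjust to your environment.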
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setContent(htmlTemplate);
const pdfBuffer = await page.pdf({
format: 'A4',
margin: { top: '20mm', right: '20mm', bottom: '20mm', left: '20mm' },
printBackground: true
});
await browser.close();
return [{
json: {
fileName: `invoice-${templateData.invoiceNumber}.pdf`,
generated: new Date().toISOString()
},
binary: {
data: pdfBuffer.toString('base64'),
mimeType: 'application/pdf',
fileName: `invoice-${templateData.invoiceNumber}.pdf`
}
}];
Spreadsheet Processing
Excel File Processing
// Function Node: Excel Processing with XLSX
const XLSX = require('xlsx');
const excelBuffer = Buffer.from($input.first().binary.data, 'base64');
const workbook = XLSX.read(excelBuffer, { type: 'buffer' });
// Get all sheet names
const sheetNames = workbook.SheetNames;
// Process each sheet
const sheets = {};
for (const sheetName of sheetNames) {
const worksheet = workbook.Sheets[sheetName];
// Convert to JSON
const jsonData = XLSX.utils.sheet_to_json(worksheet, {
header: 1, // Return each row as an array of cell values; the first row is used as headers below
defval: null, // Default value for empty cells
blankrows: false
});
// Get headers (first row)
const headers = jsonData[0] || [];
// Convert to array of objects
const rows = jsonData.slice(1).map(row => {
const obj = {};
headers.forEach((header, index) => {
if (header) {
obj[header] = row[index];
}
});
return obj;
});
sheets[sheetName] = {
headers,
rows,
rowCount: rows.length
};
}
return [{
json: {
fileName: $input.first().binary.fileName,
sheetNames,
sheets,
totalSheets: sheetNames.length
}
}];
Create Excel from Data
// Function Node: Generate Excel Report
const XLSX = require('xlsx');
const reportData = $input.first().json;
// Create workbook
const workbook = XLSX.utils.book_new();
// Create summary sheet
const summaryData = [
['Report Generated', new Date().toISOString()],
['Period', reportData.period],
['Total Records', reportData.totalRecords],
[],
['Metric', 'Value'],
...Object.entries(reportData.metrics)
];
const summarySheet = XLSX.utils.aoa_to_sheet(summaryData);
XLSX.utils.book_append_sheet(workbook, summarySheet, 'Summary');
// Create data sheet
const dataSheet = XLSX.utils.json_to_sheet(reportData.records);
// Apply column widths
const colWidths = Object.keys(reportData.records[0] || {}).map(key => ({
wch: Math.max(key.length, 15)
}));
dataSheet['!cols'] = colWidths;
XLSX.utils.book_append_sheet(workbook, dataSheet, 'Data');
// Generate buffer
const excelBuffer = XLSX.write(workbook, {
type: 'buffer',
bookType: 'xlsx'
});
return [{
json: {
fileName: `report-${reportData.period}.xlsx`,
sheets: ['Summary', 'Data']
},
binary: {
data: excelBuffer.toString('base64'),
mimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
fileName: `report-${reportData.period}.xlsx`
}
}];
CSV Processing
// Function Node: CSV Parser with Validation
const Papa = require('papaparse');
const csvContent = Buffer.from(
$input.first().binary.data, 'base64'
).toString('utf8');
// Parse configuration
const config = {
header: true,
skipEmptyLines: true,
transformHeader: (header) => header.trim().toLowerCase().replace(/\s+/g, '_'),
transform: (value) => value.trim()
};
const result = Papa.parse(csvContent, config);
// Validation
const errors = [];
const validRows = [];
// Define expected schema
const schema = {
required: ['email', 'name'],
email: {
pattern: /^[^\s@]+@[^\s@]+\.[^\s@]+$/,
message: 'Invalid email format'
},
phone: {
pattern: /^\+?[\d\s-()]{10,}$/,
message: 'Invalid phone format'
}
};
result.data.forEach((row, index) => {
const rowErrors = [];
// Check required fields
for (const field of schema.required) {
if (!row[field]) {
rowErrors.push(`Missing required field: ${field}`);
}
}
// Validate patterns
for (const [field, rules] of Object.entries(schema)) {
if (field === 'required') continue;
if (row[field] && rules.pattern && !rules.pattern.test(row[field])) {
rowErrors.push(`${field}: ${rules.message}`);
}
}
if (rowErrors.length > 0) {
errors.push({ row: index + 2, errors: rowErrors }); // +2 for header and 0-index
} else {
validRows.push(row);
}
});
return [{
json: {
totalRows: result.data.length,
validRows: validRows.length,
errorRows: errors.length,
errors: errors.slice(0, 100), // Limit error reporting
headers: result.meta.fields,
data: validRows
}
}];
Cloud Storage Integration
Multi-Cloud File Sync
// Function Node: Cloud Storage Router
const file = $input.first();
const config = $input.first().json.config;
// Determine target storage based on file type and size
function determineStorage(file, config) {
const size = Buffer.from(file.binary.data, 'base64').length;
const mimeType = file.binary.mimeType;
// Large files go to S3
if (size > 100 * 1024 * 1024) { // 100MB
return 's3';
}
// Images go to Cloudinary for CDN
if (mimeType.startsWith('image/')) {
return 'cloudinary';
}
// Documents go to Google Drive for collaboration
if (mimeType.includes('document') || mimeType.includes('spreadsheet')) {
return 'google_drive';
}
// Default to S3
return 's3';
}
const targetStorage = determineStorage(file, config);
// Generate storage path
const timestamp = Date.now();
const sanitizedName = file.binary.fileName
.replace(/[^a-zA-Z0-9.-]/g, '_')
.toLowerCase();
const storagePath = `uploads/${new Date().toISOString().slice(0, 7)}/${timestamp}-${sanitizedName}`;
return [{
json: {
targetStorage,
storagePath,
fileName: file.binary.fileName,
mimeType: file.binary.mimeType,
size: Buffer.from(file.binary.data, 'base64').length
},
binary: {
data: file.binary.data
}
}];
S3 Upload with Presigned URL
// Function Node: Generate S3 Presigned URL
const AWS = require('aws-sdk');
const s3 = new AWS.S3({
accessKeyId: $env.AWS_ACCESS_KEY_ID,
secretAccessKey: $env.AWS_SECRET_ACCESS_KEY,
region: $env.AWS_REGION
});
const fileInfo = $input.first().json;
// Generate presigned URL for upload
const uploadParams = {
Bucket: $env.S3_BUCKET,
Key: fileInfo.storagePath,
ContentType: fileInfo.mimeType,
Expires: 3600 // 1 hour
};
const uploadUrl = s3.getSignedUrl('putObject', uploadParams);
// Generate presigned URL for download
const downloadParams = {
Bucket: $env.S3_BUCKET,
Key: fileInfo.storagePath,
Expires: 86400 // 24 hours
};
const downloadUrl = s3.getSignedUrl('getObject', downloadParams);
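// The upload URL can then be consumed by any HTTP client (e.g. a later HTTP Request node
// or an external service) with a plain PUT — hypothetical client-side sketch:
//   await fetch(uploadUrl, {
//     method: 'PUT',
//     headers: { 'Content-Type': fileInfo.mimeType },
//     body: fileBuffer
//   });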
return [{
json: {
uploadUrl,
downloadUrl,
bucket: $env.S3_BUCKET,
key: fileInfo.storagePath,
expiresIn: {
upload: 3600,
download: 86400
}
}
}];
Document Conversion Workflows
Format Conversion Pipeline
// Function Node: Document Conversion Router
const file = $input.first();
const targetFormat = $input.first().json.targetFormat;
const conversionMap = {
// Source format -> supported targets
'application/pdf': ['image/png', 'text/plain', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['application/pdf', 'text/plain', 'text/html'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['text/csv', 'application/pdf', 'application/json'],
'text/csv': ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/json'],
'image/jpeg': ['image/png', 'image/webp', 'application/pdf'],
'image/png': ['image/jpeg', 'image/webp', 'application/pdf']
};
const sourceMime = file.binary.mimeType;
const supportedTargets = conversionMap[sourceMime] || [];
if (!supportedTargets.includes(targetFormat)) {
throw new Error(
`Cannot convert ${sourceMime} to ${targetFormat}. Supported: ${supportedTargets.join(', ')}`
);
}
// Determine conversion method
let conversionMethod;
if (sourceMime.startsWith('image/') && targetFormat.startsWith('image/')) {
conversionMethod = 'image_transform';
} else if (targetFormat === 'application/pdf') {
conversionMethod = 'to_pdf';
} else if (sourceMime === 'application/pdf') {
conversionMethod = 'from_pdf';
} else {
conversionMethod = 'document_convert';
}
return [{
json: {
sourceFormat: sourceMime,
targetFormat,
conversionMethod,
fileName: file.binary.fileName
},
binary: {
data: file.binary.data
}
}];
Image Processing
// Function Node: Image Processing with Sharp
const sharp = require('sharp');
const imageBuffer = Buffer.from($input.first().binary.data, 'base64');
const options = $input.first().json.options || {};
// Default options
const config = {
width: options.width || null,
height: options.height || null,
format: options.format || 'jpeg',
quality: options.quality || 80,
fit: options.fit || 'inside' // cover, contain, fill, inside, outside
};
let processor = sharp(imageBuffer);
// Resize if dimensions specified
if (config.width || config.height) {
processor = processor.resize({
width: config.width,
height: config.height,
fit: config.fit,
withoutEnlargement: true
});
}
// Format conversion
switch (config.format) {
case 'jpeg':
case 'jpg':
processor = processor.jpeg({ quality: config.quality });
break;
case 'png':
processor = processor.png({ compressionLevel: 9 });
break;
case 'webp':
processor = processor.webp({ quality: config.quality });
break;
case 'avif':
processor = processor.avif({ quality: config.quality });
break;
}
const outputBuffer = await processor.toBuffer();
const metadata = await sharp(outputBuffer).metadata();
// Generate new filename
const originalName = $input.first().binary.fileName;
const dotIndex = originalName.lastIndexOf('.');
const baseName = dotIndex > 0 ? originalName.substring(0, dotIndex) : originalName;
const newFileName = `${baseName}.${config.format}`;
return [{
json: {
originalSize: imageBuffer.length,
newSize: outputBuffer.length,
compressionRatio: ((1 - outputBuffer.length / imageBuffer.length) * 100).toFixed(1) + '%',
dimensions: {
width: metadata.width,
height: metadata.height
},
format: metadata.format
},
binary: {
data: outputBuffer.toString('base64'),
mimeType: `image/${config.format}`,
fileName: newFileName
}
}];
Batch Processing Workflows
Batch File Processor
// Function Node: Batch Processing Coordinator
const files = $input.all();
const batchConfig = {
maxConcurrent: 5,
retryAttempts: 3,
timeoutMs: 30000
};
// Group files by type for efficient processing
const fileGroups = {};
for (const file of files) {
const type = file.binary?.mimeType || 'unknown';
if (!fileGroups[type]) {
fileGroups[type] = [];
}
fileGroups[type].push(file);
}
// Create batch jobs
const batchJobs = [];
let jobId = 0;
for (const [type, groupFiles] of Object.entries(fileGroups)) {
// Split into chunks for concurrent processing
for (let i = 0; i < groupFiles.length; i += batchConfig.maxConcurrent) {
const chunk = groupFiles.slice(i, i + batchConfig.maxConcurrent);
batchJobs.push({
jobId: ++jobId,
fileType: type,
files: chunk.map(f => ({
fileName: f.binary?.fileName,
size: f.binary?.data ? Buffer.from(f.binary.data, 'base64').length : 0
})),
fileCount: chunk.length
});
}
}
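// Each job is emitted as its own n8n item below, so a downstream Split In Batches /
// Loop Over Items node (or a sub-workflow call per job) can process the chunks independently.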
return batchJobs.map(job => ({
json: {
...job,
config: batchConfig,
timestamp: new Date().toISOString()
}
}));
Error Handling and Logging
Comprehensive File Processing Logger
// Function Node: File Processing Logger
const result = $input.first().json;
const startTime = $input.first().json.startTime || null;
const logEntry = {
timestamp: new Date().toISOString(),
processingTime: startTime ? Date.now() - startTime : null,
operation: result.operation,
status: result.success ? 'success' : 'failed',
input: {
fileName: result.inputFile,
mimeType: result.inputMimeType,
size: result.inputSize
},
output: result.success ? {
fileName: result.outputFile,
mimeType: result.outputMimeType,
size: result.outputSize
} : null,
error: result.error || null,
metadata: {
workflowId: $workflow.id,
executionId: $execution.id,
nodeId: $node.id
}
};
// Determine log level
let level = 'info';
if (result.error) {
level = result.error.includes('timeout') ? 'warn' : 'error';
}
return [{
json: {
...logEntry,
level
}
}];
Best Practices Summary
- Always validate file types using magic bytes, not just extensions
- Implement size limits to prevent resource exhaustion
- Use streaming for large files to minimize memory usage (see the sketch after this list)
- Handle errors gracefully with proper logging
- Sanitize file names before storage
- Use presigned URLs for secure file access
- Batch process when handling multiple files
- Monitor processing times to identify bottlenecks
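A minimal sketch of the streaming recommendation, assuming a self-hosted instance where the large file is already on disk (for example after an SFTP download or Write Binary File step) and where the aws-sdk and fs modules are allowed; the localPath and storagePath input fields are assumptions about the incoming item, not part of the workflows above:
// Function Node: Stream a large file from disk to S3 (sketch — input field names are assumptions)
const fs = require('fs');
const AWS = require('aws-sdk');
const s3 = new AWS.S3({
accessKeyId: $env.AWS_ACCESS_KEY_ID,
secretAccessKey: $env.AWS_SECRET_ACCESS_KEY,
region: $env.AWS_REGION
});
const { localPath, storagePath } = $input.first().json;
// s3.upload() accepts a readable stream as Body and performs a managed multipart upload,
// so memory usage stays roughly constant regardless of file size
const result = await s3.upload({
Bucket: $env.S3_BUCKET,
Key: storagePath,
Body: fs.createReadStream(localPath)
}).promise();
return [{
json: {
uploaded: true,
key: result.Key,
location: result.Location
}
}];
The same idea applies in the other direction: prefer piping streams between source and destination over materializing entire files in a Buffer whenever the nodes involved allow it.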
Document automation with n8n enables powerful file processing pipelines while maintaining security and scalability. Combine these patterns based on your specific use case requirements.