GPU Infrastructure for ML: Cost Optimization and Scaling Strategies
GPU costs are often the largest line item in ML budgets. A single A100 instance on AWS costs $32/hour — that's $23,000/month if left running. This guide covers practical strategies to reduce GPU costs by 60-80% without sacrificing training speed or model quality.
The GPU Cost Problem
| GPU | AWS (on-demand) | GCP (on-demand) | Monthly, 24/7 (AWS / GCP) |
|-----|-----------------|-----------------|---------------------------|
| T4 (16GB) | $0.53/hr | $0.35/hr | $380 / $250 |
| A10G (24GB) | $1.21/hr | — | $870 / — |
| V100 (16GB) | $3.06/hr | $2.48/hr | $2,200 / $1,800 |
| A100 (40GB) | $4.10/hr | $3.67/hr | $2,950 / $2,640 |
| A100 (80GB) | $32.77/hr | $11.84/hr | $23,600 / $8,520 |
| H100 (80GB) | $65.93/hr | $12.72/hr | $47,400 / $9,160 |
Most teams waste 40-70% of their GPU budget on idle instances, over-provisioned hardware, and inefficient training.
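Idle capacity is usually the easiest waste to find. As a starting point, a small watchdog can poll `nvidia-smi` and flag instances whose GPUs have sat unused for a while. The sketch below is illustrative only; the utilization threshold, polling window, and what you do when the alert fires are assumptions you would adapt to your own environment.

```python
import subprocess
import time

def gpu_utilizations() -> list[int]:
    """Query per-GPU utilization (%) via nvidia-smi."""
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=utilization.gpu",
         "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True,
    )
    return [int(line) for line in out.stdout.splitlines() if line.strip()]

def watch_for_idle(threshold_pct: int = 5, window_minutes: int = 30):
    """Flag the instance if every GPU stays below the threshold for the whole window."""
    idle_since = None
    while True:
        busy = any(u > threshold_pct for u in gpu_utilizations())
        if busy:
            idle_since = None
        elif idle_since is None:
            idle_since = time.time()
        elif time.time() - idle_since > window_minutes * 60:
            print(f"GPUs idle for {window_minutes}+ minutes; consider stopping this instance")
            idle_since = None  # reset; hook your own stop/notify logic in here
        time.sleep(60)
```
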
Strategy 1: Spot/Preemptible Instances (50-90% Savings)
Cloud providers offer unused GPU capacity at massive discounts. The trade-off: instances can be reclaimed with short notice.
```python
# AWS Spot Fleet configuration for training
import boto3

ec2 = boto3.client("ec2")

spot_fleet_config = {
    "SpotFleetRequestConfig": {
        "IamFleetRole": "arn:aws:iam::<account-id>:role/aws-ec2-spot-fleet-role",  # role that lets EC2 manage the fleet
        "TargetCapacity": 4,
        "SpotPrice": "2.00",  # Max price per GPU instance
        "LaunchSpecifications": [
            {
                "InstanceType": "g5.xlarge",  # A10G GPU
                "ImageId": "ami-deeplearning-gpu",
                "SubnetId": "subnet-xxx",
                "IamInstanceProfile": {"Arn": "arn:aws:iam::instance-profile/ml-training"},
            },
            {
                "InstanceType": "g4dn.xlarge",  # T4 GPU (fallback)
                "ImageId": "ami-deeplearning-gpu",
                "SubnetId": "subnet-xxx",
                "IamInstanceProfile": {"Arn": "arn:aws:iam::instance-profile/ml-training"},
            },
        ],
        "AllocationStrategy": "lowestPrice",
        "InstanceInterruptionBehavior": "stop",
    }
}

# Submit the request (the AMI, subnet, and role ARNs above must be replaced with real values)
response = ec2.request_spot_fleet(**spot_fleet_config)
```

Making Spot Instances Work for ML Training
```python
import os
import torch


class CheckpointManager:
    """Save checkpoints frequently to survive spot instance interruptions."""

    def __init__(self, checkpoint_dir: str, save_every_n_steps: int = 500):
        self.checkpoint_dir = checkpoint_dir
        self.save_interval = save_every_n_steps

    def save(self, model, optimizer, step: int, metrics: dict):
        if step % self.save_interval != 0:
            return
        checkpoint = {
            "step": step,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "metrics": metrics,
        }
        path = os.path.join(self.checkpoint_dir, f"checkpoint-{step}.pt")
        torch.save(checkpoint, path)
        # Upload to S3 immediately (survive instance loss)
        self._upload_to_s3(path)
        # Keep only last 3 checkpoints
        self._cleanup_old_checkpoints(keep=3)

    def load_latest(self, model, optimizer) -> int:
        """Resume from latest checkpoint after spot interruption."""
        latest = self._find_latest_checkpoint()
        if latest is None:
            return 0
        checkpoint = torch.load(latest)
        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        return checkpoint["step"]

    # _upload_to_s3, _cleanup_old_checkpoints, and _find_latest_checkpoint are
    # storage-specific helpers (e.g., boto3 S3 sync) omitted here for brevity.
```
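Checkpointing covers the recovery side; you can also react before the instance disappears. On AWS, a spot interruption is announced roughly two minutes ahead of time through the instance metadata endpoint, so a background poller can trigger one final checkpoint. The sketch below is a minimal illustration: `save_final_checkpoint` is a placeholder for your own CheckpointManager call, and instances that enforce IMDSv2 would need to fetch a session token first.

```python
import time
import urllib.request

# Metadata path that returns interruption details (404 until an interruption is scheduled)
SPOT_ACTION_URL = "http://169.254.169.254/latest/meta-data/spot/instance-action"

def interruption_pending() -> bool:
    """Return True once AWS has scheduled this spot instance for interruption."""
    try:
        with urllib.request.urlopen(SPOT_ACTION_URL, timeout=1):
            return True
    except Exception:  # 404 or timeout: no interruption scheduled yet
        return False

def watch_for_interruption(save_final_checkpoint, poll_seconds: int = 15):
    """Poll instance metadata and flush a last checkpoint when the ~2-minute warning appears."""
    while True:
        if interruption_pending():
            save_final_checkpoint()
            break
        time.sleep(poll_seconds)
```
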
Strategy 2: Right-Sizing GPU Selection

Don't use A100s for inference that runs fine on T4s:

```python
class GPURecommender:
    """Recommend optimal GPU based on workload characteristics."""

    GPU_SPECS = {
        "t4": {"vram_gb": 16, "fp32_tflops": 8.1, "cost_hr": 0.53},
        "a10g": {"vram_gb": 24, "fp32_tflops": 31.2, "cost_hr": 1.21},
        "v100": {"vram_gb": 16, "fp32_tflops": 15.7, "cost_hr": 3.06},
        "a100": {"vram_gb": 40, "fp32_tflops": 19.5, "cost_hr": 4.10},
        "a100_80": {"vram_gb": 80, "fp32_tflops": 19.5, "cost_hr": 32.77},
    }

    def recommend(self, workload: dict) -> list[dict]:
        model_size_gb = workload["model_size_gb"]
        batch_size = workload.get("batch_size", 32)  # reserved for finer-grained estimates
        workload_type = workload["type"]  # "training" or "inference"
        # Rough VRAM heuristic: training needs model + gradients + optimizer state;
        # inference mostly needs the weights plus activation headroom.
        if workload_type == "training":
            est_vram_needed = model_size_gb * 3
        else:
            est_vram_needed = model_size_gb * 1.5
        recommendations = []
        for gpu, specs in self.GPU_SPECS.items():
            if specs["vram_gb"] >= est_vram_needed:
                score = specs["fp32_tflops"] / specs["cost_hr"]  # Performance per dollar
                if workload_type == "inference":
                    score *= 2 if gpu in ("t4", "a10g") else 1  # Favor cheaper GPUs for inference
                recommendations.append({
                    "gpu": gpu,
                    "cost_per_hour": specs["cost_hr"],
                    "vram_gb": specs["vram_gb"],
                    "performance_per_dollar": round(score, 2),
                    "headroom_gb": specs["vram_gb"] - est_vram_needed,
                })
        return sorted(recommendations, key=lambda x: x["performance_per_dollar"], reverse=True)
```
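For example, asking the recommender about a hypothetical inference deployment of a ~7 GB model might look like this (the exact numbers depend on the GPU_SPECS table above):

```python
recommender = GPURecommender()

# Hypothetical workload: serving a ~7 GB model for inference
options = recommender.recommend({"model_size_gb": 7, "type": "inference"})

for option in options:
    print(f"{option['gpu']}: ${option['cost_per_hour']}/hr, "
          f"{option['headroom_gb']:.1f} GB VRAM headroom, "
          f"score {option['performance_per_dollar']}")
```
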
Strategy 3: Scheduling and Auto-Scaling

```yaml
# Kubernetes GPU auto-scaling for ML workloads
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-inference-hpa
  namespace: ml-serving
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: model-server
  minReplicas: 1
  maxReplicas: 8
  metrics:
    # Both metrics below are custom pod metrics; they must be exposed through a
    # metrics pipeline (e.g., DCGM exporter + Prometheus Adapter) before the HPA can use them.
    - type: Pods
      pods:
        metric:
          name: gpu_utilization
        target:
          type: AverageValue
          averageValue: "70"  # Scale up when GPU utilization > 70%
    - type: Pods
      pods:
        metric:
          name: prediction_queue_depth
        target:
          type: AverageValue
          averageValue: "50"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 120
    scaleDown:
      stabilizationWindowSeconds: 300  # Wait 5 min before scaling down
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
```

Time-Based Scheduling

```python
from datetime import datetime


class GPUScheduler:
    """Schedule GPU workloads during off-peak hours for cost savings."""

    PEAK_HOURS = range(9, 18)  # 9 AM - 6 PM UTC (inference serving)
    OFF_PEAK_DISCOUNT = 0.3  # ~30% cheaper during off-peak

    def schedule_training(self, estimated_hours: float, priority: str = "normal"):
        now = datetime.utcnow()
        if priority == "urgent":
            return {"start": now, "strategy": "on_demand"}
        # Schedule non-urgent training for off-peak hours
        if now.hour in self.PEAK_HOURS:
            next_offpeak = now.replace(hour=18, minute=0, second=0, microsecond=0)
        else:
            next_offpeak = now  # Already off-peak (evening or early morning)
        return {
            "start": next_offpeak,
            "strategy": "spot_instances",
            "estimated_cost_savings": f"{self.OFF_PEAK_DISCOUNT * 100:.0f}%",
            "estimated_duration_hours": estimated_hours,
        }
```
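A quick usage sketch: during business hours a "normal" job gets pushed to the evening spot window, while an "urgent" job starts immediately on on-demand capacity.

```python
scheduler = GPUScheduler()

plan = scheduler.schedule_training(estimated_hours=6, priority="normal")
print(plan["strategy"], plan["start"].isoformat(), plan["estimated_cost_savings"])

rush = scheduler.schedule_training(estimated_hours=2, priority="urgent")
print(rush["strategy"], rush["start"].isoformat())
```
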
Strategy 4: Training Optimization

Mixed Precision Training (2x Speedup, Less VRAM)

```python
import torch
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch in dataloader:
    optimizer.zero_grad()
    # Forward pass in mixed precision (float16 + float32)
    with autocast():
        output = model(batch["input"].cuda())
        loss = criterion(output, batch["target"].cuda())
    # Backward pass with gradient scaling
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```
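On Ampere-class GPUs (A100, A10G) and newer, bfloat16 is often a simpler alternative: it has the same exponent range as float32, so the GradScaler machinery is usually unnecessary. A minimal variant, assuming the same model, optimizer, and dataloader as above:

```python
for batch in dataloader:
    optimizer.zero_grad()
    # bfloat16 autocast: no gradient scaling needed, since bf16 keeps float32's range
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        output = model(batch["input"].cuda())
        loss = criterion(output, batch["target"].cuda())
    loss.backward()
    optimizer.step()
```
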
Gradient Accumulation (Larger Effective Batch on Smaller GPUs)

```python
accumulation_steps = 4  # Effective batch = actual_batch * 4

optimizer.zero_grad()
for i, batch in enumerate(dataloader):
    output = model(batch["input"].cuda())
    # Scale the loss so the accumulated gradients average out to one large batch
    loss = criterion(output, batch["target"].cuda()) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```
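The two techniques compose naturally. The only subtlety is that with a GradScaler you still divide the loss by the accumulation count and only step and update the scaler on accumulation boundaries. A sketch, reusing the scaler setup from the mixed-precision example:

```python
accumulation_steps = 4
scaler = GradScaler()

optimizer.zero_grad()
for i, batch in enumerate(dataloader):
    with autocast():
        output = model(batch["input"].cuda())
        loss = criterion(output, batch["target"].cuda()) / accumulation_steps
    scaler.scale(loss).backward()
    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
```
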
Cost Monitoring

```python
from datetime import datetime


class GPUCostMonitor:
    """Track and alert on GPU spending."""

    def __init__(self, monthly_budget: float):
        self.budget = monthly_budget
        self.spend_log = []

    def log_usage(self, gpu_type: str, hours: float, cost_per_hour: float, purpose: str):
        self.spend_log.append({
            "gpu": gpu_type,
            "hours": hours,
            "cost": hours * cost_per_hour,
            "purpose": purpose,
            "timestamp": datetime.utcnow().isoformat(),
        })

    def get_monthly_summary(self) -> dict:
        current_month = datetime.utcnow().strftime("%Y-%m")
        monthly = [s for s in self.spend_log if s["timestamp"].startswith(current_month)]
        total_cost = sum(s["cost"] for s in monthly)
        by_purpose = {}
        for s in monthly:
            by_purpose.setdefault(s["purpose"], 0)
            by_purpose[s["purpose"]] += s["cost"]
        return {
            "month": current_month,
            "total_cost": total_cost,
            "budget": self.budget,
            "utilization": total_cost / self.budget * 100,
            "by_purpose": by_purpose,
            "alert": total_cost > self.budget * 0.8,  # Alert at 80% of budget
        }
```
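A short usage example. The GPU types and hours below are hypothetical; in practice you would feed log_usage from your cloud billing export or job scheduler:

```python
monitor = GPUCostMonitor(monthly_budget=10_000)

monitor.log_usage("a10g", hours=12, cost_per_hour=1.21, purpose="training")
monitor.log_usage("t4", hours=200, cost_per_hour=0.53, purpose="inference")

summary = monitor.get_monthly_summary()
print(f"Spent ${summary['total_cost']:.2f} of ${summary['budget']:.2f} "
      f"({summary['utilization']:.1f}% of budget)")
if summary["alert"]:
    print("Warning: over 80% of the monthly GPU budget is already spent")
```
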
Cost Optimization Summary

| Strategy | Savings | Effort | Risk |
|----------|---------|--------|------|
| Spot/Preemptible instances | 50-90% | Medium | Interruptions |
| Right-size GPU selection | 30-60% | Low | None |
| Off-peak scheduling | 20-30% | Low | Delayed training |
| Mixed precision training | 40-50% (time) | Low | Minimal accuracy impact |
| Auto-scaling inference | 30-60% | Medium | Cold start latency |
| Reserved instances | 30-50% | Low | Commitment |
| Multi-GPU parallelism | Faster (not cheaper) | High | Complexity |
Related Resources
- Kubeflow Pipelines — Kubernetes-native GPU scheduling
- Model serving architecture — GPU inference optimization
- MLOps best practices — Infrastructure planning
- What is MLOps? — MLOps fundamentals including infrastructure
Need to optimize your ML infrastructure costs? DeviDevs designs cost-efficient GPU infrastructure for training and serving. Get a free assessment →