Your LLM API Dies When the GPU Fails
A GPU hits a thermal limit and throttles. The vLLM process crashes from an OOM spike. The CUDA driver hangs after an ECC error. Without a fallback strategy, your entire LLM service goes offline until someone manually restarts the process. Production systems on your GPU server need automatic failure detection and graceful degradation so users always get a response — even if it comes from a slower backend.
Health Check Implementation
Detect failures before users do with active and passive health checks:
import asyncio, httpx, time
from enum import Enum
class BackendStatus(Enum):
    """Health states a backend can be in, as classified by HealthChecker probes."""
    HEALTHY = "healthy"    # fast 200 from /health
    DEGRADED = "degraded"  # responded, but slowly, or a transient probe failure
    DOWN = "down"          # not responding — excluded from routing
class HealthChecker:
    """Tracks liveness of inference backends via active HTTP health probes.

    Statuses follow a 3-strike policy: any failed probe (exception OR
    non-200 response) increments a per-backend strike counter; three
    consecutive strikes mark the backend DOWN, fewer mark it DEGRADED.
    Any 200 response resets the counter.
    """

    def __init__(self, backends):
        # Start optimistic: every backend is HEALTHY until a probe says otherwise.
        self.backends = {b: BackendStatus.HEALTHY for b in backends}
        self.last_check = {b: 0 for b in backends}
        self.consecutive_failures = {b: 0 for b in backends}

    async def check_backend(self, url):
        """Probe one backend's /health endpoint and classify its status.

        Returns HEALTHY for a fast 200 (< 2 s), DEGRADED for a slow 200
        or a transient failure, DOWN after 3 consecutive failed probes.
        """
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                start = time.time()
                resp = await client.get(f"{url}/health")
                latency = time.time() - start
        except Exception:
            return self._record_failure(url)
        if resp.status_code == 200:
            # Any 200 proves the process is alive — reset the strike count.
            # (The original reset only on the fast path, so a slow-but-alive
            # backend could be flipped DOWN by a single later timeout.)
            self.consecutive_failures[url] = 0
            return (BackendStatus.HEALTHY if latency < 2.0
                    else BackendStatus.DEGRADED)  # slow but alive
        # Non-200 now goes through the same 3-strike counting as exceptions,
        # instead of instantly declaring DOWN as the original did.
        return self._record_failure(url)

    def _record_failure(self, url):
        # One shared path for every kind of failed probe.
        self.consecutive_failures[url] += 1
        if self.consecutive_failures[url] >= 3:
            return BackendStatus.DOWN
        return BackendStatus.DEGRADED

    async def monitor(self, interval=10):
        """Background loop: re-probe every backend each `interval` seconds."""
        while True:
            for backend in self.backends:
                self.backends[backend] = await self.check_backend(backend)
                # Original initialized last_check but never updated it.
                self.last_check[backend] = time.time()
            await asyncio.sleep(interval)

    def get_healthy_backends(self):
        """Return backends that are usable (HEALTHY or DEGRADED, not DOWN)."""
        return [b for b, s in self.backends.items()
                if s != BackendStatus.DOWN]
Multi-Tier Fallback Chain
Define a priority order for backends, with automatic failover to lower tiers when primaries fail and automatic return to the primary once it recovers:
class FallbackRouter:
    """Routes chat requests through a prioritized chain of inference backends.

    Tiers are tried in order; the first one not marked DOWN that answers
    successfully wins. A tier that raises is marked DOWN and skipped.
    """

    # Default priority chain: fastest/most capable first, CPU as last resort.
    DEFAULT_TIERS = [
        # Tier 1: Primary GPU backend (fastest)
        {"url": "http://gpu1:8000", "type": "vllm_gpu",
         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct"},
        # Tier 2: Secondary GPU (different server)
        {"url": "http://gpu2:8000", "type": "vllm_gpu",
         "model": "meta-llama/Meta-Llama-3.1-8B-Instruct"},
        # Tier 3: Smaller model on same GPU (if primary model crashed)
        {"url": "http://localhost:11434", "type": "ollama",
         "model": "llama3.1:8b-q4"},
        # Tier 4: CPU inference (slow but always available)
        {"url": "http://localhost:8080", "type": "llamacpp_cpu",
         "model": "llama-3.1-8b-q4_k_m.gguf"},
    ]

    def __init__(self, tiers=None):
        """`tiers`: optional ordered list of {"url", "type", "model"} dicts;
        defaults to DEFAULT_TIERS (copied, so instances never share state)."""
        if tiers is None:
            tiers = [dict(t) for t in self.DEFAULT_TIERS]
        self.tiers = list(tiers)
        self.health = HealthChecker([t["url"] for t in self.tiers])

    async def route(self, payload):
        """Try each tier in priority order; return the first successful result.

        The winning tier's type is attached under "_backend". If every tier
        is DOWN or fails, returns a static error dict with a retry hint.
        """
        for tier in self.tiers:
            if self.health.backends.get(tier["url"]) == BackendStatus.DOWN:
                continue
            try:
                result = await self.call_backend(tier, payload)
            except Exception:
                # Any transport/HTTP error demotes the tier immediately;
                # the health monitor will promote it again once it recovers.
                self.health.backends[tier["url"]] = BackendStatus.DOWN
                continue
            result["_backend"] = tier["type"]
            return result
        # All backends down — return a static error response
        return {"error": "All inference backends unavailable",
                "retry_after": 30}

    async def call_backend(self, tier, payload):
        """POST `payload` to one tier, translating to its native API shape.

        Raises httpx errors on transport failure or non-2xx status.
        """
        # Copy before injecting the model name — the original mutated the
        # caller's dict, leaking state across tiers and callers.
        body = {**payload, "model": tier["model"]}
        async with httpx.AsyncClient(timeout=60.0) as client:
            if tier["type"] == "ollama":
                resp = await client.post(
                    f"{tier['url']}/api/chat", json=body)
            else:
                # vLLM and llama.cpp both speak the OpenAI-compatible API.
                resp = await client.post(
                    f"{tier['url']}/v1/chat/completions", json=body)
            resp.raise_for_status()
            return resp.json()
Graceful Degradation Strategies
When falling back to weaker backends, adjust expectations accordingly:
async def degraded_response(payload, tier_type):
    """Adjust request parameters for a weaker fallback tier, then dispatch.

    Caps max_tokens (and, on CPU, forces greedy decoding) so latency stays
    bearable on slower backends. Raises ValueError for tier types this
    function does not handle — the original silently returned None.
    """
    # Work on a copy: never mutate the caller's request dict.
    payload = dict(payload)
    if tier_type == "llamacpp_cpu":
        # CPU is slow — reduce max_tokens to keep latency bearable
        payload["max_tokens"] = min(payload.get("max_tokens", 500), 200)
        payload["temperature"] = 0  # Greedy for speed
        result = await call_cpu_backend(payload)
        # Flag the response so clients can surface the degradation.
        result["_degraded"] = True
        result["_notice"] = "Response from fallback — may be slower or shorter"
        return result
    if tier_type == "ollama":
        # Ollama with smaller quant — still decent quality
        payload["max_tokens"] = min(payload.get("max_tokens", 500), 400)
        return await call_ollama(payload)
    # Fail loudly instead of falling through to an implicit None.
    raise ValueError(f"degraded_response: unsupported tier_type {tier_type!r}")
Automatic Recovery and Process Restart
Restart crashed processes without human intervention:
# systemd service with automatic restart
# /etc/systemd/system/vllm.service
[Unit]
Description=vLLM Inference Server
After=network.target
# Rate-limit restarts: at most 5 start attempts per 300 s, then give up.
# StartLimit* settings belong in [Unit] on current systemd; the old
# [Service]-section spellings (StartLimitInterval=) are deprecated.
StartLimitIntervalSec=300
StartLimitBurst=5

[Service]
Type=simple
User=inference
ExecStart=/opt/vllm/venv/bin/python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000
Restart=always
RestartSec=10
# NOTE(review): WatchdogSec removed. It requires Type=notify and the service
# pinging sd_notify(WATCHDOG=1) periodically; vLLM's stock API server does
# neither, so systemd would kill a perfectly healthy process every 30 s.
# Block until the server actually answers health checks before marking active.
ExecStartPost=/usr/local/bin/wait-for-healthy.sh localhost 8000

[Install]
WantedBy=multi-user.target
Failure Alerting
import smtplib
from email.mime.text import MIMEText
async def alert_on_failover(from_tier, to_tier, error):
    """Compose an operator alert email for a failover event.

    Returns the MIMEText message so the caller can hand it to an SMTP
    relay — the original built the message and then discarded it, so no
    alert could ever be delivered. Returning the message is backward
    compatible with callers that ignored the previous implicit None.
    """
    msg = MIMEText(
        f"LLM failover triggered\n"
        f"From: {from_tier}\nTo: {to_tier}\nError: {error}\n"
        f"Action: Check GPU server and restart primary if needed"
    )
    msg["Subject"] = f"[ALERT] LLM Failover: {from_tier} -> {to_tier}"
    # Send via configured SMTP relay, e.g.:
    #   with smtplib.SMTP("relay.internal") as s:
    #       s.send_message(msg)
    return msg
Resilient LLM serving on your GPU server means users never see downtime. For vLLM production deployments, the production guide covers systemd and health check patterns. Ollama makes an excellent fallback tier. See the LLM hosting section for multi-backend architecture, infrastructure guides for monitoring, and tutorials for setup walkthroughs.
Reliable LLM Infrastructure
Multi-GPU servers with automatic failover. GigaGPU dedicated servers keep your AI always responding.
Browse GPU Servers