Building an AI Inference Server with FastAPI — Production LLM Serving Guide
How to build a production-grade AI model inference server with FastAPI and uvicorn. Covers async processing, batch inference, GPU utilization, and Kubernetes deployment.
TestForge Team
Why FastAPI for AI Inference?
FastAPI is the top choice for AI inference servers because:
- Async I/O: Handle other requests while waiting for inference
- Type hints + Pydantic: Automatic request/response schema validation
- Auto API docs: Swagger UI available at /docs out of the box
- Python ecosystem: Seamless integration with PyTorch and HuggingFace
Basic Inference Server Structure
# main.py
import asyncio
from contextlib import asynccontextmanager

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM
model = None
tokenizer = None
@asynccontextmanager
async def lifespan(app: FastAPI):
# Load model once at server startup
global model, tokenizer
tokenizer = AutoTokenizer.from_pretrained("your-model")
model = AutoModelForCausalLM.from_pretrained(
"your-model",
torch_dtype=torch.float16,
device_map="auto",
)
model.eval()
yield
# Cleanup on shutdown
del model, tokenizer
app = FastAPI(lifespan=lifespan)
class InferRequest(BaseModel):
prompt: str
max_tokens: int = 256
temperature: float = 0.7
class InferResponse(BaseModel):
text: str
tokens_used: int
@app.post("/infer", response_model=InferResponse)
async def infer(req: InferRequest):
inputs = tokenizer(req.prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
output = model.generate(
**inputs,
max_new_tokens=req.max_tokens,
temperature=req.temperature,
do_sample=True,
)
text = tokenizer.decode(output[0], skip_special_tokens=True)
return InferResponse(text=text, tokens_used=len(output[0]))
Batch Inference for Higher Throughput
Processing requests one at a time leaves GPU utilization low. Batch them together.
import asyncio
from collections import deque
batch_queue = deque()
BATCH_SIZE = 8
BATCH_TIMEOUT = 0.05 # 50ms
async def batch_processor():
while True:
await asyncio.sleep(BATCH_TIMEOUT)
if not batch_queue:
continue
batch = []
while batch_queue and len(batch) < BATCH_SIZE:
batch.append(batch_queue.popleft())
# Batch inference
prompts = [item["prompt"] for item in batch]
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=256)
for i, item in enumerate(batch):
text = tokenizer.decode(outputs[i], skip_special_tokens=True)
item["future"].set_result(text)
@app.on_event("startup")
async def start_batch_processor():
asyncio.create_task(batch_processor())
Timeout and Error Handling
import asyncio
from fastapi import HTTPException
@app.post("/infer")
async def infer(req: InferRequest):
try:
result = await asyncio.wait_for(
run_inference(req),
timeout=30.0 # 30-second timeout
)
return result
except asyncio.TimeoutError:
raise HTTPException(status_code=504, detail="Inference timeout")
except torch.cuda.OutOfMemoryError:
torch.cuda.empty_cache()
raise HTTPException(status_code=503, detail="GPU OOM, retry later")
Health Check Endpoint
@app.get("/health")
async def health():
gpu_available = torch.cuda.is_available()
gpu_memory = {}
if gpu_available:
gpu_memory = {
"allocated": f"{torch.cuda.memory_allocated() / 1e9:.2f}GB",
"reserved": f"{torch.cuda.memory_reserved() / 1e9:.2f}GB",
}
return {
"status": "ok",
"model_loaded": model is not None,
"gpu": gpu_available,
"gpu_memory": gpu_memory,
}
Kubernetes Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: ai-server
spec:
replicas: 2
template:
spec:
containers:
- name: ai-server
image: your-registry/ai-server:latest
resources:
limits:
nvidia.com/gpu: 1
memory: "16Gi"
requests:
nvidia.com/gpu: 1
memory: "12Gi"
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 120 # Allow time for model loading
periodSeconds: 30
Running the Server
# Development
uvicorn main:app --reload --host 0.0.0.0 --port 8000
# Production (workers=1 recommended — GPU memory sharing issues)
uvicorn main:app --host 0.0.0.0 --port 8000 --workers 1
# Or gunicorn + uvicorn worker
gunicorn main:app -w 1 -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000
Warning about workers > 1 in production: Each worker occupies GPU memory independently.
Scale horizontally by replicating Pods, and keep each Pod at workers=1.