API Security¶
Protect your LLM API with authentication, rate limiting, and access control.
Overview¶
Security measures for exposed LLM APIs:
- Authentication - Verify client identity
- Rate limiting - Prevent abuse and overload
- Access control - Restrict endpoints and models
- Logging - Audit access patterns
- Encryption - Protect data in transit
Threat Model¶
| Threat | Risk | Mitigation |
|---|---|---|
| Unauthorized access | High | Authentication |
| Resource exhaustion | High | Rate limiting |
| Data exfiltration | Medium | Logging, access control |
| Prompt injection | Medium | Input validation |
| Model theft | Low | Endpoint restrictions |
Authentication¶
API Key Authentication¶
Using Caddy as a reverse proxy:
# Caddyfile
:8080 {
    @valid_key header Authorization "Bearer {$API_KEY}"

    handle @valid_key {
        reverse_proxy localhost:11434
    }

    handle {
        respond "Unauthorized" 401
    }
}
# Set environment variable
export API_KEY=your-secret-key
# Start Caddy
caddy run --config Caddyfile
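A long random key is harder to guess than a hand-picked one; for example:
export API_KEY=$(openssl rand -hex 32)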
Basic Authentication¶
:8080 {
    basicauth {
        user1 $2a$14$...  # bcrypt hash
        user2 $2a$14$...
    }
    reverse_proxy localhost:11434
}
Generate a bcrypt password hash with Caddy's built-in helper:
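caddy hash-password --plaintext 'your-password'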
nginx Basic Auth¶
# /etc/nginx/conf.d/llm.conf
server {
    listen 8080;

    auth_basic "LLM API";
    auth_basic_user_file /etc/nginx/.htpasswd;

    location / {
        proxy_pass http://localhost:11434;
    }
}
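Create the credentials file with htpasswd (from the apache2-utils package on Debian/Ubuntu; omit -c when adding more users):
sudo htpasswd -c /etc/nginx/.htpasswd user1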
Rate Limiting¶
Caddy Rate Limiting¶
Using the rate_limit plugin:
{
    order rate_limit before reverse_proxy
}

:8080 {
    rate_limit {
        zone dynamic {
            key {remote_host}
            events 60
            window 1m
        }
    }
    reverse_proxy localhost:11434
}
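The rate_limit directive comes from the github.com/mholt/caddy-ratelimit plugin, which is not in the default Caddy build; compile a binary that includes it with xcaddy:
xcaddy build --with github.com/mholt/caddy-ratelimit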
nginx Rate Limiting¶
limit_req_zone $binary_remote_addr zone=llm_limit:10m rate=10r/s;
limit_conn_zone $binary_remote_addr zone=llm_conn:10m;

server {
    listen 8080;

    # Rate limit requests (return 429 instead of the default 503)
    limit_req zone=llm_limit burst=20 nodelay;
    limit_req_status 429;

    # Limit concurrent connections
    limit_conn llm_conn 5;

    location / {
        proxy_pass http://localhost:11434;
    }
}
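With rate=10r/s and burst=20, nginx tolerates short spikes of up to 20 extra queued requests; nodelay serves the burst immediately instead of smoothing it, and limit_req_status makes rejections return 429 rather than nginx's default 503.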
Traefik Rate Limiting¶
labels:
  - "traefik.http.middlewares.ratelimit.ratelimit.average=10"
  - "traefik.http.middlewares.ratelimit.ratelimit.burst=20"
  - "traefik.http.middlewares.ratelimit.ratelimit.period=1s"
  - "traefik.http.routers.ollama.middlewares=ratelimit"
Endpoint Restrictions¶
Allow Only Inference Endpoints¶
server {
    listen 8080;

    # Allow chat/completion endpoints
    location ~ ^/v1/(chat/completions|completions|models|embeddings)$ {
        proxy_pass http://localhost:11434;
    }

    # Allow health checks
    location /health {
        proxy_pass http://localhost:11434;
    }

    # Block everything else
    location / {
        return 403;
    }
}
Block Administrative Endpoints¶
# Block Ollama management endpoints
location ~ ^/api/(pull|push|delete|copy|create)$ {
    return 403;
}
Caddy Endpoint Filtering¶
:8080 {
    @allowed {
        path /v1/chat/completions
        path /v1/completions
        path /v1/models
        path /v1/embeddings
        path /health
    }

    handle @allowed {
        reverse_proxy localhost:11434
    }

    handle {
        respond "Forbidden" 403
    }
}
Request Validation¶
Size Limits¶
# Limit request body size
client_max_body_size 1m;
# Limit header size
large_client_header_buffers 4 16k;
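A 1m body limit is ample for typical chat payloads; raise it if clients send large embedding batches or very long prompts.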
Timeout Protection¶
# Prevent long-running requests from blocking
proxy_read_timeout 120s;
proxy_connect_timeout 10s;
proxy_send_timeout 60s;
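Note that proxy_read_timeout limits the gap between two successive reads, not total response time, so a streamed generation can run past 120s as long as tokens keep arriving.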
Logging¶
nginx Access Logs¶
log_format llm_json escape=json '{'
    '"time":"$time_iso8601",'
    '"remote_addr":"$remote_addr",'
    '"request":"$request",'
    '"status":"$status",'
    '"body_bytes_sent":"$body_bytes_sent",'
    '"request_time":"$request_time",'
    '"user":"$remote_user"'
'}';

access_log /var/log/nginx/llm_access.log llm_json;
Caddy Logging¶
:8080 {
    log {
        output file /var/log/caddy/llm_access.log
        format json
    }
    reverse_proxy localhost:11434
}
Log Analysis¶
# Recent requests
tail -f /var/log/nginx/llm_access.log | jq .
# Top users
jq -r '.remote_addr' /var/log/nginx/llm_access.log | sort | uniq -c | sort -rn
# Failed requests
jq 'select(.status != "200")' /var/log/nginx/llm_access.log
Full Secure Setup¶
Docker Compose with Caddy¶
version: '3.8'

services:
  caddy:
    image: caddy:latest  # use a custom build if the Caddyfile enables rate_limit
    container_name: caddy
    ports:
      - "8080:8080"
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - caddy_data:/data
    environment:
      - API_KEY=${API_KEY}
    restart: unless-stopped

  ollama:
    image: ollama/ollama
    container_name: ollama
    volumes:
      - /tank/ai/models/ollama:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped

volumes:
  caddy_data:
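One way to bring the stack up, generating a random key on the fly:
export API_KEY=$(openssl rand -hex 32)
echo "$API_KEY"  # save this; clients need it for the Authorization header
docker compose up -d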
# Caddyfile
{
    order rate_limit before reverse_proxy
}

:8080 {
    # Logging
    log {
        output file /data/access.log
        format json
    }

    # Rate limiting per IP (requires a Caddy build with the rate_limit
    # plugin; it returns 429 automatically when the limit is exceeded)
    rate_limit {
        zone dynamic {
            key {remote_host}
            events 60
            window 1m
        }
    }

    # API key authentication
    @valid_key header Authorization "Bearer {$API_KEY}"

    handle @valid_key {
        # Only allow safe endpoints
        @allowed {
            path /v1/chat/completions
            path /v1/completions
            path /v1/models
            path /v1/embeddings
            path /health
            path /
        }
        handle @allowed {
            reverse_proxy ollama:11434
        }
        handle {
            respond "Forbidden" 403
        }
    }

    handle {
        respond "Unauthorized" 401
    }
}
nginx Full Configuration¶
# /etc/nginx/conf.d/llm-secure.conf
# Rate limiting zones
limit_req_zone $binary_remote_addr zone=llm_burst:10m rate=10r/s;
limit_conn_zone $binary_remote_addr zone=llm_conn:10m;

# Upstream
upstream ollama {
    server 127.0.0.1:11434;
    keepalive 32;
}

server {
    listen 8080;
    server_name _;

    # Logging (uses the llm_json format from the Logging section)
    access_log /var/log/nginx/llm_access.log llm_json;
    error_log /var/log/nginx/llm_error.log;

    # Rate limiting (return 429 instead of the default 503)
    limit_req zone=llm_burst burst=20 nodelay;
    limit_req_status 429;
    limit_conn llm_conn 10;

    # Size limits
    client_max_body_size 1m;

    # Timeouts
    proxy_read_timeout 120s;
    proxy_connect_timeout 10s;

    # Basic auth
    auth_basic "LLM API";
    auth_basic_user_file /etc/nginx/.htpasswd;

    # Allowed endpoints
    location ~ ^/v1/(chat/completions|completions|models|embeddings)$ {
        proxy_pass http://ollama;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header Host $host;
        proxy_buffering off;  # required for streaming responses
    }

    location /health {
        auth_basic off;  # allow unauthenticated health checks
        proxy_pass http://ollama;
    }

    # Block everything else
    location / {
        return 403;
    }
}
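Validate the configuration and reload:
sudo nginx -t && sudo systemctl reload nginx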
Testing Security¶
Test Authentication¶
# Without auth (should fail)
curl -I http://localhost:8080/v1/models
# Expect: 401 Unauthorized
# With auth
curl -H "Authorization: Bearer your-key" http://localhost:8080/v1/models
# Expect: 200 OK
Test Rate Limiting¶
# Send many requests
for i in {1..50}; do
  curl -s -o /dev/null -w "%{http_code}\n" \
    -H "Authorization: Bearer your-key" \
    http://localhost:8080/v1/models
done
# Should see 429 responses after limit
Test Endpoint Restrictions¶
# Try blocked endpoint
curl -H "Authorization: Bearer your-key" \
  -X POST http://localhost:8080/api/pull \
  -d '{"name": "malicious-model"}'
# Expect: 403 Forbidden
Monitoring¶
Failed Auth Attempts¶
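Repeated 401s from one address suggest credential guessing; count them per source IP in the JSON access log:
jq -r 'select(.status == "401") | .remote_addr' /var/log/nginx/llm_access.log | sort | uniq -c | sort -rn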
Rate Limited Requests¶
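A burst of 429s shows clients hitting the rate limit; bucket them by hour from the ISO8601 timestamp:
jq -r 'select(.status == "429") | .time' /var/log/nginx/llm_access.log | cut -c1-13 | sort | uniq -c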
Unusual Patterns¶
# Large request counts from a single IP
jq -r '.remote_addr' /var/log/nginx/llm_access.log | sort | uniq -c | sort -rn | head -20
See Also¶
- Remote Access Index - Overview
- Tailscale Integration - Secure access
- Load Balancing - Proxy configuration
- UFW Configuration - Firewall rules