Quick Reference
Command cheat sheet for local LLM operations.
Ollama
Basic Commands
# Start server
ollama serve
# Run model (pulls if needed)
ollama run llama3.3:70b
# Pull model
ollama pull llama3.3:70b-instruct-q4_K_M
# List models
ollama list
# Show model info
ollama show llama3.3:70b
# Remove model
ollama rm llama3.3:70b
# Copy/rename model
ollama cp llama3.3:70b my-llama
# List running models
ollama ps
# Stop model
ollama stop llama3.3:70b
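A model can also be built from a Modelfile, for example to bake in a system prompt or parameters (the name and path below are just examples):
# Create model from a Modelfile
ollama create my-llama -f ./Modelfile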
API Calls
# Chat completion (OpenAI compatible)
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "llama3.3", "messages": [{"role": "user", "content": "Hello"}]}'
# Native generate
curl http://localhost:11434/api/generate \
-d '{"model": "llama3.3", "prompt": "Hello", "stream": false}'
# Embeddings
curl http://localhost:11434/api/embeddings \
-d '{"model": "nomic-embed-text", "prompt": "Hello world"}'
# List models
curl http://localhost:11434/v1/models
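Ollama's native chat endpoint follows the same pattern as generate; a minimal sketch:
# Native chat
curl http://localhost:11434/api/chat \
  -d '{"model": "llama3.3", "messages": [{"role": "user", "content": "Hello"}], "stream": false}'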
Environment Variables
export OLLAMA_HOST=0.0.0.0:11434
export OLLAMA_MODELS=/tank/ai/models/ollama
export OLLAMA_NUM_PARALLEL=4
export OLLAMA_MAX_LOADED_MODELS=2
export OLLAMA_KEEP_ALIVE=30m
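If Ollama runs as a systemd service (the default Linux install), these variables belong in a service drop-in rather than the shell; a sketch:
# Open a drop-in editor, then add under [Service]:
#   Environment="OLLAMA_HOST=0.0.0.0:11434"
#   Environment="OLLAMA_MODELS=/tank/ai/models/ollama"
sudo systemctl edit ollama
sudo systemctl restart ollama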
llama.cpp
Build
# Clone
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
# Build (CUDA)
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
# Build (ROCm)
cmake -B build -DGGML_HIP=ON
cmake --build build --config Release
# Build (Metal - macOS)
cmake -B build -DGGML_METAL=ON
cmake --build build --config Release
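A plain CPU build needs no backend flag; with this CMake layout the binaries land in build/bin/:
# Build (CPU only)
cmake -B build
cmake --build build --config Release -j
ls build/bin/llama-server build/bin/llama-cli build/bin/llama-bench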
Server
# Basic server
./build/bin/llama-server \
-m model.gguf \
--host 0.0.0.0 \
--port 8080
# Full options: 8192-token context, all layers on GPU, 4 parallel slots,
# continuous batching, flash attention, Prometheus metrics endpoint
./build/bin/llama-server \
  -m model.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -c 8192 \
  -ngl 99 \
  --parallel 4 \
  --cont-batching \
  --flash-attn \
  --metrics
Benchmarking
# 512-token prompt, 128 generated tokens, all layers on GPU, 5 repetitions
./build/bin/llama-bench \
  -m model.gguf \
  -p 512 \
  -n 128 \
  -ngl 99 \
  -r 5
CLI Inference
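A minimal llama-cli sketch (flag names assumed from current llama.cpp builds; the model path is an example):
# One-shot prompt
./build/bin/llama-cli -m model.gguf -p "Explain ZFS in one sentence" -n 256 -ngl 99
# Interactive chat mode
./build/bin/llama-cli -m model.gguf -cnv -ngl 99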
Docker
Ollama
# Start Ollama (NVIDIA)
docker run -d \
--gpus all \
-v /tank/ai/models/ollama:/root/.ollama \
-p 11434:11434 \
--name ollama \
ollama/ollama
# Start Ollama (AMD)
docker run -d \
--device=/dev/kfd --device=/dev/dri \
--group-add video --group-add render \
-v /tank/ai/models/ollama:/root/.ollama \
-p 11434:11434 \
--name ollama \
ollama/ollama:rocm
# Run command in container
docker exec ollama ollama pull llama3.3:70b
docker exec -it ollama ollama run llama3.3:70b
llama.cpp
# Start server (NVIDIA)
docker run -d \
--gpus all \
-v /tank/ai/models/gguf:/models \
-p 8080:8080 \
--name llama-server \
ghcr.io/ggml-org/llama.cpp:server-cuda \
-m /models/llama-3.3-70b-q4_k_m.gguf \
--host 0.0.0.0 -c 8192 -ngl 99
# Start server (AMD)
docker run -d \
--device=/dev/kfd --device=/dev/dri \
--group-add video --group-add render \
-v /tank/ai/models/gguf:/models \
-p 8080:8080 \
ghcr.io/ggml-org/llama.cpp:server-rocm \
-m /models/llama-3.3-70b-q4_k_m.gguf \
--host 0.0.0.0 -c 8192 -ngl 99
Open WebUI
docker run -d \
-p 3000:8080 \
-v /tank/ai/data/open-webui:/app/backend/data \
-e OLLAMA_BASE_URL=http://host.docker.internal:11434 \
--add-host=host.docker.internal:host-gateway \
--name open-webui \
ghcr.io/open-webui/open-webui:main
Hugging Face
# Login
huggingface-cli login
# Download model
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct
# Download GGUF
huggingface-cli download bartowski/Llama-3.3-70B-Instruct-GGUF \
--include "*Q4_K_M*.gguf" \
--local-dir /tank/ai/models/gguf/
# Scan cache
huggingface-cli scan-cache
# Set cache location
export HF_HOME=/tank/ai/models/huggingface
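Large downloads can be sped up with the optional hf_transfer backend (assumes a pip-managed huggingface_hub):
# Faster parallel downloads
pip install -U huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1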
GPU Monitoring
NVIDIA
# Basic status
nvidia-smi
# Continuous monitoring
nvidia-smi -l 1
# Memory only
nvidia-smi --query-gpu=memory.used,memory.total --format=csv
# Watch utilization
watch -n 1 nvidia-smi
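A single query can combine utilization, memory, and power (field names as listed by nvidia-smi --help-query-gpu):
# Utilization, memory, and power, refreshed every second
nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,power.draw \
  --format=csv -l 1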
AMD
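Assuming the ROCm tools are installed, the rough rocm-smi equivalents are:
# Basic status
rocm-smi
# VRAM usage
rocm-smi --showmeminfo vram
# Continuous monitoring
watch -n 1 rocm-smi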
ZFS for Models
# Create dataset
zfs create -o recordsize=1M -o compression=off tank/ai/models
# Create subdatasets
zfs create tank/ai/models/ollama
zfs create tank/ai/models/gguf
zfs create tank/ai/models/huggingface
# Snapshot
zfs snapshot tank/ai/models@backup
# Check usage
zfs list -r tank/ai/models
Tailscale
# Expose Ollama
tailscale serve --bg http://localhost:11434
# Check status
tailscale serve status
# Reset
tailscale serve reset
Coding Tools
Aider
# With Ollama
aider --model ollama/deepseek-coder-v2:16b
# With OpenAI-compatible
aider --openai-api-base http://localhost:8080/v1 --openai-api-key dummy --model openai/llama3.3
# Add files
aider src/main.py tests/test_main.py
Environment Setup
# For OpenAI-compatible tools
export OPENAI_API_BASE=http://localhost:11434/v1
export OPENAI_API_KEY=not-needed
# For Ollama-native tools
export OLLAMA_HOST=http://localhost:11434
Testing
# Test Ollama
curl http://localhost:11434/
# Test llama.cpp
curl http://localhost:8080/health
# Test chat
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "llama3.3", "messages": [{"role": "user", "content": "Hi"}]}'
# List models
curl http://localhost:11434/v1/models
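The checks above can be rolled into one quick smoke test (URLs assumed from the defaults used throughout):
# Smoke-test both servers
for url in http://localhost:11434/ http://localhost:8080/health; do
  curl -fsS -o /dev/null "$url" && echo "OK   $url" || echo "FAIL $url"
done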