Fine-Tuned Model Inference¶
Load your fine-tuned models, run generation, convert them to GGUF, and deploy them with Ollama, vLLM, or a FastAPI server.
Loading Fine-Tuned Models¶
LoRA Adapters (Separate)¶
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.2-3B",
torch_dtype=torch.float16,
device_map="auto",
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
# Load LoRA adapters
model = PeftModel.from_pretrained(base_model, "./lora-adapters")
model.eval()
Merged Model¶
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load merged model directly
model = AutoModelForCausalLM.from_pretrained(
"./merged-model",
torch_dtype=torch.float16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./merged-model")
model.eval()
With Quantization¶
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
"./merged-model",
quantization_config=bnb_config,
device_map="auto",
)
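4-bit loading depends on the bitsandbytes package (CUDA only); install it alongside transformers if it isn't already present:
pip install bitsandbytes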
Generation¶
Basic Generation¶
def generate(prompt: str, max_tokens: int = 256) -> str:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=0.7,
top_p=0.9,
do_sample=True,
pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return response[len(prompt):] # Remove prompt from output
# Example usage
prompt = """### Instruction:
Summarize the following text in one sentence.
### Input:
Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.
### Response:
"""
response = generate(prompt)
print(response)
Streaming Generation¶
from transformers import TextIteratorStreamer
from threading import Thread
def stream_generate(prompt: str, max_tokens: int = 256):
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
streamer = TextIteratorStreamer(
tokenizer,
skip_prompt=True,
skip_special_tokens=True
)
    generation_kwargs = {
        **inputs,
        "max_new_tokens": max_tokens,
        "temperature": 0.7,
        "do_sample": True,  # temperature is ignored unless sampling is enabled
        "streamer": streamer,
    }
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for text in streamer:
print(text, end="", flush=True)
thread.join()
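It is called the same way as the basic helper, printing tokens as they arrive:
stream_generate(prompt, max_tokens=128)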
Batch Generation¶
def batch_generate(prompts: list[str], max_tokens: int = 256) -> list[str]:
    # Decoder-only models need left padding for batched generation
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=0.7,
do_sample=True,
pad_token_id=tokenizer.eos_token_id,
)
responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
return [r[len(p):] for r, p in zip(responses, prompts)]
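A usage sketch (the prompts are placeholders in the same instruction format used above):
prompts = [
    "### Instruction:\nSummarize machine learning in one sentence.\n### Response:\n",
    "### Instruction:\nList three uses of machine learning.\n### Response:\n",
]
for response in batch_generate(prompts):
    print(response)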
Converting to GGUF¶
For use with llama.cpp, Ollama, or other inference engines.
Merge and Convert¶
# First merge LoRA adapters into the base model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
model = PeftModel.from_pretrained(base_model, "./lora-adapters")
# Merge and save
merged = model.merge_and_unload()
merged.save_pretrained("./merged-model")
tokenizer.save_pretrained("./merged-model")
Convert to GGUF¶
# Clone llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# Install requirements
pip install -r requirements.txt
# Convert
python convert_hf_to_gguf.py ../merged-model --outfile model-f16.gguf
# Quantize (build llama.cpp first so the llama-quantize binary exists)
./llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
Quantization Options¶
| Quant | Size | Speed | Quality |
|---|---|---|---|
| Q2_K | Smallest | Fastest | Lowest |
| Q3_K_M | Small | Fast | Low |
| Q4_K_M | Medium | Moderate | Good |
| Q5_K_M | Large | Slow | Better |
| Q6_K | Larger | Slower | High |
| Q8_0 | Largest | Slowest | Highest |
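To sanity-check a quantized file, run it directly with llama.cpp's CLI (binary names may differ slightly between builds):
./llama-cli -m model-q4_k_m.gguf -p "Your prompt here" -n 128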
Deploy with Ollama¶
Create Modelfile¶
# Modelfile
FROM ./model-q4_k_m.gguf
# Chat template (adjust for your model)
TEMPLATE """{{ if .System }}<|system|>
{{ .System }}<|end|>
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}<|end|>
{{ end }}<|assistant|>
{{ .Response }}<|end|>"""
# Generation parameters
PARAMETER stop "<|end|>"
PARAMETER stop "<|user|>"
PARAMETER temperature 0.7
PARAMETER top_p 0.9
# System prompt (optional)
SYSTEM """You are a helpful assistant specialized in..."""
Create and Run¶
# Create model
ollama create my-finetuned -f Modelfile
# Test
ollama run my-finetuned "Your prompt here"
# API access
curl http://localhost:11434/api/generate -d '{
"model": "my-finetuned",
"prompt": "Your prompt here",
"stream": false
}'
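The same API is easy to call from Python; a minimal sketch using requests (non-streaming responses return the generated text in the "response" field):
import requests
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "my-finetuned", "prompt": "Your prompt here", "stream": False},
)
print(resp.json()["response"])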
Deploy with vLLM¶
High-throughput serving:
# Install
pip install vllm
# Serve the merged model behind an OpenAI-compatible API
python -m vllm.entrypoints.openai.api_server \
--model ./merged-model \
--dtype float16 \
--max-model-len 4096
# With quantization (point --model at an AWQ-quantized checkpoint, not the fp16 merge)
python -m vllm.entrypoints.openai.api_server \
--model ./merged-model \
--quantization awq \
--dtype float16
# Client
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
response = client.chat.completions.create(
model="./merged-model",
messages=[{"role": "user", "content": "Your prompt"}],
temperature=0.7,
)
print(response.choices[0].message.content)
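The server also exposes the standard OpenAI HTTP routes, so curl works as well:
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "./merged-model",
    "messages": [{"role": "user", "content": "Your prompt"}],
    "temperature": 0.7
  }'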
FastAPI Server¶
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
app = FastAPI()
# Load model at startup
model = AutoModelForCausalLM.from_pretrained(
"./merged-model",
torch_dtype=torch.float16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./merged-model")
class GenerateRequest(BaseModel):
prompt: str
max_tokens: int = 256
temperature: float = 0.7
class GenerateResponse(BaseModel):
response: str
@app.post("/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
inputs = tokenizer(request.prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=request.max_tokens,
temperature=request.temperature,
do_sample=True,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return GenerateResponse(response=response[len(request.prompt):])
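Assuming the file above is saved as server.py (the filename is illustrative), run it with uvicorn and test the endpoint:
uvicorn server:app --host 0.0.0.0 --port 8000
curl http://localhost:8000/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Your prompt here", "max_tokens": 128}'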
Performance Optimization¶
Batch Requests¶
# Accumulate requests and process them as a batch
import asyncio
request_queue: asyncio.Queue = asyncio.Queue()
BATCH_SIZE = 8
BATCH_TIMEOUT = 0.1  # seconds
async def batch_processor():
while True:
batch = []
while len(batch) < BATCH_SIZE:
try:
item = await asyncio.wait_for(
request_queue.get(),
timeout=BATCH_TIMEOUT
)
batch.append(item)
except asyncio.TimeoutError:
break
        if batch:
            prompts = [item["prompt"] for item in batch]
            # Run the blocking generation off the event loop
            responses = await asyncio.to_thread(batch_generate, prompts)
            for item, response in zip(batch, responses):
                item["future"].set_result(response)
KV Cache¶
# Enable KV cache for faster generation
outputs = model.generate(
**inputs,
use_cache=True, # Default, but explicit
...
)
Flash Attention¶
# Install flash-attn (requires a CUDA build environment)
pip install flash-attn --no-build-isolation
# Load model with Flash Attention
model = AutoModelForCausalLM.from_pretrained(
"./merged-model",
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
device_map="auto",
)
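If flash-attn isn't installed on the target machine, a simple fallback keeps the loading code portable (a sketch; "sdpa" is the standard scaled-dot-product attention implementation name in recent transformers):
import importlib.util
import torch
from transformers import AutoModelForCausalLM
# Prefer Flash Attention 2 when the package is importable, otherwise fall back to SDPA
attn = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = AutoModelForCausalLM.from_pretrained(
    "./merged-model",
    torch_dtype=torch.float16,
    attn_implementation=attn,
    device_map="auto",
)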