ChromaDB¶
ChromaDB is an open-source, local-first vector database designed for simplicity and ease of use.
Installation¶
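ChromaDB is distributed on PyPI:
pip install chromadb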
Basic Usage¶
In-Memory (Development)¶
import chromadb
# Ephemeral client (data lost on restart)
client = chromadb.Client()
# Create collection
collection = client.create_collection(name="my_collection")
# Add documents
collection.add(
documents=["Document 1 content", "Document 2 content"],
ids=["id1", "id2"]
)
# Query
results = collection.query(
query_texts=["search query"],
n_results=2
)
Persistent Storage¶
import chromadb
from chromadb.config import Settings
# Persistent client
client = chromadb.PersistentClient(
path="./chroma_db",
settings=Settings(anonymized_telemetry=False)
)
# Get or create collection
collection = client.get_or_create_collection(name="documents")
Client-Server Mode¶
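For shared or multi-process deployments, run Chroma as a standalone server and connect over HTTP. A minimal sketch, assuming a server is already listening on the default port 8000 (for example, one started with chroma run --path ./chroma_db, or the Docker setup shown below):
import chromadb
# Connect to a running Chroma server
client = chromadb.HttpClient(host="localhost", port=8000)
collection = client.get_or_create_collection(name="documents")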
Embedding Functions¶
Default (Sentence Transformers)¶
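When no embedding_function is given, Chroma falls back to the Sentence Transformers model all-MiniLM-L6-v2. The same function can be constructed explicitly; a sketch (the collection name is just an example):
from chromadb.utils import embedding_functions
# Explicit equivalent of Chroma's built-in default
default_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
collection = client.create_collection(
    name="default_collection",
    embedding_function=default_ef
)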
Ollama Embeddings¶
from chromadb.utils import embedding_functions
ollama_ef = embedding_functions.OllamaEmbeddingFunction(
model_name="nomic-embed-text",
url="http://localhost:11434/api/embeddings"
)
collection = client.create_collection(
name="ollama_collection",
embedding_function=ollama_ef
)
Custom Embedding Function¶
from chromadb import Documents, EmbeddingFunction, Embeddings
import requests

class CustomEmbedding(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        # Chroma passes a batch of documents; embed each one via the API
        embeddings = []
        for text in input:
            response = requests.post(
                "http://localhost:11434/api/embeddings",
                json={"model": "nomic-embed-text", "prompt": text}
            )
            embeddings.append(response.json()["embedding"])
        return embeddings

collection = client.create_collection(
    name="custom",
    embedding_function=CustomEmbedding()
)
Document Operations¶
Adding Documents¶
# With auto-generated embeddings
collection.add(
documents=["Doc 1", "Doc 2", "Doc 3"],
ids=["1", "2", "3"],
metadatas=[
{"source": "file1.pdf", "page": 1},
{"source": "file2.pdf", "page": 1},
{"source": "file2.pdf", "page": 2}
]
)
# With pre-computed embeddings
collection.add(
embeddings=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
documents=["Doc 1", "Doc 2"],
ids=["1", "2"]
)
Updating Documents¶
collection.update(
ids=["1"],
documents=["Updated content"],
metadatas=[{"source": "file1.pdf", "page": 1, "updated": True}]
)
Upserting Documents¶
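upsert() combines add and update: existing IDs are overwritten and new IDs are inserted in a single call. A brief sketch using the IDs from the examples above:
# "1" already exists and is overwritten; "4" is new and gets inserted
collection.upsert(
    ids=["1", "4"],
    documents=["Updated Doc 1", "New Doc 4"],
    metadatas=[
        {"source": "file1.pdf", "page": 1},
        {"source": "file4.pdf", "page": 1}
    ]
)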
Deleting Documents¶
# By ID
collection.delete(ids=["1", "2"])
# By filter
collection.delete(where={"source": "file1.pdf"})
Querying¶
Basic Query¶
results = collection.query(
query_texts=["What is machine learning?"],
n_results=5
)
# Results structure
print(results["ids"]) # [["id1", "id2", ...]]
print(results["documents"]) # [["doc1", "doc2", ...]]
print(results["distances"]) # [[0.1, 0.2, ...]]
print(results["metadatas"]) # [[{...}, {...}, ...]]
With Pre-computed Embeddings¶
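If you already have query vectors, pass query_embeddings instead of query_texts; the vectors must have the same dimensionality as the collection's embeddings:
results = collection.query(
    query_embeddings=[[0.1, 0.2, ...]],
    n_results=5
)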
Metadata Filtering¶
# Equality
results = collection.query(
query_texts=["query"],
where={"source": "manual.pdf"}
)
# Comparison operators
results = collection.query(
query_texts=["query"],
where={"page": {"$gt": 10}} # page > 10
)
# Logical operators
results = collection.query(
query_texts=["query"],
where={
"$and": [
{"source": "manual.pdf"},
{"page": {"$lte": 50}}
]
}
)
# In list
results = collection.query(
query_texts=["query"],
where={"category": {"$in": ["tech", "science"]}}
)
Document Content Filtering¶
results = collection.query(
query_texts=["query"],
where_document={"$contains": "specific phrase"}
)
Include/Exclude Fields¶
results = collection.query(
query_texts=["query"],
include=["documents", "metadatas", "distances"] # exclude embeddings
)
Collection Management¶
List Collections¶
collections = client.list_collections()
for col in collections:
    print(f"{col.name}: {col.count()} documents")
# Note: in chromadb >= 0.6, list_collections() returns collection
# names (strings) rather than Collection objects
Get Collection Info¶
collection = client.get_collection(name="documents")
print(f"Count: {collection.count()}")
print(f"Metadata: {collection.metadata}")
Delete Collection¶
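Deleting a collection permanently removes it along with all of its documents and embeddings:
client.delete_collection(name="documents")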
Collection Settings¶
collection = client.create_collection(
name="custom_settings",
metadata={
"hnsw:space": "cosine", # or "l2", "ip"
"hnsw:construction_ef": 100,
"hnsw:search_ef": 50,
"hnsw:M": 16
}
)
LangChain Integration¶
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Create from documents (docs: a list of LangChain Document objects)
vectorstore = Chroma.from_documents(
documents=docs,
embedding=embeddings,
persist_directory="./chroma_db",
collection_name="langchain_docs"
)
# Load existing
vectorstore = Chroma(
persist_directory="./chroma_db",
embedding_function=embeddings,
collection_name="langchain_docs"
)
# Search
results = vectorstore.similarity_search("query", k=4)
# As retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
Docker Deployment¶
# docker-compose.yml
services:
  chroma:
    image: chromadb/chroma:latest
    container_name: chroma
    ports:
      - "8000:8000"
    volumes:
      - chroma_data:/chroma/chroma
    environment:
      - ANONYMIZED_TELEMETRY=False
      - ALLOW_RESET=False

volumes:
  chroma_data:
Performance Tips¶
Batch Operations¶
# Add in batches for large datasets
batch_size = 1000
for i in range(0, len(documents), batch_size):
    batch_docs = documents[i:i + batch_size]
    batch_ids = ids[i:i + batch_size]
    collection.add(documents=batch_docs, ids=batch_ids)
Index Tuning¶
# Higher M = better recall, more memory
# Higher ef = better recall, slower search
collection = client.create_collection(
name="tuned",
metadata={
"hnsw:M": 32, # Default: 16
"hnsw:construction_ef": 200, # Default: 100
"hnsw:search_ef": 100 # Default: 10
}
)
Memory Management¶
# For large collections, use persistent client
client = chromadb.PersistentClient(path="./db")
# Reset connection periodically for long-running processes
client.reset()
Limitations¶
- Single-node only (no distributed mode)
- No GPU acceleration
- Efficient only up to ~10M vectors
- Basic filtering compared to Qdrant/Milvus
See Also¶
- Vector Databases Overview
- Qdrant - For production workloads
- RAG Implementation