knowledge-base/src/knowledge_rag/vector_store.py
Ernie Cook 11c3f705ce Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB
- Add Python project with uv package manager
- Implement LlamaIndex + ChromaDB RAG pipeline
- Add sentence-transformers for local embeddings (all-MiniLM-L6-v2)
- Create MCP server with semantic search, indexing, and stats tools
- Add Markdown chunker with heading/wikilink/frontmatter support
- Add Dockerfile and docker-compose.yaml for self-hosted deployment
- Include sample Obsidian vault files for testing
- Add .gitignore and .env.example
2026-03-03 20:42:42 -05:00


"""ChromaDB vector store wrapper for knowledge base."""
import os
from typing import TYPE_CHECKING, Any, List, Optional
from llama_index.core.schema import TextNode
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
if TYPE_CHECKING:
from llama_index.core.embeddings import BaseEmbedding
class KnowledgeVectorStore:
"""ChromaDB vector store for the knowledge base.
Handles persistence of embeddings and semantic search.
"""
    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        self._collection_name = collection_name
        self._embedding_model = embedding_model

        # Use the Docker volume path if available, otherwise a local data dir
        if persist_dir is None:
            if os.path.exists("/data"):
                persist_dir = "/data/chroma_db"
            else:
                persist_dir = "./data/chroma_db"
        self._persist_dir = persist_dir

        # Ensure the persist directory exists
        os.makedirs(persist_dir, exist_ok=True)

        # Initialize the ChromaDB client
        self._client = chromadb.PersistentClient(path=persist_dir)

        # Get or create the collection
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "Knowledge base embeddings"},
        )

        # Wrap in a LlamaIndex vector store; pass the chroma_collection
        # directly when using a PersistentClient
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )
    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model

    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store
    def add_nodes(
        self,
        nodes: List[TextNode],
        embedding_model: "BaseEmbedding | None" = None,
    ) -> None:
        """Add nodes to the vector store."""
        # Use the provided embedding model or fall back to the stored one
        model = embedding_model or self._embedding_model
        if model is None:
            raise ValueError("No embedding model provided")

        # Embed the nodes first, then add them to the vector store
        for node in nodes:
            node.embedding = model.get_text_embedding(node.text)
        self._vector_store.add(nodes)
    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.

        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional metadata filters as exact-match key/value pairs

        Returns:
            List of search results with text, score, and metadata
        """
        from llama_index.core import VectorStoreIndex
        from llama_index.core.vector_stores import MetadataFilters

        # Build an index view over the existing vector store, embedding
        # queries with the configured model
        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=self._embedding_model,
        )

        # Convert the plain key/value dict into the MetadataFilters object
        # the retriever expects
        filters = MetadataFilters.from_dict(filter) if filter else None

        retriever = index.as_retriever(
            similarity_top_k=top_k,
            filters=filters,
        )
        results = retriever.retrieve(query)
        return [
            {
                "text": node.text,
                "score": node.score,
                "metadata": node.metadata,
            }
            for node in results
        ]
    def clear(self) -> None:
        """Clear all embeddings from the store."""
        self._client.delete_collection(self._collection_name)
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata={"description": "Knowledge base embeddings"},
        )
        # Re-wrap the recreated collection so the LlamaIndex store does not
        # keep pointing at the deleted one
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )
    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }
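
Below is a minimal usage sketch for this class, not part of the file above. It assumes the HuggingFaceEmbedding wrapper from llama-index-embeddings-huggingface with the all-MiniLM-L6-v2 model named in the commit message, and an import path inferred from the src/knowledge_rag layout; the persist path and sample note are illustrative.

from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from knowledge_rag.vector_store import KnowledgeVectorStore

# Hypothetical usage; model name taken from the commit message above.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = KnowledgeVectorStore(persist_dir="./data/chroma_db", embedding_model=embed_model)

# Index one sample chunk, then run a semantic query against it.
store.add_nodes([
    TextNode(
        text="Obsidian notes can link to each other with [[wikilinks]].",
        metadata={"source": "sample-note.md"},
    ),
])

for hit in store.search("how do notes link to each other?", top_k=3):
    print(f"{hit['score']:.3f}  {hit['text'][:60]}  {hit['metadata']}")

print(store.get_stats())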