"""ChromaDB vector store wrapper for knowledge base.""" import os from typing import TYPE_CHECKING, Any, List, Optional from llama_index.core.schema import TextNode from llama_index.vector_stores.chroma import ChromaVectorStore import chromadb if TYPE_CHECKING: from llama_index.core.embeddings import BaseEmbedding class KnowledgeVectorStore: """ChromaDB vector store for the knowledge base. Handles persistence of embeddings and semantic search. """ def __init__( self, persist_dir: str | None = None, collection_name: str = "knowledge_base", embedding_model: "BaseEmbedding | None" = None, ): self._collection_name = collection_name self._embedding_model = embedding_model # Use Docker path if available, otherwise use local data dir if persist_dir is None: if os.path.exists("/data"): persist_dir = "/data/chroma_db" else: persist_dir = "./data/chroma_db" self._persist_dir = persist_dir # Ensure persist directory exists os.makedirs(persist_dir, exist_ok=True) # Initialize ChromaDB client self._client = chromadb.PersistentClient(path=persist_dir) # Get or create collection self._collection = self._client.get_or_create_collection( name=collection_name, metadata={"description": "Knowledge base embeddings"} ) # Wrap in LlamaIndex vector store # Pass the chroma_collection directly for PersistentClient self._vector_store = ChromaVectorStore( chroma_collection=self._collection, ) def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None: """Set the embedding model for query embedding.""" self._embedding_model = embedding_model @property def vector_store(self) -> ChromaVectorStore: """Get the LlamaIndex ChromaVectorStore.""" return self._vector_store def add_nodes(self, nodes: List[TextNode], embedding_model: "BaseEmbedding | None" = None) -> None: """Add nodes to the vector store.""" from llama_index.core import VectorStoreIndex, StorageContext # Use provided embedding model or the stored one model = embedding_model or self._embedding_model if model is None: raise ValueError("No embedding model provided") # First embed the nodes for node in nodes: node.embedding = model.get_text_embedding(node.text) # Then add to vector store self._vector_store.add(nodes) def search( self, query: str, top_k: int = 5, filter: Optional[dict[str, Any]] = None, ) -> List[dict[str, Any]]: """Semantic search for similar chunks. Args: query: The search query top_k: Number of results to return filter: Optional metadata filters Returns: List of search results with text and metadata """ from llama_index.core import VectorStoreIndex # Use embedding model if provided, otherwise use the one from storage embed_model = self._embedding_model index = VectorStoreIndex.from_vector_store( self._vector_store, embed_model=embed_model, ) query_engine = index.as_retriever( similarity_top_k=top_k, filters=filter, ) results = query_engine.retrieve(query) return [ { "text": node.text, "score": node.score, "metadata": node.metadata, } for node in results ] def clear(self) -> None: """Clear all embeddings from the store.""" self._client.delete_collection(self._collection_name) self._collection = self._client.get_or_create_collection( name=self._collection_name, metadata={"description": "Knowledge base embeddings"} ) def get_stats(self) -> dict[str, Any]: """Get vector store statistics.""" return { "total_chunks": self._collection.count(), "collection_name": self._collection_name, }