Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB
- Add Python project with uv package manager
- Implement LlamaIndex + ChromaDB RAG pipeline
- Add sentence-transformers for local embeddings (all-MiniLM-L6-v2)
- Create MCP server with semantic search, indexing, and stats tools
- Add Markdown chunker with heading/wikilink/frontmatter support
- Add Dockerfile and docker-compose.yaml for self-hosted deployment
- Include sample Obsidian vault files for testing
- Add .gitignore and .env.example
src/knowledge_rag/vector_store.py (new file, 137 lines)
@@ -0,0 +1,137 @@
"""ChromaDB vector store wrapper for knowledge base."""

import os
from typing import TYPE_CHECKING, Any, List, Optional

from llama_index.core.schema import TextNode
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

if TYPE_CHECKING:
    from llama_index.core.embeddings import BaseEmbedding


class KnowledgeVectorStore:
    """ChromaDB vector store for the knowledge base.

    Handles persistence of embeddings and semantic search.
    """

    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        self._collection_name = collection_name
        self._embedding_model = embedding_model

        # Use Docker path if available, otherwise use local data dir
        if persist_dir is None:
            if os.path.exists("/data"):
                persist_dir = "/data/chroma_db"
            else:
                persist_dir = "./data/chroma_db"

        self._persist_dir = persist_dir

        # Ensure persist directory exists
        os.makedirs(persist_dir, exist_ok=True)

        # Initialize ChromaDB client
        self._client = chromadb.PersistentClient(path=persist_dir)

        # Get or create collection
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )

        # Wrap in LlamaIndex vector store
        # Pass the chroma_collection directly for PersistentClient
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )

    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model

    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store

    def add_nodes(self, nodes: List[TextNode], embedding_model: "BaseEmbedding | None" = None) -> None:
        """Add nodes to the vector store."""
        # Use provided embedding model or the stored one
        model = embedding_model or self._embedding_model

        if model is None:
            raise ValueError("No embedding model provided")

        # First embed the nodes
        for node in nodes:
            node.embedding = model.get_text_embedding(node.text)

        # Then add to vector store
        self._vector_store.add(nodes)

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.

        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional metadata filters as key/value pairs

        Returns:
            List of search results with text and metadata
        """
        from llama_index.core import VectorStoreIndex
        from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

        # Use the embedding model configured on this store to embed the query
        embed_model = self._embedding_model

        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=embed_model,
        )

        # Convert the plain metadata dict into MetadataFilters, which is what
        # the LlamaIndex retriever expects for its `filters` argument
        metadata_filters = None
        if filter:
            metadata_filters = MetadataFilters(
                filters=[ExactMatchFilter(key=key, value=value) for key, value in filter.items()]
            )

        retriever = index.as_retriever(
            similarity_top_k=top_k,
            filters=metadata_filters,
        )

        results = retriever.retrieve(query)

        return [
            {
                "text": node.text,
                "score": node.score,
                "metadata": node.metadata,
            }
            for node in results
        ]

    def clear(self) -> None:
        """Clear all embeddings from the store."""
        self._client.delete_collection(self._collection_name)
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )
        # Re-wrap the fresh collection so the LlamaIndex store does not keep
        # pointing at the deleted one
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }
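A minimal usage sketch (not part of this commit), assuming the llama-index-embeddings-huggingface wrapper around the all-MiniLM-L6-v2 model named in the commit message and a `knowledge_rag` package rooted at src/; the node text and metadata are hypothetical:

# Usage sketch: wire the local sentence-transformers model into the store
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from knowledge_rag.vector_store import KnowledgeVectorStore  # assumed import path

# Local embedding model, as described in the commit message
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

store = KnowledgeVectorStore(embedding_model=embed_model)

# Index a hypothetical chunk such as the Markdown chunker might produce
store.add_nodes([
    TextNode(
        text="Obsidian is a Markdown-based knowledge base.",
        metadata={"source": "notes/obsidian.md"},
    )
])

# Semantic search over the indexed chunks
for hit in store.search("What is Obsidian?", top_k=3):
    print(hit["score"], hit["metadata"].get("source"), hit["text"][:60])

print(store.get_stats())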