Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB
- Add Python project with uv package manager
- Implement LlamaIndex + ChromaDB RAG pipeline
- Add sentence-transformers for local embeddings (all-MiniLM-L6-v2)
- Create MCP server with semantic search, indexing, and stats tools
- Add Markdown chunker with heading/wikilink/frontmatter support
- Add Dockerfile and docker-compose.yaml for self-hosted deployment
- Include sample Obsidian vault files for testing
- Add .gitignore and .env.example
src/knowledge_rag/vector_store.py (new file, 137 lines)
@@ -0,0 +1,137 @@
"""ChromaDB vector store wrapper for knowledge base."""

import os
from typing import TYPE_CHECKING, Any, List, Optional

from llama_index.core.schema import TextNode
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

if TYPE_CHECKING:
    from llama_index.core.embeddings import BaseEmbedding


class KnowledgeVectorStore:
    """ChromaDB vector store for the knowledge base.

    Handles persistence of embeddings and semantic search.
    """

    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        self._collection_name = collection_name
        self._embedding_model = embedding_model

        # Use Docker path if available, otherwise use local data dir
        if persist_dir is None:
            if os.path.exists("/data"):
                persist_dir = "/data/chroma_db"
            else:
                persist_dir = "./data/chroma_db"

        self._persist_dir = persist_dir

        # Ensure persist directory exists
        os.makedirs(persist_dir, exist_ok=True)

        # Initialize ChromaDB client
        self._client = chromadb.PersistentClient(path=persist_dir)

        # Get or create collection
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )

        # Wrap in LlamaIndex vector store
        # Pass the chroma_collection directly for PersistentClient
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )

    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model

    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store

    def add_nodes(self, nodes: List[TextNode], embedding_model: "BaseEmbedding | None" = None) -> None:
        """Add nodes to the vector store."""
        # Use provided embedding model or the stored one
        model = embedding_model or self._embedding_model

        if model is None:
            raise ValueError("No embedding model provided")

        # First embed the nodes
        for node in nodes:
            node.embedding = model.get_text_embedding(node.text)

        # Then add to vector store
        self._vector_store.add(nodes)

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.

        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional metadata filters as key/value pairs

        Returns:
            List of search results with text and metadata
        """
        from llama_index.core import VectorStoreIndex
        from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

        # Use the embedding model configured on this store to embed the query
        embed_model = self._embedding_model

        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=embed_model,
        )

        # Convert the plain metadata dict into MetadataFilters, which is what
        # the LlamaIndex retriever expects for its `filters` argument
        metadata_filters = None
        if filter:
            metadata_filters = MetadataFilters(
                filters=[ExactMatchFilter(key=key, value=value) for key, value in filter.items()]
            )

        retriever = index.as_retriever(
            similarity_top_k=top_k,
            filters=metadata_filters,
        )

        results = retriever.retrieve(query)

        return [
            {
                "text": node.text,
                "score": node.score,
                "metadata": node.metadata,
            }
            for node in results
        ]

    def clear(self) -> None:
        """Clear all embeddings from the store."""
        self._client.delete_collection(self._collection_name)
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )
        # Re-wrap the fresh collection so the LlamaIndex store does not keep
        # pointing at the deleted one
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }
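A minimal usage sketch (not part of this commit), assuming the llama-index-embeddings-huggingface wrapper around the all-MiniLM-L6-v2 model named in the commit message and a `knowledge_rag` package rooted at src/; the node text and metadata are hypothetical:

# Usage sketch: wire the local sentence-transformers model into the store
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from knowledge_rag.vector_store import KnowledgeVectorStore  # assumed import path

# Local embedding model, as described in the commit message
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

store = KnowledgeVectorStore(embedding_model=embed_model)

# Index a hypothetical chunk such as the Markdown chunker might produce
store.add_nodes([
    TextNode(
        text="Obsidian is a Markdown-based knowledge base.",
        metadata={"source": "notes/obsidian.md"},
    )
])

# Semantic search over the indexed chunks
for hit in store.search("What is Obsidian?", top_k=3):
    print(hit["score"], hit["metadata"].get("source"), hit["text"][:60])

print(store.get_stats())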