- Add Python project with uv package manager - Implement LlamaIndex + ChromaDB RAG pipeline - Add sentence-transformers for local embeddings (all-MiniLM-L6-v2) - Create MCP server with semantic search, indexing, and stats tools - Add Markdown chunker with heading/wikilink/frontmatter support - Add Dockerfile and docker-compose.yaml for self-hosted deployment - Include sample Obsidian vault files for testing - Add .gitignore and .env.example
138 lines
4.2 KiB
Python
"""ChromaDB vector store wrapper for knowledge base."""
|
|
|
|
import os
|
|
from typing import TYPE_CHECKING, Any, List, Optional
|
|
|
|
from llama_index.core.schema import TextNode
|
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
import chromadb
|
|
|
|
if TYPE_CHECKING:
|
|
from llama_index.core.embeddings import BaseEmbedding
|
|
|
|
|
|
class KnowledgeVectorStore:
    """ChromaDB vector store for the knowledge base.

    Handles persistence of embeddings and semantic search.
    """

    # Metadata applied whenever the collection is (re)created, kept in one
    # place so __init__ and clear() cannot drift apart.
    _COLLECTION_METADATA = {"description": "Knowledge base embeddings"}

    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        """Initialize the store and open (or create) the collection.

        Args:
            persist_dir: Directory for ChromaDB persistence. Defaults to
                ``/data/chroma_db`` when ``/data`` exists (Docker),
                otherwise ``./data/chroma_db``.
            collection_name: Name of the ChromaDB collection.
            embedding_model: Optional embedding model used for indexing and
                query embedding; may also be set later via
                ``set_embedding_model``.
        """
        self._collection_name = collection_name
        self._embedding_model = embedding_model

        # Use Docker path if available, otherwise use local data dir
        if persist_dir is None:
            persist_dir = (
                "/data/chroma_db" if os.path.exists("/data") else "./data/chroma_db"
            )
        self._persist_dir = persist_dir

        # Ensure persist directory exists
        os.makedirs(persist_dir, exist_ok=True)

        # Initialize ChromaDB client, then build the collection and the
        # LlamaIndex wrapper around it.
        self._client = chromadb.PersistentClient(path=persist_dir)
        self._init_collection()

    def _init_collection(self) -> None:
        """(Re)create the collection and re-wrap it for LlamaIndex.

        Rebuilding the ChromaVectorStore here is essential: the wrapper
        holds a direct reference to the collection object, so after the
        collection is deleted (see ``clear``) the old wrapper is stale.
        """
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata=dict(self._COLLECTION_METADATA),
        )
        # Pass the chroma_collection directly for PersistentClient
        self._vector_store = ChromaVectorStore(chroma_collection=self._collection)

    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model

    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store

    def add_nodes(
        self,
        nodes: List[TextNode],
        embedding_model: "BaseEmbedding | None" = None,
    ) -> None:
        """Embed and add nodes to the vector store.

        Args:
            nodes: Text nodes to embed and persist.
            embedding_model: Overrides the instance's stored model if given.

        Raises:
            ValueError: If no embedding model is available.
        """
        # Use provided embedding model or the stored one
        model = embedding_model or self._embedding_model
        if model is None:
            raise ValueError("No embedding model provided")

        # First embed the nodes, then add them to the vector store
        for node in nodes:
            node.embedding = model.get_text_embedding(node.text)
        self._vector_store.add(nodes)

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.

        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional metadata filters, passed through to the
                retriever's ``filters`` argument.
                NOTE(review): LlamaIndex retrievers expect a
                ``MetadataFilters`` object, not a plain dict — confirm what
                callers actually pass here.

        Returns:
            List of search results with text and metadata
        """
        from llama_index.core import VectorStoreIndex

        # Build a lightweight index view over the existing vector store;
        # uses the instance's embedding model to embed the query.
        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=self._embedding_model,
        )
        retriever = index.as_retriever(
            similarity_top_k=top_k,
            filters=filter,
        )
        return [
            {
                "text": node.text,
                "score": node.score,
                "metadata": node.metadata,
            }
            for node in retriever.retrieve(query)
        ]

    def clear(self) -> None:
        """Clear all embeddings from the store.

        Fix: previously only ``self._collection`` was recreated, leaving
        ``self._vector_store`` wrapping the deleted collection; any
        add/search after ``clear()`` then operated on a stale handle.
        ``_init_collection`` rebuilds both.
        """
        self._client.delete_collection(self._collection_name)
        self._init_collection()

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }