Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB

- Add Python project with uv package manager - Implement LlamaIndex + ChromaDB RAG pipeline - Add sentence-transformers for local embeddings (all-MiniLM-L6-v2) - Create MCP server with semantic search, indexing, and stats tools - Add Markdown chunker with heading/wikilink/frontmatter support - Add Dockerfile and docker-compose.yaml for self-hosted deployment - Include sample Obsidian vault files for testing - Add .gitignore and .env.example
2026-03-03 20:42:42 -05:00
parent 94dd158d1c
commit 11c3f705ce
11 changed files with 5319 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,15 @@
 # Knowledge RAG Configuration
 # Path to your Obsidian vault (must contain markdown files)
 # This should be an absolute path or relative to where you run docker-compose
 VAULT_PATH=./knowledge
 # Embedding model to use
 # Default: all-MiniLM-L6-v2 (fast, good quality, ~90MB)
 # Other options:
 #   - all-mpnet-base-v2 (higher quality, slower, ~420MB)
 #   - BAAI/bge-small-en-v1.5 (good quality, ~130MB)
 EMBEDDING_MODEL=all-MiniLM-L6-v2
 # Optional: Log level (DEBUG, INFO, WARNING, ERROR)
 LOG_LEVEL=INFO
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,47 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 # Virtual environments
 venv/
 .venv/
 env/
 .env/
 # IDEs
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # Tool caches (ruff, mypy, pytest)
 .ruff_cache/
 .mypy_cache/
 .pytest_cache/
 # Data directories (should be mounted externally)
 data/
 knowledge/
 # Environment
 .env
 .env.local
--- a/33
+++ b/33
@ -0,0 +1,33 @@
 FROM python:3.11-slim
 # Install system dependencies for sentence-transformers
 RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /app
 # Install uv
 RUN pip install --no-cache-dir uv
 # Copy the dependency manifests; `uv sync --frozen` requires uv.lock next to pyproject.toml
 COPY pyproject.toml uv.lock ./
 # Install third-party dependencies only. The project itself cannot be built at
 # this point because src/ (and the README referenced by pyproject.toml) are not in the image.
 RUN uv sync --frozen --no-dev --no-install-project
 # Copy source code
 COPY src/ ./src/
 # Create data directories
 RUN mkdir -p /data/vault /data/chroma_db /data/embeddings_cache
 # Set environment variables
 # - PATH puts the uv-managed virtualenv first so `python` sees the synced dependencies
 # - PYTHONPATH makes the src-layout package importable without installing it
 ENV PYTHONUNBUFFERED=1 \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH=/app/src \
    VAULT_PATH=/data/vault \
    EMBEDDINGS_CACHE_DIR=/data/embeddings_cache
 # Default command runs the MCP server
 CMD ["python", "-m", "knowledge_rag.server"]
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -0,0 +1,32 @@
 version: "3.8"
 services:
  knowledge-rag:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: knowledge-rag
    volumes:
      # Mount your obsidian vault here
      - ${VAULT_PATH:-./knowledge}:/data/vault
      # Persist ChromaDB vector store
      - ./data/chroma_db:/data/chroma_db
      # Persist embeddings cache
      - ./data/embeddings_cache:/data/embeddings_cache
    environment:
      - VAULT_PATH=/data/vault
      - EMBEDDING_MODEL=${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
      - EMBEDDINGS_CACHE_DIR=/data/embeddings_cache
    restart: unless-stopped
  # Optional: Watchtower for auto-updates
  # watchtower:
  #   image: containrrr/watchtower
  #   container_name: watchtower
  #   volumes:
  #     - /var/run/docker.sock:/var/run/docker.sock
  #   environment:
  #     - WATCHTOWER_CLEANUP=true
  #     - WATCHTOWER_INCLUDE_STOPPED=true
  #   command: --interval 3600 knowledge-rag
  #   restart: unless-stopped
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,36 @@
 [project]
 name = "knowledge-rag"
 version = "0.1.0"
 description = "RAG system for Obsidian vault knowledge base with MCP server"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
    "llama-index>=0.10.0",
    "llama-index-vector-stores-chroma>=0.1.0",
    "chromadb>=0.4.0",
    "sentence-transformers>=2.2.0",
    "mcp>=1.0.0",
    "python-dotenv>=1.0.0",
    "pydantic>=2.0.0",
    "watchdog>=3.0.0",
    "httpx>=0.25.0",
 ]
 [project.optional-dependencies]
 dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "ruff>=0.1.0",
 ]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.ruff]
 line-length = 100
 target-version = "py311"
 [tool.ruff.lint]
 select = ["E", "F", "I", "N", "W"]
 ignore = ["E501"]
--- a/src/knowledge_rag/init.py
+++ b/src/knowledge_rag/init.py
@ -0,0 +1,3 @@
 """Knowledge RAG - RAG system for Obsidian vault knowledge base."""
 # Package version; currently the same value as `version` in pyproject.toml.
 __version__ = "0.1.0"
--- a/src/knowledge_rag/chunker.py
+++ b/src/knowledge_rag/chunker.py
@ -0,0 +1,181 @@
 """Markdown-aware document chunking for Obsidian vault."""
 import os
 import re
 from pathlib import Path
 from typing import List, Optional
 from llama_index.core.schema import TextNode
 class MarkdownChunker:
    """Heading-based markdown chunker for Obsidian vaults.
    Every file is split into one chunk per heading section. Each chunk carries:
    - the source path and file name
    - the text of the heading that opened the section
    - the file's wiki links, joined into a comma-separated string
    - a flag telling whether the file had frontmatter
    Fenced code blocks are kept inside their section even when they contain
    lines that look like headings (for example ``# comment`` in Python code).
    """
    # Default chunk settings
    DEFAULT_CHUNK_SIZE = 512
    DEFAULT_CHUNK_OVERLAP = 50
    # Compiled once; applied per line / per file below.
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
    _WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]]")
    _FENCE_MARKERS = ("```", "~~~")
    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ):
        """Store the chunk settings.
        Args:
            chunk_size: Requested chunk size.
            chunk_overlap: Requested overlap between chunks.
        """
        # NOTE: both values are only stored; the heading-based splitter in this
        # class does not read them.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
    def chunk_file(self, file_path: str, content: str) -> "List[TextNode]":
        """Chunk a single markdown file.
        Args:
            file_path: Path to the markdown file
            content: Raw markdown content
        Returns:
            List of TextNode chunks with metadata, one per non-empty section
        """
        frontmatter, body = self._extract_frontmatter(content)
        # Links are collected once per file and attached to every chunk.
        # They are stored as a string for ChromaDB compatibility.
        wiki_links = ",".join(self._extract_wiki_links(body))
        # Path relative to the current working directory
        rel_path = os.path.relpath(file_path)
        file_name = os.path.basename(file_path)
        chunks = []
        for i, section in enumerate(self._split_by_headings(body)):
            if not section["content"].strip():
                continue
            chunks.append(
                TextNode(
                    text=section["content"],
                    metadata={
                        "source": rel_path,
                        "file_name": file_name,
                        "heading": section.get("heading", ""),
                        "section_index": i,
                        "wiki_links": wiki_links,
                        "has_frontmatter": frontmatter is not None,
                    },
                    excluded_embed_metadata_keys=["wiki_links"],
                    excluded_search_metadata_keys=["wiki_links"],
                )
            )
        return chunks
    def chunk_directory(self, dir_path: str) -> "List[TextNode]":
        """Chunk all markdown files in a directory recursively.
        Files that cannot be read or chunked are logged and skipped.
        Args:
            dir_path: Root directory containing markdown files
        Returns:
            List of all TextNode chunks, in sorted file order
        Raises:
            FileNotFoundError: If ``dir_path`` does not exist
        """
        # Imported here because the module's import header is outside this block.
        import logging
        root = Path(dir_path)
        if not root.exists():
            raise FileNotFoundError(f"Directory not found: {root}")
        # Report through logging rather than print(): this package's MCP server
        # speaks its protocol over stdio, so stray stdout output must be avoided.
        log = logging.getLogger(__name__)
        all_chunks = []
        # sorted() makes the chunk order independent of filesystem listing order
        for md_file in sorted(root.rglob("*.md")):
            try:
                content = md_file.read_text(encoding="utf-8")
                all_chunks.extend(self.chunk_file(str(md_file), content))
            except Exception as e:
                # Best effort: one bad file must not abort the whole vault
                log.warning("Error chunking %s: %s", md_file, e)
        return all_chunks
    def _extract_frontmatter(
        self, content: str
    ) -> tuple[Optional[dict], str]:
        """Split leading frontmatter from the markdown body.
        Only flat ``key: value`` lines are parsed; values stay strings.
        Args:
            content: Raw markdown content
        Returns:
            ``(frontmatter, body)``. ``frontmatter`` is None and ``body`` is the
            unchanged content when the first line is not exactly ``---`` or
            when no closing ``---`` line exists.
        """
        lines = content.split("\n")
        # Require a line that is exactly '---' so '----' rules are not mistaken
        # for frontmatter; tolerate a leading byte-order mark.
        if lines[0].lstrip("\ufeff").strip() != "---":
            return None, content
        closing = next(
            (i for i in range(1, len(lines)) if lines[i].strip() == "---"),
            None,
        )
        if closing is None:
            # Unterminated block: treat the whole file as body instead of
            # swallowing it as frontmatter.
            return None, content
        frontmatter = {}
        for line in lines[1:closing]:
            key, sep, value = line.partition(":")
            if sep:
                frontmatter[key.strip()] = value.strip()
        return frontmatter, "\n".join(lines[closing + 1:])
    def _extract_wiki_links(self, content: str) -> List[str]:
        """Return the targets of ``[[target]]`` / ``[[target|alias]]`` links, in order."""
        return self._WIKI_LINK_RE.findall(content)
    def _split_by_headings(self, content: str) -> List[dict]:
        """Split content into sections that each start at a markdown heading.
        Args:
            content: Markdown body without frontmatter
        Returns:
            List of ``{"heading": str, "content": str}`` dicts. Text before the
            first heading gets an empty heading. Whitespace-only sections are
            dropped; if nothing remains, a single section holding the raw
            content is returned.
        """
        sections = []
        current = {"heading": "", "content": ""}
        fence = None  # marker of the code fence we are inside, if any
        for line in content.split("\n"):
            marker = line.lstrip()[:3]
            if fence is not None:
                # Inside a fenced block: only the matching marker closes it
                if marker == fence:
                    fence = None
                match = None
            elif marker in self._FENCE_MARKERS:
                fence = marker
                match = None
            else:
                match = self._HEADING_RE.match(line)
            if match is None:
                current["content"] += line + "\n"
                continue
            # A heading closes the running section and opens a new one
            if current["content"].strip():
                sections.append(current)
            current = {
                "heading": match.group(2).strip(),
                "content": line + "\n",
            }
        if current["content"].strip():
            sections.append(current)
        if not sections:
            sections = [{"heading": "", "content": content}]
        return sections
--- a/src/knowledge_rag/embeddings.py
+++ b/src/knowledge_rag/embeddings.py
@ -0,0 +1,75 @@
 """Embedding model wrapper using sentence-transformers."""
 import os
 from typing import List, Any
 from llama_index.core.embeddings import BaseEmbedding
 from sentence_transformers import SentenceTransformer
 class LocalEmbeddingModel(BaseEmbedding):
    """Local embedding model backed by sentence-transformers.
    Default model: 'all-MiniLM-L6-v2'.
    """
    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        cache_folder: str | None = None,
        **kwargs,
    ):
        """Load the sentence-transformers model and initialise the base class.
        Args:
            model_name: Name of the sentence-transformers model to load.
            cache_folder: Directory for downloaded model files. When None, the
                EMBEDDINGS_CACHE_DIR environment variable is used if set;
                otherwise "/data/embeddings_cache" when "/data" exists;
                otherwise None is passed through to SentenceTransformer.
            **kwargs: Forwarded to the BaseEmbedding constructor.
        """
        if cache_folder is None:
            # Honour the variable exported by the Dockerfile / docker-compose
            cache_folder = os.environ.get("EMBEDDINGS_CACHE_DIR") or None
        if cache_folder is None and os.path.exists("/data"):
            cache_folder = "/data/embeddings_cache"
        # Load the model first: its dimension is passed to the base initialiser
        model = SentenceTransformer(model_name, cache_folder=cache_folder)
        embed_dim = model.get_sentence_embedding_dimension()
        super().__init__(
            embed_dim=embed_dim,
            model_name=model_name,
            **kwargs,
        )
        # Attach the model only after pydantic initialisation, bypassing the
        # model's own __setattr__.
        object.__setattr__(self, "_model", model)
    def _get_text_embedding(self, text: str) -> List[float]:
        """Get embedding for a single text."""
        return self._model.encode(text, convert_to_numpy=True).tolist()
    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async variant; delegates to the synchronous implementation."""
        return self._get_text_embedding(text)
    def _get_query_embedding(self, query: str) -> List[float]:
        """Get embedding for a query (encoded the same way as document text)."""
        return self._model.encode(query, convert_to_numpy=True).tolist()
    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant; delegates to the synchronous implementation."""
        return self._get_query_embedding(query)
    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for multiple texts in one encode call."""
        return self._model.encode(texts, convert_to_numpy=True).tolist()
    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Async variant; delegates to the synchronous implementation."""
        return self._get_text_embeddings(texts)
 def get_embedding_model() -> LocalEmbeddingModel:
    """Create the embedding model named by the EMBEDDING_MODEL environment variable.
    Returns:
        A LocalEmbeddingModel for the configured model, or for
        "all-MiniLM-L6-v2" when the variable is unset or empty.
    """
    # `or` so that an empty EMBEDDING_MODEL= line is treated like an unset one
    model_name = os.environ.get("EMBEDDING_MODEL") or "all-MiniLM-L6-v2"
    return LocalEmbeddingModel(model_name=model_name)
--- a/src/knowledge_rag/server.py
+++ b/src/knowledge_rag/server.py
@ -0,0 +1,282 @@
 """MCP server for knowledge base RAG system."""
 import os
 import sys
 import logging
 from pathlib import Path
 from typing import Any
 from mcp.server import Server
 from mcp.server.stdio import stdio_server
 from mcp.types import Tool, TextContent
 from pydantic import AnyUrl
 from .chunker import MarkdownChunker
 from .embeddings import get_embedding_model
 from .vector_store import KnowledgeVectorStore
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 )
 logger = logging.getLogger(__name__)
 class KnowledgeMCPServer:
    """MCP server for semantic search in Obsidian vault.
    Provides tools to:
    - Search the knowledge base semantically
    - Index/update the knowledge base
    - Get statistics about indexed content
    """
    def __init__(self, vault_path: str | None = None):
        """Create the embedding model, vector store, chunker and MCP handlers.
        Args:
            vault_path: Directory holding the markdown vault. When omitted or
                empty, the VAULT_PATH environment variable is used, then
                "/data/vault".
        """
        # `or` chain so that an empty VAULT_PATH also falls back to the default
        self.vault_path = (
            vault_path or os.environ.get("VAULT_PATH") or "/data/vault"
        )
        # A missing vault is only a warning here; indexing reports it later
        if not Path(self.vault_path).exists():
            logger.warning("Vault path does not exist: %s", self.vault_path)
        # Initialize components
        self.embedding_model = get_embedding_model()
        self.vector_store = KnowledgeVectorStore(
            embedding_model=self.embedding_model
        )
        self.chunker = MarkdownChunker()
        # True once this process has built, or decided to reuse, an index
        self._indexed = False
        # Create MCP server and register its handlers
        self.server = Server("knowledge-rag")
        self._register_handlers()
    def _register_handlers(self):
        """Register the list_tools and call_tool handlers on the MCP server."""
        @self.server.list_tools()
        async def list_tools() -> list[Tool]:
            """List available MCP tools."""
            return [
                Tool(
                    name="search_knowledge",
                    description="Semantic search through the knowledge base. "
                                "Uses embeddings to find relevant content based on meaning, "
                                "not just keywords. Best for answering questions or finding "
                                "related concepts.",
                    inputSchema={
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The search query in natural language",
                            },
                            "top_k": {
                                "type": "integer",
                                "description": "Number of results to return",
                                "default": 5,
                            },
                        },
                        "required": ["query"],
                    },
                ),
                Tool(
                    name="index_knowledge",
                    description="Index or re-index the knowledge base. "
                                "Scans all markdown files and builds the search index. "
                                "An existing index is reused unless force is true; "
                                "use force=true after adding or changing files in the vault.",
                    inputSchema={
                        "type": "object",
                        "properties": {
                            "force": {
                                "type": "boolean",
                                "description": "Force re-index (clear existing index first)",
                                "default": False,
                            },
                        },
                    },
                ),
                Tool(
                    name="get_knowledge_stats",
                    description="Get statistics about the indexed knowledge base.",
                    inputSchema={
                        "type": "object",
                        "properties": {},
                    },
                ),
            ]
        @self.server.call_tool()
        async def call_tool(
            name: str, arguments: dict | None
        ) -> list[TextContent]:
            """Dispatch a tool call to the matching handler method."""
            if name == "search_knowledge":
                return await self._search_knowledge(arguments or {})
            if name == "index_knowledge":
                return await self._index_knowledge(arguments or {})
            if name == "get_knowledge_stats":
                return await self._get_stats()
            raise ValueError(f"Unknown tool: {name}")
    async def _search_knowledge(
        self, arguments: dict[str, Any]
    ) -> list[TextContent]:
        """Search the knowledge base semantically.
        Args:
            arguments: Tool arguments; reads "query" and optional "top_k".
        Returns:
            A single TextContent holding the formatted results, or a message
            when the query is empty, nothing matched, or the search failed.
        """
        query = arguments.get("query", "")
        if not query:
            return [TextContent(type="text", text="Query cannot be empty.")]
        # Tool arguments arrive from the client unvalidated: coerce top_k to a
        # positive integer and fall back to the documented default of 5.
        try:
            top_k = max(1, int(arguments.get("top_k", 5)))
        except (TypeError, ValueError):
            top_k = 5
        # Ensure we've indexed
        if not self._indexed:
            await self._index_knowledge({})
        try:
            results = self.vector_store.search(
                query=query,
                top_k=top_k,
            )
            if not results:
                return [
                    TextContent(
                        type="text",
                        text="No results found. Try indexing your knowledge base first."
                    )
                ]
            output = []
            for i, result in enumerate(results, 1):
                metadata = result["metadata"]
                source = metadata.get("file_name", "unknown")
                heading = metadata.get("heading", "")
                # A missing or None score must not break the :.2f format below
                score = result.get("score") or 0
                full_text = result["text"]
                text = full_text[:500]  # Truncate long text
                if len(full_text) > 500:
                    text += "..."
                output.append(
                    f"--- Result {i} ---\n"
                    f"Source: {source}"
                    + (f" > {heading}" if heading else "")
                    + f"\nRelevance: {score:.2f}\n\n{text}\n"
                )
            return [TextContent(type="text", text="\n".join(output))]
        except Exception as e:
            logger.exception("Search error")
            return [TextContent(type="text", text=f"Search error: {str(e)}")]
    async def _index_knowledge(
        self, arguments: dict[str, Any]
    ) -> list[TextContent]:
        """Index the knowledge base.
        With ``force`` the store is cleared and rebuilt. Without it, an index
        that already holds chunks is reused as-is: every run chunks the whole
        vault, so adding again on top of existing data would store each chunk
        a second time.
        Args:
            arguments: Tool arguments; reads optional "force".
        Returns:
            A single TextContent describing the outcome.
        """
        force = arguments.get("force", False)
        vault_path = Path(self.vault_path)
        if not vault_path.exists():
            return [
                TextContent(
                    type="text",
                    text=f"Vault path does not exist: {self.vault_path}"
                )
            ]
        try:
            if force:
                logger.info("Force re-indexing...")
                self.vector_store.clear()
            else:
                existing = self.vector_store.get_stats()["total_chunks"]
                if existing:
                    logger.info("Reusing existing index with %s chunks", existing)
                    self._indexed = True
                    return [
                        TextContent(
                            type="text",
                            text=f"Index already contains {existing} chunks. "
                                 f"Use force=true to rebuild it after changing the vault."
                        )
                    ]
                logger.info("Indexing knowledge base...")
            # Chunk all markdown files
            chunks = self.chunker.chunk_directory(str(vault_path))
            if not chunks:
                return [
                    TextContent(
                        type="text",
                        text="No markdown files found in vault."
                    )
                ]
            logger.info("Created %d chunks, adding to vector store...", len(chunks))
            # Add to vector store (this embeds them). The call is synchronous,
            # so it runs to completion inside this coroutine.
            self.vector_store.add_nodes(chunks, embedding_model=self.embedding_model)
            self._indexed = True
            stats = self.vector_store.get_stats()
            return [
                TextContent(
                    type="text",
                    text=f"Successfully indexed {len(chunks)} chunks from the knowledge base.\n"
                         f"Total chunks in index: {stats['total_chunks']}"
                )
            ]
        except Exception as e:
            logger.exception("Indexing error")
            return [TextContent(type="text", text=f"Indexing error: {str(e)}")]
    async def _get_stats(self) -> list[TextContent]:
        """Report the vault path, markdown file count, chunk count and index status."""
        stats = self.vector_store.get_stats()
        vault_path = Path(self.vault_path)
        md_count = (
            sum(1 for _ in vault_path.rglob("*.md")) if vault_path.exists() else 0
        )
        return [
            TextContent(
                type="text",
                text=f"Knowledge Base Statistics:\n"
                     f"- Vault path: {self.vault_path}\n"
                     f"- Markdown files: {md_count}\n"
                     f"- Indexed chunks: {stats['total_chunks']}\n"
                     f"- Index status: {'Ready' if self._indexed else 'Not indexed'}"
            )
        ]
    async def run(self):
        """Index the vault if needed, then serve MCP requests over stdio."""
        logger.info("Starting Knowledge RAG MCP Server")
        logger.info("Vault path: %s", self.vault_path)
        # Auto-index on startup (a populated persistent index is reused)
        await self._index_knowledge({})
        # Run stdio server
        async with stdio_server() as (read_stream, write_stream):
            await self.server.run(
                read_stream,
                write_stream,
                self.server.create_initialization_options(),
            )
 async def main():
    """Build a KnowledgeMCPServer with default settings and run it."""
    await KnowledgeMCPServer().run()
 if __name__ == "__main__":
    # Script entry point: hand the async main() to asyncio's runner.
    from asyncio import run
    run(main())
--- a/src/knowledge_rag/vector_store.py
+++ b/src/knowledge_rag/vector_store.py
@ -0,0 +1,137 @@
 """ChromaDB vector store wrapper for knowledge base."""
 import os
 from typing import TYPE_CHECKING, Any, List, Optional
 from llama_index.core.schema import TextNode
 from llama_index.vector_stores.chroma import ChromaVectorStore
 import chromadb
 if TYPE_CHECKING:
    from llama_index.core.embeddings import BaseEmbedding
 class KnowledgeVectorStore:
    """ChromaDB vector store for the knowledge base.
    Handles persistence of embeddings and semantic search.
    """
    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        self._collection_name = collection_name
        self._embedding_model = embedding_model
        # Use Docker path if available, otherwise use local data dir
        if persist_dir is None:
            if os.path.exists("/data"):
                persist_dir = "/data/chroma_db"
            else:
                persist_dir = "./data/chroma_db"
        self._persist_dir = persist_dir
        # Ensure persist directory exists
        os.makedirs(persist_dir, exist_ok=True)
        # Initialize ChromaDB client
        self._client = chromadb.PersistentClient(path=persist_dir)
        # Get or create collection
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )
        # Wrap in LlamaIndex vector store
        # Pass the chroma_collection directly for PersistentClient
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )
    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model
    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store
    def add_nodes(self, nodes: List[TextNode], embedding_model: "BaseEmbedding | None" = None) -> None:
        """Add nodes to the vector store."""
        from llama_index.core import VectorStoreIndex, StorageContext
        # Use provided embedding model or the stored one
        model = embedding_model or self._embedding_model
        if model is None:
            raise ValueError("No embedding model provided")
        # First embed the nodes
        for node in nodes:
            node.embedding = model.get_text_embedding(node.text)
        # Then add to vector store
        self._vector_store.add(nodes)
    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.
        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional metadata filters
        Returns:
            List of search results with text and metadata
        """
        from llama_index.core import VectorStoreIndex
        # Use embedding model if provided, otherwise use the one from storage
        embed_model = self._embedding_model
        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=embed_model,
        )
        query_engine = index.as_retriever(
            similarity_top_k=top_k,
            filters=filter,
        )
        results = query_engine.retrieve(query)
        return [
            {
                "text": node.text,
                "score": node.score,
                "metadata": node.metadata,
            }
            for node in results
        ]
    def clear(self) -> None:
        """Clear all embeddings from the store."""
        self._client.delete_collection(self._collection_name)
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )
    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }
--- a/uv.lock
+++ b/uv.lock
		`@ -0,0 +1,3 @@`
							`"""Knowledge RAG - RAG system for Obsidian vault knowledge base."""`

							`__version__ = "0.1.0"`