Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB

- Add Python project with uv package manager
- Implement LlamaIndex + ChromaDB RAG pipeline
- Add sentence-transformers for local embeddings (all-MiniLM-L6-v2)
- Create MCP server with semantic search, indexing, and stats tools
- Add Markdown chunker with heading/wikilink/frontmatter support
- Add Dockerfile and docker-compose.yaml for self-hosted deployment
- Include sample Obsidian vault files for testing
- Add .gitignore and .env.example
This commit is contained in:
2026-03-03 20:42:42 -05:00
parent 94dd158d1c
commit 11c3f705ce
11 changed files with 5319 additions and 0 deletions

15
.env.example Normal file
View File

@ -0,0 +1,15 @@
# Knowledge RAG Configuration
# Path to your Obsidian vault (must contain markdown files)
# Use an absolute path, or a path relative to the directory where you run docker-compose
VAULT_PATH=./knowledge
# Embedding model to use
# Default: all-MiniLM-L6-v2 (fast, good quality, ~90MB)
# Other options:
# - all-mpnet-base-v2 (higher quality, slower, ~420MB)
# - BAAI/bge-small-en-v1.5 (good quality, ~130MB)
EMBEDDING_MODEL=all-MiniLM-L6-v2
# Optional: Log level (DEBUG, INFO, WARNING, ERROR)
LOG_LEVEL=INFO

47
.gitignore vendored Normal file
View File

@ -0,0 +1,47 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual environments
venv/
.venv/
env/
.env/
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
# Tool caches (ruff, mypy, pytest)
.ruff_cache/
.mypy_cache/
.pytest_cache/
# Data directories (should be mounted externally)
data/
knowledge/
# Environment
.env
.env.local

33
Dockerfile Normal file
View File

@ -0,0 +1,33 @@
FROM python:3.11-slim

# Install system dependencies for sentence-transformers
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Install uv
RUN pip install --no-cache-dir uv

# Copy the dependency manifests. `uv sync --frozen` refuses to run without
# uv.lock, so the lockfile must be copied together with pyproject.toml.
COPY pyproject.toml uv.lock ./

# Install third-party dependencies only. The project itself cannot be built
# at this point because src/ has not been copied yet; keeping this layer free
# of source code also lets Docker cache it across code changes.
RUN uv sync --frozen --no-dev --no-install-project

# Copy source code
COPY src/ ./src/

# Create data directories
RUN mkdir -p /data/vault /data/chroma_db /data/embeddings_cache

# Set environment variables
# - PATH: uv installs into /app/.venv, so its `python` must come first.
# - PYTHONPATH: the package lives under src/ and is not installed into the venv.
ENV PYTHONUNBUFFERED=1 \
    VAULT_PATH=/data/vault \
    EMBEDDINGS_CACHE_DIR=/data/embeddings_cache \
    PATH="/app/.venv/bin:${PATH}" \
    PYTHONPATH=/app/src

# Default command runs the MCP server
CMD ["python", "-m", "knowledge_rag.server"]

32
docker-compose.yaml Normal file
View File

@ -0,0 +1,32 @@
version: "3.8"

services:
  knowledge-rag:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: knowledge-rag
    # The server talks MCP over stdio (knowledge_rag.server uses stdio_server).
    # Keep stdin open so the process does not read EOF right after start-up;
    # NOTE(review): confirm how your MCP client attaches to the container.
    stdin_open: true
    volumes:
      # Mount your obsidian vault here
      - ${VAULT_PATH:-./knowledge}:/data/vault
      # Persist ChromaDB vector store
      - ./data/chroma_db:/data/chroma_db
      # Persist embeddings cache
      - ./data/embeddings_cache:/data/embeddings_cache
    environment:
      - VAULT_PATH=/data/vault
      - EMBEDDING_MODEL=${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
      - EMBEDDINGS_CACHE_DIR=/data/embeddings_cache
      # Forward the log level documented in .env.example into the container
      - LOG_LEVEL=${LOG_LEVEL:-INFO}
    restart: unless-stopped

  # Optional: Watchtower for auto-updates
  # watchtower:
  #   image: containrrr/watchtower
  #   container_name: watchtower
  #   volumes:
  #     - /var/run/docker.sock:/var/run/docker.sock
  #   environment:
  #     - WATCHTOWER_CLEANUP=true
  #     - WATCHTOWER_INCLUDE_STOPPED=true
  #   command: --interval 3600 knowledge-rag
  #   restart: unless-stopped

36
pyproject.toml Normal file
View File

@ -0,0 +1,36 @@
# Package metadata (PEP 621 `[project]` table).
[project]
name = "knowledge-rag"
version = "0.1.0"
description = "RAG system for Obsidian vault knowledge base with MCP server"
readme = "README.md"
requires-python = ">=3.11"
# Runtime dependencies; only lower bounds are declared here.
dependencies = [
"llama-index>=0.10.0",
"llama-index-vector-stores-chroma>=0.1.0",
"chromadb>=0.4.0",
"sentence-transformers>=2.2.0",
"mcp>=1.0.0",
"python-dotenv>=1.0.0",
"pydantic>=2.0.0",
"watchdog>=3.0.0",
"httpx>=0.25.0",
]
# Development tooling, exposed as the optional `dev` extra.
[project.optional-dependencies]
dev = [
"pytest>=7.0.0",
"pytest-asyncio>=0.21.0",
"ruff>=0.1.0",
]
# Build backend used to produce the package.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Ruff configuration: 100-column lines, Python 3.11 syntax target.
[tool.ruff]
line-length = 100
target-version = "py311"
# Enabled rule families: pycodestyle errors/warnings (E, W), pyflakes (F),
# import sorting (I) and pep8-naming (N). E501 (line too long) is ignored.
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W"]
ignore = ["E501"]

View File

@ -0,0 +1,3 @@
"""Knowledge RAG - RAG system for Obsidian vault knowledge base."""
# Package version; pyproject.toml declares the same value under [project].
__version__ = "0.1.0"

View File

@ -0,0 +1,181 @@
"""Markdown-aware document chunking for Obsidian vault."""
import os
import re
from pathlib import Path
from typing import List, Optional
from llama_index.core.schema import TextNode
class MarkdownChunker:
    """Markdown-aware chunker for Obsidian vaults.

    Each markdown file is split into one chunk per heading section. Every
    chunk carries metadata describing where it came from:

    - source path and file name
    - the heading that opens the section
    - wiki links found in the file (as a comma-separated string)
    - whether the file started with a frontmatter block

    Fenced code blocks are kept intact: a ``#`` line inside a fence is code,
    not a heading, and never starts a new section.

    NOTE: ``chunk_size`` and ``chunk_overlap`` are stored on the instance but
    are not applied yet; sections are emitted whole regardless of length.
    """

    # Default chunk settings
    DEFAULT_CHUNK_SIZE = 512
    DEFAULT_CHUNK_OVERLAP = 50

    # Patterns are compiled once and shared by all instances.
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
    _FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")
    _WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_file(self, file_path: str, content: str) -> List[TextNode]:
        """Chunk a single markdown file.

        Args:
            file_path: Path to the markdown file
            content: Raw markdown content

        Returns:
            List of TextNode chunks with metadata, one per non-empty section
        """
        frontmatter, body = self._extract_frontmatter(content)

        # wiki_links must be a string for ChromaDB compatibility
        wiki_links = ",".join(self._extract_wiki_links(body))

        # Path is made relative to the current working directory.
        rel_path = os.path.relpath(file_path)
        file_name = os.path.basename(file_path)

        chunks = []
        for index, section in enumerate(self._split_by_headings(body)):
            if not section["content"].strip():
                continue
            chunks.append(
                TextNode(
                    text=section["content"],
                    metadata={
                        "source": rel_path,
                        "file_name": file_name,
                        "heading": section.get("heading", ""),
                        "section_index": index,
                        "wiki_links": wiki_links,
                        "has_frontmatter": frontmatter is not None,
                    },
                    excluded_embed_metadata_keys=["wiki_links"],
                    # NOTE(review): confirm TextNode accepts this keyword;
                    # it is passed through unchanged from the original code.
                    excluded_search_metadata_keys=["wiki_links"],
                )
            )
        return chunks

    def chunk_directory(self, dir_path: str) -> List[TextNode]:
        """Chunk all markdown files in a directory recursively.

        Files that cannot be read or chunked are logged and skipped so one
        bad file does not abort the whole run.

        Args:
            dir_path: Root directory containing markdown files

        Returns:
            List of all TextNode chunks, in sorted file-path order

        Raises:
            FileNotFoundError: If ``dir_path`` does not exist
        """
        # Imported locally because this module has no top-level logging import.
        # Errors go through logging rather than print(): stdout is used as the
        # transport by the stdio MCP server that drives this chunker.
        import logging

        log = logging.getLogger(__name__)

        root = Path(dir_path)
        if not root.exists():
            raise FileNotFoundError(f"Directory not found: {root}")

        all_chunks = []
        # sorted() makes chunk order reproducible across runs and platforms.
        for md_file in sorted(root.rglob("*.md")):
            try:
                content = md_file.read_text(encoding="utf-8")
                all_chunks.extend(self.chunk_file(str(md_file), content))
            except Exception as exc:  # best effort: skip the offending file
                log.warning("Error chunking %s: %s", md_file, exc)
        return all_chunks

    def _extract_frontmatter(
        self, content: str
    ) -> tuple[Optional[dict], str]:
        """Split a leading ``---`` frontmatter block from the markdown body.

        Only flat ``key: value`` lines are parsed; this is not a YAML parser.

        Returns:
            ``(frontmatter, body)``. ``frontmatter`` is ``None`` and ``body``
            is the untouched input when the file does not open with a
            ``---`` line or the block is never closed.
        """
        lines = content.split("\n")
        # Tolerate a UTF-8 byte order mark in front of the opening marker.
        if lines[0].lstrip("\ufeff").strip() != "---":
            return None, content

        closing = next(
            (i for i, line in enumerate(lines[1:], start=1) if line.strip() == "---"),
            None,
        )
        if closing is None:
            # No closing marker: the leading '---' is a horizontal rule or a
            # malformed block. Keep the whole document as body instead of
            # swallowing it as frontmatter.
            return None, content

        frontmatter = {}
        for line in lines[1:closing]:
            key, sep, value = line.partition(":")
            if sep:
                frontmatter[key.strip()] = value.strip()

        return frontmatter, "\n".join(lines[closing + 1:])

    def _extract_wiki_links(self, content: str) -> List[str]:
        """Return the targets of ``[[wiki links]]`` in first-seen order.

        The alias part of ``[[target|alias]]`` is dropped and repeated
        targets are reported once.
        """
        return list(dict.fromkeys(self._WIKI_LINK_RE.findall(content)))

    def _split_by_headings(self, content: str) -> List[dict]:
        """Split content into sections that each start at a markdown heading.

        Text before the first heading becomes a section with an empty
        heading. Lines inside fenced code blocks (``` or ~~~) are never
        treated as headings.

        Returns:
            List of ``{"heading": str, "content": str}`` dicts. When nothing
            but whitespace is found, a single section holding the raw
            content is returned.
        """
        sections = []
        heading = ""
        buffer = []
        open_fence = None  # marker that opened the fence we are inside, if any

        for line in content.split("\n"):
            fence = self._FENCE_RE.match(line)
            if fence:
                marker = fence.group(1)
                if open_fence is None:
                    open_fence = marker
                elif (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and line.strip() == marker
                ):
                    # A closing fence uses the same character, is at least as
                    # long as the opener and carries no info string.
                    open_fence = None
                buffer.append(line)
                continue

            match = None if open_fence else self._HEADING_RE.match(line)
            if match is None:
                buffer.append(line)
                continue

            # A heading closes the section collected so far.
            text = "".join(item + "\n" for item in buffer)
            if text.strip():
                sections.append({"heading": heading, "content": text})
            heading = match.group(2).strip()
            buffer = [line]

        # Don't forget the last section
        text = "".join(item + "\n" for item in buffer)
        if text.strip():
            sections.append({"heading": heading, "content": text})

        # If nothing was collected, treat entire content as one section
        if not sections:
            sections = [{"heading": "", "content": content}]
        return sections

View File

@ -0,0 +1,75 @@
"""Embedding model wrapper using sentence-transformers."""
import os
from typing import List, Any
from llama_index.core.embeddings import BaseEmbedding
from sentence_transformers import SentenceTransformer
class LocalEmbeddingModel(BaseEmbedding):
    """Local embedding model backed by sentence-transformers.

    Wraps a ``SentenceTransformer`` so LlamaIndex can use it as an embedding
    backend. Default model: 'all-MiniLM-L6-v2'.

    Args:
        model_name: Name of the sentence-transformers model to load.
        cache_folder: Directory for downloaded model files. When omitted, the
            ``EMBEDDINGS_CACHE_DIR`` environment variable is used if set,
            then ``/data/embeddings_cache`` when ``/data`` exists; otherwise
            ``None`` is passed on to ``SentenceTransformer``.
        **kwargs: Forwarded to ``BaseEmbedding.__init__``.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        cache_folder: str | None = None,
        **kwargs,
    ):
        if cache_folder is None:
            # Honor the cache location exported by the Dockerfile and
            # docker-compose; an empty value counts as unset.
            cache_folder = os.environ.get("EMBEDDINGS_CACHE_DIR") or None
        if cache_folder is None and os.path.exists("/data"):
            # Persistent cache directory when running inside the container.
            cache_folder = "/data/embeddings_cache"

        # Load the model first: its dimension is needed for the base init.
        model = SentenceTransformer(model_name, cache_folder=cache_folder)
        embed_dim = model.get_sentence_embedding_dimension()

        # Initialize pydantic model with required fields
        super().__init__(
            embed_dim=embed_dim,
            model_name=model_name,
            **kwargs,
        )

        # Private state is attached only after the pydantic init has run, and
        # via object.__setattr__ so pydantic's field handling is bypassed.
        # (Assigning a private attribute before super().__init__() can raise
        # on some pydantic versions.)
        object.__setattr__(self, "_model_name", model_name)
        object.__setattr__(self, "_model", model)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Get embedding for a single text."""
        return self._model.encode(text, convert_to_numpy=True).tolist()

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async get embedding - runs the synchronous encoder inline."""
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Get embedding for a query; queries and texts share one encoder."""
        return self._get_text_embedding(query)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async get query embedding - runs the synchronous encoder inline."""
        return self._get_query_embedding(query)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for multiple texts in one encoder call."""
        return self._model.encode(texts, convert_to_numpy=True).tolist()

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Async get embeddings - runs the synchronous encoder inline."""
        return self._get_text_embeddings(texts)
def get_embedding_model() -> LocalEmbeddingModel:
    """Build the embedding model selected by the environment.

    The model name is read from ``EMBEDDING_MODEL`` and falls back to
    ``all-MiniLM-L6-v2`` when the variable is not set.
    """
    configured_name = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
    embedder = LocalEmbeddingModel(model_name=configured_name)
    return embedder

282
src/knowledge_rag/server.py Normal file
View File

@ -0,0 +1,282 @@
"""MCP server for knowledge base RAG system."""
import os
import sys
import logging
from pathlib import Path
from typing import Any
from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.types import Tool, TextContent
from pydantic import AnyUrl
from .chunker import MarkdownChunker
from .embeddings import get_embedding_model
from .vector_store import KnowledgeVectorStore
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
class KnowledgeMCPServer:
    """MCP server for semantic search in Obsidian vault.

    Provides tools to:
    - Search the knowledge base semantically (``search_knowledge``)
    - Index/update the knowledge base (``index_knowledge``)
    - Get statistics about indexed content (``get_knowledge_stats``)

    Args:
        vault_path: Directory holding the markdown vault. Defaults to the
            ``VAULT_PATH`` environment variable, then ``/data/vault``.
    """

    # Number of results returned when ``top_k`` is missing or unusable.
    DEFAULT_TOP_K = 5
    # Maximum characters of chunk text echoed back per search result.
    MAX_RESULT_CHARS = 500

    def __init__(self, vault_path: str | None = None):
        # Get vault path from environment or use default
        self.vault_path = vault_path or os.environ.get(
            "VAULT_PATH", "/data/vault"
        )

        # A missing vault is only warned about here; indexing reports it
        # again to the client when it is actually attempted.
        if not Path(self.vault_path).exists():
            logger.warning("Vault path does not exist: %s", self.vault_path)

        # Initialize components
        self.embedding_model = get_embedding_model()
        self.vector_store = KnowledgeVectorStore(
            embedding_model=self.embedding_model
        )
        self.chunker = MarkdownChunker()

        # Set once an indexing run has completed in this process.
        self._indexed = False

        # Create MCP server
        self.server = Server("knowledge-rag")

        # Register handlers
        self._register_handlers()

    def _register_handlers(self):
        """Register the MCP ``list_tools`` and ``call_tool`` handlers."""

        @self.server.list_tools()
        async def list_tools() -> list[Tool]:
            """List available MCP tools."""
            return [
                Tool(
                    name="search_knowledge",
                    description="Semantic search through the knowledge base. "
                    "Uses embeddings to find relevant content based on meaning, "
                    "not just keywords. Best for answering questions or finding "
                    "related concepts.",
                    inputSchema={
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The search query in natural language",
                            },
                            "top_k": {
                                "type": "integer",
                                "description": "Number of results to return",
                                "default": 5,
                            },
                        },
                        "required": ["query"],
                    },
                ),
                Tool(
                    name="index_knowledge",
                    description="Index or re-index the knowledge base. "
                    "Run this after adding new files to the vault. "
                    "Scans all markdown files and builds the search index.",
                    inputSchema={
                        "type": "object",
                        "properties": {
                            "force": {
                                "type": "boolean",
                                "description": "Force re-index (clear existing index first)",
                                "default": False,
                            },
                        },
                    },
                ),
                Tool(
                    name="get_knowledge_stats",
                    description="Get statistics about the indexed knowledge base.",
                    inputSchema={
                        "type": "object",
                        "properties": {},
                    },
                ),
            ]

        @self.server.call_tool()
        async def call_tool(
            name: str, arguments: dict | None
        ) -> list[TextContent]:
            """Dispatch a tool call to the matching handler method."""
            if name == "search_knowledge":
                return await self._search_knowledge(arguments or {})
            elif name == "index_knowledge":
                return await self._index_knowledge(arguments or {})
            elif name == "get_knowledge_stats":
                return await self._get_stats()
            else:
                raise ValueError(f"Unknown tool: {name}")

    async def _search_knowledge(
        self, arguments: dict[str, Any]
    ) -> list[TextContent]:
        """Search the knowledge base semantically.

        Args:
            arguments: Tool arguments; reads ``query`` and optional ``top_k``.

        Returns:
            A single text item with the formatted results, or a message when
            the query is empty, nothing matched, or the search failed.
        """
        # Arguments come from the client, so normalise them defensively.
        query = str(arguments.get("query") or "").strip()
        try:
            top_k = max(1, int(arguments.get("top_k", self.DEFAULT_TOP_K)))
        except (TypeError, ValueError):
            top_k = self.DEFAULT_TOP_K

        if not query:
            return [TextContent(type="text", text="Query cannot be empty.")]

        # Ensure we've indexed
        if not self._indexed:
            await self._index_knowledge({})

        try:
            # Search with embeddings
            results = self.vector_store.search(
                query=query,
                top_k=top_k,
            )

            if not results:
                return [
                    TextContent(
                        type="text",
                        text="No results found. Try indexing your knowledge base first."
                    )
                ]

            # Format results
            output = []
            for i, result in enumerate(results, 1):
                source = result["metadata"].get("file_name", "unknown")
                heading = result["metadata"].get("heading", "")
                # A result may carry an explicit None score, which the
                # ':.2f' format below cannot render.
                score = result.get("score")
                if score is None:
                    score = 0.0
                full_text = result["text"]
                text = full_text[:self.MAX_RESULT_CHARS]  # Truncate long text
                if len(full_text) > self.MAX_RESULT_CHARS:
                    text += "..."
                output.append(
                    f"--- Result {i} ---\n"
                    f"Source: {source}"
                    + (f" > {heading}" if heading else "")
                    + f"\nRelevance: {score:.2f}\n\n{text}\n"
                )

            return [TextContent(type="text", text="\n".join(output))]
        except Exception as e:
            # Tool boundary: report the failure to the client, don't crash.
            logger.exception("Search error")
            return [TextContent(type="text", text=f"Search error: {str(e)}")]

    async def _index_knowledge(
        self, arguments: dict[str, Any]
    ) -> list[TextContent]:
        """Index the knowledge base.

        Args:
            arguments: Tool arguments; ``force`` clears the existing index
                before re-adding every chunk.

        Returns:
            A single text item describing the outcome.
        """
        force = arguments.get("force", False)
        vault_path = Path(self.vault_path)

        if not vault_path.exists():
            return [
                TextContent(
                    type="text",
                    text=f"Vault path does not exist: {self.vault_path}"
                )
            ]

        try:
            # Clear existing index if forced
            if force:
                logger.info("Force re-indexing...")
                self.vector_store.clear()
            else:
                logger.info("Indexing knowledge base...")

            # Chunk all markdown files
            chunks = self.chunker.chunk_directory(str(vault_path))

            if not chunks:
                return [
                    TextContent(
                        type="text",
                        text="No markdown files found in vault."
                    )
                ]

            logger.info("Created %d chunks, adding to vector store...", len(chunks))

            # Add to vector store (this embeds them)
            self.vector_store.add_nodes(chunks, embedding_model=self.embedding_model)
            self._indexed = True

            stats = self.vector_store.get_stats()
            return [
                TextContent(
                    type="text",
                    text=f"Successfully indexed {len(chunks)} chunks from the knowledge base.\n"
                    f"Total chunks in index: {stats['total_chunks']}"
                )
            ]
        except Exception as e:
            # Tool boundary: report the failure to the client, don't crash.
            logger.exception("Indexing error")
            return [TextContent(type="text", text=f"Indexing error: {str(e)}")]

    async def _get_stats(self) -> list[TextContent]:
        """Report the vault path, markdown file count and index size."""
        stats = self.vector_store.get_stats()
        vault_path = Path(self.vault_path)
        md_files = list(vault_path.rglob("*.md")) if vault_path.exists() else []

        return [
            TextContent(
                type="text",
                text=f"Knowledge Base Statistics:\n"
                f"- Vault path: {self.vault_path}\n"
                f"- Markdown files: {len(md_files)}\n"
                f"- Indexed chunks: {stats['total_chunks']}\n"
                f"- Index status: {'Ready' if self._indexed else 'Not indexed'}"
            )
        ]

    async def run(self):
        """Index the vault, then serve MCP requests over stdio until closed."""
        logger.info("Starting Knowledge RAG MCP Server")
        logger.info("Vault path: %s", self.vault_path)

        # Auto-index on startup.
        # NOTE(review): this runs to completion before the stdio transport is
        # opened, so a large vault delays the first response to the client.
        await self._index_knowledge({})

        # Run stdio server
        async with stdio_server() as (read_stream, write_stream):
            await self.server.run(
                read_stream,
                write_stream,
                self.server.create_initialization_options(),
            )
async def main():
    """Create the knowledge-base MCP server and run it until it stops."""
    await KnowledgeMCPServer().run()
if __name__ == "__main__":
    # Script entry point: asyncio is only needed when run as a program.
    import asyncio

    asyncio.run(main())

View File

@ -0,0 +1,137 @@
"""ChromaDB vector store wrapper for knowledge base."""
import os
from typing import TYPE_CHECKING, Any, List, Optional
from llama_index.core.schema import TextNode
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
if TYPE_CHECKING:
from llama_index.core.embeddings import BaseEmbedding
class KnowledgeVectorStore:
    """ChromaDB vector store for the knowledge base.

    Handles persistence of embeddings and semantic search.

    Args:
        persist_dir: Directory for the ChromaDB files. Defaults to
            ``/data/chroma_db`` when ``/data`` exists (the container layout),
            otherwise ``./data/chroma_db``.
        collection_name: Name of the ChromaDB collection to use.
        embedding_model: Model used to embed nodes and queries; may also be
            supplied later through ``set_embedding_model``.
    """

    def __init__(
        self,
        persist_dir: str | None = None,
        collection_name: str = "knowledge_base",
        embedding_model: "BaseEmbedding | None" = None,
    ):
        self._collection_name = collection_name
        self._embedding_model = embedding_model

        # Use Docker path if available, otherwise use local data dir
        if persist_dir is None:
            persist_dir = (
                "/data/chroma_db" if os.path.exists("/data") else "./data/chroma_db"
            )
        self._persist_dir = persist_dir

        # Ensure persist directory exists
        os.makedirs(persist_dir, exist_ok=True)

        # Initialize ChromaDB client
        self._client = chromadb.PersistentClient(path=persist_dir)
        self._bind_collection()

    def _bind_collection(self) -> None:
        """Get or create the collection and wrap it for LlamaIndex.

        The wrapper is rebuilt together with the collection so the two can
        never point at different collections.
        """
        self._collection = self._client.get_or_create_collection(
            name=self._collection_name,
            metadata={"description": "Knowledge base embeddings"}
        )
        self._vector_store = ChromaVectorStore(
            chroma_collection=self._collection,
        )

    def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
        """Set the embedding model for query embedding."""
        self._embedding_model = embedding_model

    @property
    def vector_store(self) -> ChromaVectorStore:
        """Get the LlamaIndex ChromaVectorStore."""
        return self._vector_store

    def add_nodes(
        self,
        nodes: List[TextNode],
        embedding_model: "BaseEmbedding | None" = None,
        replace_existing: bool = True,
    ) -> None:
        """Embed nodes and add them to the vector store.

        Args:
            nodes: Chunks to store; ``node.text`` is what gets embedded.
            embedding_model: Model to embed with; falls back to the model
                given at construction time.
            replace_existing: When true, chunks already stored for the same
                ``source`` metadata value are deleted first, so indexing the
                same files again does not pile up duplicates.

        Raises:
            ValueError: If no embedding model is available.
        """
        model = embedding_model if embedding_model is not None else self._embedding_model
        if model is None:
            raise ValueError("No embedding model provided")
        if not nodes:
            return

        # Embed in batches instead of one encoder call per node. Embedding
        # happens before any deletion so a failure here leaves the existing
        # index untouched.
        embeddings = model.get_text_embedding_batch([node.text for node in nodes])
        for node, embedding in zip(nodes, embeddings):
            node.embedding = embedding

        if replace_existing:
            # Nodes get fresh ids on every chunking run, so stale copies are
            # matched through the 'source' metadata key.
            # NOTE(review): assumes node metadata is stored flat in Chroma.
            sources = {node.metadata.get("source") for node in nodes}
            for source in sources:
                if source:
                    self._collection.delete(where={"source": source})

        self._vector_store.add(nodes)

    def search(
        self,
        query: str,
        top_k: int = 5,
        filter: Optional[dict[str, Any]] = None,
    ) -> List[dict[str, Any]]:
        """Semantic search for similar chunks.

        Args:
            query: The search query
            top_k: Number of results to return
            filter: Optional exact-match metadata filters as ``{key: value}``

        Returns:
            List of dicts with ``text``, ``score`` and ``metadata`` keys
        """
        from llama_index.core import VectorStoreIndex

        metadata_filters = filter
        if isinstance(filter, dict):
            # The retriever expects a MetadataFilters object; translate the
            # plain dict into exact-match filters. An empty dict means none.
            from llama_index.core.vector_stores import MetadataFilter, MetadataFilters

            metadata_filters = (
                MetadataFilters(
                    filters=[
                        MetadataFilter(key=key, value=value)
                        for key, value in filter.items()
                    ]
                )
                if filter
                else None
            )

        # Queries are embedded with the model stored on this instance.
        index = VectorStoreIndex.from_vector_store(
            self._vector_store,
            embed_model=self._embedding_model,
        )
        retriever = index.as_retriever(
            similarity_top_k=top_k,
            filters=metadata_filters,
        )

        return [
            {
                "text": hit.text,
                "score": hit.score,
                "metadata": hit.metadata,
            }
            for hit in retriever.retrieve(query)
        ]

    def clear(self) -> None:
        """Clear all embeddings from the store.

        The collection is dropped and recreated, and the LlamaIndex wrapper
        is rebuilt so later adds and searches use the new collection rather
        than the deleted one.
        """
        self._client.delete_collection(self._collection_name)
        self._bind_collection()

    def get_stats(self) -> dict[str, Any]:
        """Return the chunk count and the collection name."""
        return {
            "total_chunks": self._collection.count(),
            "collection_name": self._collection_name,
        }

4478
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff