"""MCP server for knowledge base RAG system.""" import os import sys import logging from pathlib import Path from typing import Any from mcp.server import Server from mcp.server.stdio import stdio_server from mcp.types import Tool, TextContent from pydantic import AnyUrl from .chunker import MarkdownChunker from .embeddings import get_embedding_model from .vector_store import KnowledgeVectorStore # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) class KnowledgeMCPServer: """MCP server for semantic search in Obsidian vault. Provides tools to: - Search the knowledge base semantically - Index/update the knowledge base - Get statistics about indexed content """ def __init__(self, vault_path: str | None = None): # Get vault path from environment or use default self.vault_path = vault_path or os.environ.get("VAULT_PATH", "/data/vault") # Ensure vault path exists if not Path(self.vault_path).exists(): logger.warning(f"Vault path does not exist: {self.vault_path}") # Initialize components self.embedding_model = get_embedding_model() self.vector_store = KnowledgeVectorStore(embedding_model=self.embedding_model) self.chunker = MarkdownChunker() # Track indexing status self._indexed = False # Create MCP server self.server = Server("knowledge-rag") # Register handlers self._register_handlers() def _register_handlers(self): """Register MCP request handlers.""" @self.server.list_tools() async def list_tools() -> list[Tool]: """List available MCP tools.""" return [ Tool( name="search_knowledge", description="Semantic search through the knowledge base. " "Uses embeddings to find relevant content based on meaning, " "not just keywords. Best for answering questions or finding " "related concepts.", inputSchema={ "type": "object", "properties": { "query": { "type": "string", "description": "The search query in natural language", }, "top_k": { "type": "integer", "description": "Number of results to return", "default": 5, }, }, "required": ["query"], }, ), Tool( name="index_knowledge", description="Index or re-index the knowledge base. " "Run this after adding new files to the vault. " "Scans all markdown files and builds the search index.", inputSchema={ "type": "object", "properties": { "force": { "type": "boolean", "description": "Force re-index (clear existing index first)", "default": False, }, }, }, ), Tool( name="get_knowledge_stats", description="Get statistics about the indexed knowledge base.", inputSchema={ "type": "object", "properties": {}, }, ), ] @self.server.call_tool() async def call_tool(name: str, arguments: dict | None) -> list[TextContent]: """Handle tool calls.""" if name == "search_knowledge": return await self._search_knowledge(arguments or {}) elif name == "index_knowledge": return await self._index_knowledge(arguments or {}) elif name == "get_knowledge_stats": return await self._get_stats() else: raise ValueError(f"Unknown tool: {name}") async def _search_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]: """Search the knowledge base semantically.""" query = arguments.get("query", "") top_k = arguments.get("top_k", 5) if not query: return [TextContent(type="text", text="Query cannot be empty.")] # Ensure we've indexed if not self._indexed: await self._index_knowledge({}) try: # Search with embeddings results = self.vector_store.search( query=query, top_k=top_k, ) if not results: return [ TextContent( type="text", text="No results found. Try indexing your knowledge base first.", ) ] # Format results output = [] for i, result in enumerate(results, 1): source = result["metadata"].get("file_name", "unknown") heading = result["metadata"].get("heading", "") score = result.get("score", 0) text = result["text"][:500] # Truncate long text if len(result["text"]) > 500: text += "..." output.append( f"--- Result {i} ---\n" f"Source: {source}" + (f" > {heading}" if heading else "") + f"\nRelevance: {score:.2f}\n\n{text}\n" ) return [TextContent(type="text", text="\n".join(output))] except Exception as e: logger.exception("Search error") return [TextContent(type="text", text=f"Search error: {str(e)}")] async def _index_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]: """Index the knowledge base.""" force = arguments.get("force", False) vault_path = Path(self.vault_path) if not vault_path.exists(): return [TextContent(type="text", text=f"Vault path does not exist: {self.vault_path}")] try: # Clear existing index if forced if force: logger.info("Force re-indexing...") self.vector_store.clear() chunks = self.chunker.chunk_directory(str(vault_path)) new_chunks = chunks else: logger.info("Indexing knowledge base (incremental)...") # Chunk all markdown files all_chunks = self.chunker.chunk_directory(str(vault_path)) if not all_chunks: return [TextContent(type="text", text="No markdown files found in vault.")] # Get current sources from the vault current_sources = set() for chunk in all_chunks: source = chunk.metadata.get("source") if source: current_sources.add(source) # Get indexed sources and detect deleted files indexed_sources = self.vector_store.get_existing_sources() deleted_sources = indexed_sources - current_sources # Remove chunks from deleted files if deleted_sources: logger.info(f"Removing chunks from deleted files: {deleted_sources}") for source in deleted_sources: self.vector_store.remove_chunks_by_source(source) # Filter to only new/modified chunks new_chunks = self.vector_store.filter_new_chunks(all_chunks) if not new_chunks: # Check if we have any existing index stats = self.vector_store.get_stats() if stats["total_chunks"] > 0: return [ TextContent( type="text", text=f"No new or modified files to index.\n" f"Total chunks in index: {stats['total_chunks']}", ) ] # No existing index, fall through to index everything new_chunks = all_chunks logger.info(f"Processing {len(new_chunks)} new/modified chunks...") # Add to vector store (this embeds them) if new_chunks: self.vector_store.add_nodes(new_chunks, embedding_model=self.embedding_model) self._indexed = True stats = self.vector_store.get_stats() return [ TextContent( type="text", text=f"Successfully indexed {len(new_chunks)} chunks from the knowledge base.\n" f"Total chunks in index: {stats['total_chunks']}", ) ] except Exception as e: logger.exception("Indexing error") return [TextContent(type="text", text=f"Indexing error: {str(e)}")] async def _get_stats(self) -> list[TextContent]: """Get knowledge base statistics.""" stats = self.vector_store.get_stats() vault_path = Path(self.vault_path) md_files = list(vault_path.rglob("*.md")) if vault_path.exists() else [] return [ TextContent( type="text", text=f"Knowledge Base Statistics:\n" f"- Vault path: {self.vault_path}\n" f"- Markdown files: {len(md_files)}\n" f"- Indexed chunks: {stats['total_chunks']}\n" f"- Index status: {'Ready' if self._indexed else 'Not indexed'}", ) ] async def run(self): """Run the MCP server.""" logger.info(f"Starting Knowledge RAG MCP Server") logger.info(f"Vault path: {self.vault_path}") # Auto-index on startup await self._index_knowledge({}) # Run stdio server async with stdio_server() as (read_stream, write_stream): await self.server.run( read_stream, write_stream, self.server.create_initialization_options(), ) async def main(): """Main entry point.""" server = KnowledgeMCPServer() await server.run() if __name__ == "__main__": import asyncio asyncio.run(main())