Add incremental indexing with deleted file detection
- Add file_mtime to chunk metadata for change detection
- Add get_indexed_files() and get_existing_sources() methods
- Add filter_new_chunks() to skip unchanged files
- Add remove_chunks_by_source() to delete orphaned chunks
- Update server to detect and remove deleted files on incremental index
- Fix clear() to recreate ChromaVectorStore wrapper
This commit is contained in:
@ -35,9 +35,7 @@ class KnowledgeMCPServer:
|
||||
|
||||
def __init__(self, vault_path: str | None = None):
|
||||
# Get vault path from environment or use default
|
||||
self.vault_path = vault_path or os.environ.get(
|
||||
"VAULT_PATH", "/data/vault"
|
||||
)
|
||||
self.vault_path = vault_path or os.environ.get("VAULT_PATH", "/data/vault")
|
||||
|
||||
# Ensure vault path exists
|
||||
if not Path(self.vault_path).exists():
|
||||
@ -45,9 +43,7 @@ class KnowledgeMCPServer:
|
||||
|
||||
# Initialize components
|
||||
self.embedding_model = get_embedding_model()
|
||||
self.vector_store = KnowledgeVectorStore(
|
||||
embedding_model=self.embedding_model
|
||||
)
|
||||
self.vector_store = KnowledgeVectorStore(embedding_model=self.embedding_model)
|
||||
self.chunker = MarkdownChunker()
|
||||
|
||||
# Track indexing status
|
||||
@ -69,9 +65,9 @@ class KnowledgeMCPServer:
|
||||
Tool(
|
||||
name="search_knowledge",
|
||||
description="Semantic search through the knowledge base. "
|
||||
"Uses embeddings to find relevant content based on meaning, "
|
||||
"not just keywords. Best for answering questions or finding "
|
||||
"related concepts.",
|
||||
"Uses embeddings to find relevant content based on meaning, "
|
||||
"not just keywords. Best for answering questions or finding "
|
||||
"related concepts.",
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@ -91,8 +87,8 @@ class KnowledgeMCPServer:
|
||||
Tool(
|
||||
name="index_knowledge",
|
||||
description="Index or re-index the knowledge base. "
|
||||
"Run this after adding new files to the vault. "
|
||||
"Scans all markdown files and builds the search index.",
|
||||
"Run this after adding new files to the vault. "
|
||||
"Scans all markdown files and builds the search index.",
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@ -115,9 +111,7 @@ class KnowledgeMCPServer:
|
||||
]
|
||||
|
||||
@self.server.call_tool()
|
||||
async def call_tool(
|
||||
name: str, arguments: dict | None
|
||||
) -> list[TextContent]:
|
||||
async def call_tool(name: str, arguments: dict | None) -> list[TextContent]:
|
||||
"""Handle tool calls."""
|
||||
if name == "search_knowledge":
|
||||
return await self._search_knowledge(arguments or {})
|
||||
@ -128,9 +122,7 @@ class KnowledgeMCPServer:
|
||||
else:
|
||||
raise ValueError(f"Unknown tool: {name}")
|
||||
|
||||
async def _search_knowledge(
|
||||
self, arguments: dict[str, Any]
|
||||
) -> list[TextContent]:
|
||||
async def _search_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||
"""Search the knowledge base semantically."""
|
||||
query = arguments.get("query", "")
|
||||
top_k = arguments.get("top_k", 5)
|
||||
@ -153,7 +145,7 @@ class KnowledgeMCPServer:
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text="No results found. Try indexing your knowledge base first."
|
||||
text="No results found. Try indexing your knowledge base first.",
|
||||
)
|
||||
]
|
||||
|
||||
@ -181,45 +173,70 @@ class KnowledgeMCPServer:
|
||||
logger.exception("Search error")
|
||||
return [TextContent(type="text", text=f"Search error: {str(e)}")]
|
||||
|
||||
async def _index_knowledge(
|
||||
self, arguments: dict[str, Any]
|
||||
) -> list[TextContent]:
|
||||
async def _index_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||
"""Index the knowledge base."""
|
||||
force = arguments.get("force", False)
|
||||
|
||||
vault_path = Path(self.vault_path)
|
||||
|
||||
if not vault_path.exists():
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text=f"Vault path does not exist: {self.vault_path}"
|
||||
)
|
||||
]
|
||||
return [TextContent(type="text", text=f"Vault path does not exist: {self.vault_path}")]
|
||||
|
||||
try:
|
||||
# Clear existing index if forced
|
||||
if force:
|
||||
logger.info("Force re-indexing...")
|
||||
self.vector_store.clear()
|
||||
chunks = self.chunker.chunk_directory(str(vault_path))
|
||||
new_chunks = chunks
|
||||
else:
|
||||
logger.info("Indexing knowledge base...")
|
||||
logger.info("Indexing knowledge base (incremental)...")
|
||||
|
||||
# Chunk all markdown files
|
||||
chunks = self.chunker.chunk_directory(str(vault_path))
|
||||
# Chunk all markdown files
|
||||
all_chunks = self.chunker.chunk_directory(str(vault_path))
|
||||
|
||||
if not chunks:
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text="No markdown files found in vault."
|
||||
)
|
||||
]
|
||||
if not all_chunks:
|
||||
return [TextContent(type="text", text="No markdown files found in vault.")]
|
||||
|
||||
logger.info(f"Created {len(chunks)} chunks, adding to vector store...")
|
||||
# Get current sources from the vault
|
||||
current_sources = set()
|
||||
for chunk in all_chunks:
|
||||
source = chunk.metadata.get("source")
|
||||
if source:
|
||||
current_sources.add(source)
|
||||
|
||||
# Get indexed sources and detect deleted files
|
||||
indexed_sources = self.vector_store.get_existing_sources()
|
||||
deleted_sources = indexed_sources - current_sources
|
||||
|
||||
# Remove chunks from deleted files
|
||||
if deleted_sources:
|
||||
logger.info(f"Removing chunks from deleted files: {deleted_sources}")
|
||||
for source in deleted_sources:
|
||||
self.vector_store.remove_chunks_by_source(source)
|
||||
|
||||
# Filter to only new/modified chunks
|
||||
new_chunks = self.vector_store.filter_new_chunks(all_chunks)
|
||||
|
||||
if not new_chunks:
|
||||
# Check if we have any existing index
|
||||
stats = self.vector_store.get_stats()
|
||||
if stats["total_chunks"] > 0:
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text=f"No new or modified files to index.\n"
|
||||
f"Total chunks in index: {stats['total_chunks']}",
|
||||
)
|
||||
]
|
||||
# No existing index, fall through to index everything
|
||||
new_chunks = all_chunks
|
||||
|
||||
logger.info(f"Processing {len(new_chunks)} new/modified chunks...")
|
||||
|
||||
# Add to vector store (this embeds them)
|
||||
self.vector_store.add_nodes(chunks, embedding_model=self.embedding_model)
|
||||
if new_chunks:
|
||||
self.vector_store.add_nodes(new_chunks, embedding_model=self.embedding_model)
|
||||
|
||||
self._indexed = True
|
||||
|
||||
@ -227,8 +244,8 @@ class KnowledgeMCPServer:
|
||||
return [
|
||||
TextContent(
|
||||
type="text",
|
||||
text=f"Successfully indexed {len(chunks)} chunks from the knowledge base.\n"
|
||||
f"Total chunks in index: {stats['total_chunks']}"
|
||||
text=f"Successfully indexed {len(new_chunks)} chunks from the knowledge base.\n"
|
||||
f"Total chunks in index: {stats['total_chunks']}",
|
||||
)
|
||||
]
|
||||
|
||||
@ -247,10 +264,10 @@ class KnowledgeMCPServer:
|
||||
TextContent(
|
||||
type="text",
|
||||
text=f"Knowledge Base Statistics:\n"
|
||||
f"- Vault path: {self.vault_path}\n"
|
||||
f"- Markdown files: {len(md_files)}\n"
|
||||
f"- Indexed chunks: {stats['total_chunks']}\n"
|
||||
f"- Index status: {'Ready' if self._indexed else 'Not indexed'}"
|
||||
f"- Vault path: {self.vault_path}\n"
|
||||
f"- Markdown files: {len(md_files)}\n"
|
||||
f"- Indexed chunks: {stats['total_chunks']}\n"
|
||||
f"- Index status: {'Ready' if self._indexed else 'Not indexed'}",
|
||||
)
|
||||
]
|
||||
|
||||
@ -279,4 +296,5 @@ async def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user