Add incremental indexing with deleted-file detection

- Add file_mtime to chunk metadata for change detection
- Add get_indexed_files() and get_existing_sources() methods
- Add filter_new_chunks() to skip unchanged files
- Add remove_chunks_by_source() to delete orphaned chunks
- Update server to detect deleted files and remove their orphaned chunks on incremental index
- Fix clear() to recreate ChromaVectorStore wrapper
This commit is contained in:
2026-03-04 16:24:27 -05:00
parent 46afc4c256
commit 1f3450c2f8
3 changed files with 156 additions and 57 deletions

View File

@@ -49,6 +49,9 @@ class MarkdownChunker:
# Get relative path for context
rel_path = os.path.relpath(file_path)
# Get file modification time for change detection
file_mtime = os.path.getmtime(file_path)
# Split into sections based on headings
sections = self._split_by_headings(body)
@@ -64,6 +67,7 @@ class MarkdownChunker:
metadata={
"source": rel_path,
"file_name": os.path.basename(file_path),
"file_mtime": file_mtime,
"heading": section.get("heading", ""),
"section_index": i,
"wiki_links": ",".join(wiki_links) if wiki_links else "",
@@ -105,9 +109,7 @@ class MarkdownChunker:
return all_chunks
def _extract_frontmatter(
self, content: str
) -> tuple[Optional[dict], str]:
def _extract_frontmatter(self, content: str) -> tuple[Optional[dict], str]:
"""Extract YAML frontmatter from markdown."""
if not content.startswith("---"):
return None, content