Add incremental indexing with deleted file detection
- Add file_mtime to chunk metadata for change detection
- Add get_indexed_files() and get_existing_sources() methods
- Add filter_new_chunks() to skip unchanged files
- Add remove_chunks_by_source() to delete orphaned chunks
- Update server to detect and remove deleted files on incremental index
- Fix clear() to recreate ChromaVectorStore wrapper
This commit is contained in:
@ -49,6 +49,9 @@ class MarkdownChunker:
|
||||
# Get relative path for context
|
||||
rel_path = os.path.relpath(file_path)
|
||||
|
||||
# Get file modification time for change detection
|
||||
file_mtime = os.path.getmtime(file_path)
|
||||
|
||||
# Split into sections based on headings
|
||||
sections = self._split_by_headings(body)
|
||||
|
||||
@ -64,6 +67,7 @@ class MarkdownChunker:
|
||||
metadata={
|
||||
"source": rel_path,
|
||||
"file_name": os.path.basename(file_path),
|
||||
"file_mtime": file_mtime,
|
||||
"heading": section.get("heading", ""),
|
||||
"section_index": i,
|
||||
"wiki_links": ",".join(wiki_links) if wiki_links else "",
|
||||
@ -105,9 +109,7 @@ class MarkdownChunker:
|
||||
|
||||
return all_chunks
|
||||
|
||||
def _extract_frontmatter(
|
||||
self, content: str
|
||||
) -> tuple[Optional[dict], str]:
|
||||
def _extract_frontmatter(self, content: str) -> tuple[Optional[dict], str]:
|
||||
"""Extract YAML frontmatter from markdown."""
|
||||
if not content.startswith("---"):
|
||||
return None, content
|
||||
|
||||
Reference in New Issue
Block a user