Initial setup: Knowledge base RAG system with LlamaIndex and ChromaDB

- Add Python project with uv package manager
- Implement LlamaIndex + ChromaDB RAG pipeline
- Add sentence-transformers for local embeddings (all-MiniLM-L6-v2)
- Create MCP server with semantic search, indexing, and stats tools
- Add Markdown chunker with heading/wikilink/frontmatter support
- Add Dockerfile and docker-compose.yaml for self-hosted deployment
- Include sample Obsidian vault files for testing
- Add .gitignore and .env.example
This commit is contained in:
2026-03-03 20:42:42 -05:00
parent 94dd158d1c
commit 11c3f705ce
11 changed files with 5319 additions and 0 deletions

View File

@@ -0,0 +1,181 @@
"""Markdown-aware document chunking for Obsidian vault."""
import os
import re
from pathlib import Path
from typing import List, Optional
from llama_index.core.schema import TextNode
class MarkdownChunker:
"""Intelligent markdown chunker for Obsidian vaults.
Chunks markdown files while preserving:
- Document/folder structure context
- Code blocks as atomic units
- Heading hierarchy
- Wiki links as metadata
"""
# Default chunk settings
DEFAULT_CHUNK_SIZE = 512
DEFAULT_CHUNK_OVERLAP = 50
def __init__(
self,
chunk_size: int = DEFAULT_CHUNK_SIZE,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
def chunk_file(self, file_path: str, content: str) -> List[TextNode]:
"""Chunk a single markdown file.
Args:
file_path: Path to the markdown file
content: Raw markdown content
Returns:
List of TextNode chunks with metadata
"""
# Extract frontmatter if present
frontmatter, body = self._extract_frontmatter(content)
# Extract wiki links for metadata
wiki_links = self._extract_wiki_links(body)
# Get relative path for context
rel_path = os.path.relpath(file_path)
# Split into sections based on headings
sections = self._split_by_headings(body)
chunks = []
for i, section in enumerate(sections):
if not section["content"].strip():
continue
# Create chunk with metadata
# Note: wiki_links must be a string for ChromaDB compatibility
node = TextNode(
text=section["content"],
metadata={
"source": rel_path,
"file_name": os.path.basename(file_path),
"heading": section.get("heading", ""),
"section_index": i,
"wiki_links": ",".join(wiki_links) if wiki_links else "",
"has_frontmatter": frontmatter is not None,
},
excluded_embed_metadata_keys=["wiki_links"],
excluded_search_metadata_keys=["wiki_links"],
)
chunks.append(node)
return chunks
def chunk_directory(self, dir_path: str) -> List[TextNode]:
"""Chunk all markdown files in a directory recursively.
Args:
dir_path: Root directory containing markdown files
Returns:
List of all TextNode chunks
"""
all_chunks = []
dir_path = Path(dir_path)
if not dir_path.exists():
raise FileNotFoundError(f"Directory not found: {dir_path}")
# Find all .md files
md_files = list(dir_path.rglob("*.md"))
for md_file in md_files:
try:
content = md_file.read_text(encoding="utf-8")
chunks = self.chunk_file(str(md_file), content)
all_chunks.extend(chunks)
except Exception as e:
print(f"Error chunking {md_file}: {e}")
continue
return all_chunks
def _extract_frontmatter(
self, content: str
) -> tuple[Optional[dict], str]:
"""Extract YAML frontmatter from markdown."""
if not content.startswith("---"):
return None, content
# Find closing ---
lines = content.split("\n")
if len(lines) < 3:
return None, content
frontmatter_lines = []
body_start = 2
for i in range(1, len(lines)):
if lines[i].strip() == "---":
body_start = i + 1
break
frontmatter_lines.append(lines[i])
# Parse simple key-value frontmatter
frontmatter = {}
for line in frontmatter_lines:
if ":" in line:
key, value = line.split(":", 1)
frontmatter[key.strip()] = value.strip()
body = "\n".join(lines[body_start:])
return frontmatter, body
def _extract_wiki_links(self, content: str) -> List[str]:
"""Extract [[wiki links]] from markdown content."""
wiki_link_pattern = r"\[\[([^\]|]+)(?:\|[^\]]+)?\]]"
return re.findall(wiki_link_pattern, content)
def _split_by_headings(self, content: str) -> List[dict]:
"""Split content by markdown headings while preserving context."""
# Split by heading lines (# ## ### etc)
heading_pattern = r"^(#{1,6})\s+(.+)$"
sections = []
current_section = {
"heading": "",
"content": "",
}
lines = content.split("\n")
for line in lines:
match = re.match(heading_pattern, line)
if match:
# Save current section if non-empty
if current_section["content"].strip():
sections.append(current_section)
# Start new section
level = len(match.group(1))
heading_text = match.group(2).strip()
current_section = {
"heading": heading_text,
"content": line + "\n",
}
else:
current_section["content"] += line + "\n"
# Don't forget the last section
if current_section["content"].strip():
sections.append(current_section)
# If no headings found, treat entire content as one section
if not sections:
sections = [{"heading": "", "content": content}]
return sections