Compare commits
4 Commits
94dd158d1c
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 1f3450c2f8 | |||
| 46afc4c256 | |||
| 8d09d03fe8 | |||
| 11c3f705ce |
15
.env.example
Normal file
15
.env.example
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Knowledge RAG Configuration
|
||||||
|
|
||||||
|
# Path to your Obsidian vault (must contain markdown files)
|
||||||
|
# This should be an absolute path or relative to where you run docker-compose
|
||||||
|
VAULT_PATH=./knowledge
|
||||||
|
|
||||||
|
# Embedding model to use
|
||||||
|
# Default: all-MiniLM-L6-v2 (fast, good quality, ~90MB)
|
||||||
|
# Other options:
|
||||||
|
# - all-mpnet-base-v2 (higher quality, slower, ~420MB)
|
||||||
|
# - BAAI/bge-small-en-v1.5 (good quality, ~130MB)
|
||||||
|
EMBEDDING_MODEL=all-MiniLM-L6-v2
|
||||||
|
|
||||||
|
# Optional: Log level (DEBUG, INFO, WARNING, ERROR)
|
||||||
|
LOG_LEVEL=INFO
|
||||||
47
.gitignore
vendored
Normal file
47
.gitignore
vendored
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
.env/
|
||||||
|
|
||||||
|
# IDEs
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# uv
|
||||||
|
.ruff_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Data directories (should be mounted externally)
|
||||||
|
data/
|
||||||
|
knowledge/
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
172
README.md
172
README.md
@ -1,21 +1,171 @@
|
|||||||
# Knowledge Base
|
# Knowledge Base RAG System
|
||||||
|
|
||||||
Personal knowledge base repository for storing useful information, notes, and documentation.
|
A self-hosted RAG (Retrieval Augmented Generation) system for your Obsidian vault with MCP server integration.
|
||||||
|
|
||||||
## Contents
|
## Features
|
||||||
|
|
||||||
- [Getting Started](#getting-started)
|
- **Semantic Search**: Find relevant content using embeddings, not just keywords
|
||||||
- [Contributing](#contributing)
|
- **MCP Server**: Exposes search, indexing, and stats tools via MCP protocol
|
||||||
- [License](#license)
|
- **Local-first**: No external APIs - everything runs locally
|
||||||
|
- **Obsidian Compatible**: Works with your existing markdown vault
|
||||||
|
|
||||||
## Getting Started
|
## Requirements
|
||||||
|
|
||||||
This repository contains various knowledge articles, how-to guides, and reference documentation.
|
- Python 3.11+
|
||||||
|
- ~2GB disk space for embeddings model
|
||||||
|
|
||||||
## Contributing
|
## Quick Start
|
||||||
|
|
||||||
Feel free to contribute by creating issues or submitting pull requests.
|
### 1. Install uv (if not already)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
source ~/.local/bin/env
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Clone and setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ~/knowledge-base
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Configure
|
||||||
|
|
||||||
|
Edit `.env` to set your vault path:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
VAULT_PATH=/path/to/your/obsidian-vault
|
||||||
|
EMBEDDING_MODEL=all-MiniLM-L6-v2 # optional
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Install dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv sync
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Run the server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
VAULT_PATH=./knowledge python -m knowledge_rag.server
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will:
|
||||||
|
- Auto-index your vault on startup
|
||||||
|
- Listen for MCP requests via stdio
|
||||||
|
|
||||||
|
## MCP Tools
|
||||||
|
|
||||||
|
Once running, these tools are available:
|
||||||
|
|
||||||
|
| Tool | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `search_knowledge` | Semantic search across your vault |
|
||||||
|
| `index_knowledge` | Re-index the vault (use after adding files) |
|
||||||
|
| `get_knowledge_stats` | View indexing statistics |
|
||||||
|
|
||||||
|
## Usage Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example: Searching the knowledge base
|
||||||
|
# (via MCP client or Claude Desktop integration)
|
||||||
|
|
||||||
|
await search_knowledge({
|
||||||
|
"query": "how does the RAG system work",
|
||||||
|
"top_k": 5
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Auto-Start on Boot
|
||||||
|
|
||||||
|
### Option 1: Systemd Service
|
||||||
|
|
||||||
|
Create `/etc/systemd/system/knowledge-rag.service`:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Knowledge Base RAG MCP Server
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ernie
|
||||||
|
WorkingDirectory=/home/ernie/knowledge-base
|
||||||
|
Environment="VAULT_PATH=/home/ernie/knowledge"
|
||||||
|
Environment="PATH=/home/ernie/.local/bin:/usr/bin:/bin"
|
||||||
|
ExecStart=/home/ernie/knowledge-base/.venv/bin/python -m knowledge_rag.server
|
||||||
|
Restart=always
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
Then enable:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable knowledge-rag.service
|
||||||
|
sudo systemctl start knowledge-rag.service
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 2: tmux/screen
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start in tmux
|
||||||
|
tmux new -s knowledge-rag
|
||||||
|
source .venv/bin/activate
|
||||||
|
VAULT_PATH=./knowledge python -m knowledge_rag.server
|
||||||
|
# Detach: Ctrl+b, then d
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 3: rc.local or startup script
|
||||||
|
|
||||||
|
Add to your `~/.bashrc` or startup script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Only start if not already running
|
||||||
|
if ! pgrep -f "knowledge_rag.server" > /dev/null; then
|
||||||
|
cd ~/knowledge-base
|
||||||
|
source .venv/bin/activate
|
||||||
|
VAULT_PATH=./knowledge nohup python -m knowledge_rag.server > /tmp/knowledge-rag.log 2>&1 &
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
knowledge-base/
|
||||||
|
├── src/knowledge_rag/ # Source code
|
||||||
|
│ ├── server.py # MCP server
|
||||||
|
│ ├── chunker.py # Markdown chunking
|
||||||
|
│ ├── embeddings.py # Sentence-transformers wrapper
|
||||||
|
│ └── vector_store.py # ChromaDB wrapper
|
||||||
|
├── knowledge/ # Your Obsidian vault (gitignored)
|
||||||
|
├── pyproject.toml # Project config
|
||||||
|
└── .env.example # Environment template
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `VAULT_PATH` | `/data/vault` | Path to your Obsidian vault |
|
||||||
|
| `EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | Sentence-transformers model |
|
||||||
|
| `EMBEDDINGS_CACHE_DIR` | `/data/embeddings_cache` | Model cache location |
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### First run is slow
|
||||||
|
The embedding model (~90MB) downloads on first run. Subsequent runs are faster.
|
||||||
|
|
||||||
|
### No search results
|
||||||
|
Run `index_knowledge` tool to index your vault, or restart the server.
|
||||||
|
|
||||||
|
### Out of memory
|
||||||
|
The default model is lightweight. For even smaller models, try `paraphrase-MiniLM-L3-v2`.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
MIT License
|
MIT
|
||||||
|
|||||||
42
pyproject.toml
Normal file
42
pyproject.toml
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
[project]
|
||||||
|
name = "knowledge-rag"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "RAG system for Obsidian vault knowledge base with MCP server"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"llama-index>=0.10.0",
|
||||||
|
"llama-index-vector-stores-chroma>=0.1.0",
|
||||||
|
"chromadb>=0.4.0",
|
||||||
|
"sentence-transformers>=2.2.0",
|
||||||
|
"mcp>=1.0.0",
|
||||||
|
"python-dotenv>=1.0.0",
|
||||||
|
"pydantic>=2.0.0",
|
||||||
|
"watchdog>=3.0.0",
|
||||||
|
"httpx>=0.25.0",
|
||||||
|
# CPU-only PyTorch
|
||||||
|
"torch>=2.0.0",
|
||||||
|
"numpy>=1.24.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=7.0.0",
|
||||||
|
"pytest-asyncio>=0.21.0",
|
||||||
|
"ruff>=0.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["src/knowledge_rag"]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 100
|
||||||
|
target-version = "py311"
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["E", "F", "I", "N", "W"]
|
||||||
|
ignore = ["E501"]
|
||||||
3
src/knowledge_rag/__init__.py
Normal file
3
src/knowledge_rag/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
"""Knowledge RAG - RAG system for Obsidian vault knowledge base."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
183
src/knowledge_rag/chunker.py
Normal file
183
src/knowledge_rag/chunker.py
Normal file
@ -0,0 +1,183 @@
|
|||||||
|
"""Markdown-aware document chunking for Obsidian vault."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from llama_index.core.schema import TextNode
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownChunker:
|
||||||
|
"""Intelligent markdown chunker for Obsidian vaults.
|
||||||
|
|
||||||
|
Chunks markdown files while preserving:
|
||||||
|
- Document/folder structure context
|
||||||
|
- Code blocks as atomic units
|
||||||
|
- Heading hierarchy
|
||||||
|
- Wiki links as metadata
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Default chunk settings
|
||||||
|
DEFAULT_CHUNK_SIZE = 512
|
||||||
|
DEFAULT_CHUNK_OVERLAP = 50
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
||||||
|
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
|
||||||
|
):
|
||||||
|
self.chunk_size = chunk_size
|
||||||
|
self.chunk_overlap = chunk_overlap
|
||||||
|
|
||||||
|
def chunk_file(self, file_path: str, content: str) -> List[TextNode]:
|
||||||
|
"""Chunk a single markdown file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the markdown file
|
||||||
|
content: Raw markdown content
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of TextNode chunks with metadata
|
||||||
|
"""
|
||||||
|
# Extract frontmatter if present
|
||||||
|
frontmatter, body = self._extract_frontmatter(content)
|
||||||
|
|
||||||
|
# Extract wiki links for metadata
|
||||||
|
wiki_links = self._extract_wiki_links(body)
|
||||||
|
|
||||||
|
# Get relative path for context
|
||||||
|
rel_path = os.path.relpath(file_path)
|
||||||
|
|
||||||
|
# Get file modification time for change detection
|
||||||
|
file_mtime = os.path.getmtime(file_path)
|
||||||
|
|
||||||
|
# Split into sections based on headings
|
||||||
|
sections = self._split_by_headings(body)
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
for i, section in enumerate(sections):
|
||||||
|
if not section["content"].strip():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create chunk with metadata
|
||||||
|
# Note: wiki_links must be a string for ChromaDB compatibility
|
||||||
|
node = TextNode(
|
||||||
|
text=section["content"],
|
||||||
|
metadata={
|
||||||
|
"source": rel_path,
|
||||||
|
"file_name": os.path.basename(file_path),
|
||||||
|
"file_mtime": file_mtime,
|
||||||
|
"heading": section.get("heading", ""),
|
||||||
|
"section_index": i,
|
||||||
|
"wiki_links": ",".join(wiki_links) if wiki_links else "",
|
||||||
|
"has_frontmatter": frontmatter is not None,
|
||||||
|
},
|
||||||
|
excluded_embed_metadata_keys=["wiki_links"],
|
||||||
|
excluded_search_metadata_keys=["wiki_links"],
|
||||||
|
)
|
||||||
|
chunks.append(node)
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def chunk_directory(self, dir_path: str) -> List[TextNode]:
|
||||||
|
"""Chunk all markdown files in a directory recursively.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dir_path: Root directory containing markdown files
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of all TextNode chunks
|
||||||
|
"""
|
||||||
|
all_chunks = []
|
||||||
|
dir_path = Path(dir_path)
|
||||||
|
|
||||||
|
if not dir_path.exists():
|
||||||
|
raise FileNotFoundError(f"Directory not found: {dir_path}")
|
||||||
|
|
||||||
|
# Find all .md files
|
||||||
|
md_files = list(dir_path.rglob("*.md"))
|
||||||
|
|
||||||
|
for md_file in md_files:
|
||||||
|
try:
|
||||||
|
content = md_file.read_text(encoding="utf-8")
|
||||||
|
chunks = self.chunk_file(str(md_file), content)
|
||||||
|
all_chunks.extend(chunks)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error chunking {md_file}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
return all_chunks
|
||||||
|
|
||||||
|
def _extract_frontmatter(self, content: str) -> tuple[Optional[dict], str]:
|
||||||
|
"""Extract YAML frontmatter from markdown."""
|
||||||
|
if not content.startswith("---"):
|
||||||
|
return None, content
|
||||||
|
|
||||||
|
# Find closing ---
|
||||||
|
lines = content.split("\n")
|
||||||
|
if len(lines) < 3:
|
||||||
|
return None, content
|
||||||
|
|
||||||
|
frontmatter_lines = []
|
||||||
|
body_start = 2
|
||||||
|
|
||||||
|
for i in range(1, len(lines)):
|
||||||
|
if lines[i].strip() == "---":
|
||||||
|
body_start = i + 1
|
||||||
|
break
|
||||||
|
frontmatter_lines.append(lines[i])
|
||||||
|
|
||||||
|
# Parse simple key-value frontmatter
|
||||||
|
frontmatter = {}
|
||||||
|
for line in frontmatter_lines:
|
||||||
|
if ":" in line:
|
||||||
|
key, value = line.split(":", 1)
|
||||||
|
frontmatter[key.strip()] = value.strip()
|
||||||
|
|
||||||
|
body = "\n".join(lines[body_start:])
|
||||||
|
return frontmatter, body
|
||||||
|
|
||||||
|
def _extract_wiki_links(self, content: str) -> List[str]:
|
||||||
|
"""Extract [[wiki links]] from markdown content."""
|
||||||
|
wiki_link_pattern = r"\[\[([^\]|]+)(?:\|[^\]]+)?\]]"
|
||||||
|
return re.findall(wiki_link_pattern, content)
|
||||||
|
|
||||||
|
def _split_by_headings(self, content: str) -> List[dict]:
|
||||||
|
"""Split content by markdown headings while preserving context."""
|
||||||
|
# Split by heading lines (# ## ### etc)
|
||||||
|
heading_pattern = r"^(#{1,6})\s+(.+)$"
|
||||||
|
|
||||||
|
sections = []
|
||||||
|
current_section = {
|
||||||
|
"heading": "",
|
||||||
|
"content": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
lines = content.split("\n")
|
||||||
|
for line in lines:
|
||||||
|
match = re.match(heading_pattern, line)
|
||||||
|
if match:
|
||||||
|
# Save current section if non-empty
|
||||||
|
if current_section["content"].strip():
|
||||||
|
sections.append(current_section)
|
||||||
|
|
||||||
|
# Start new section
|
||||||
|
level = len(match.group(1))
|
||||||
|
heading_text = match.group(2).strip()
|
||||||
|
current_section = {
|
||||||
|
"heading": heading_text,
|
||||||
|
"content": line + "\n",
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
current_section["content"] += line + "\n"
|
||||||
|
|
||||||
|
# Don't forget the last section
|
||||||
|
if current_section["content"].strip():
|
||||||
|
sections.append(current_section)
|
||||||
|
|
||||||
|
# If no headings found, treat entire content as one section
|
||||||
|
if not sections:
|
||||||
|
sections = [{"heading": "", "content": content}]
|
||||||
|
|
||||||
|
return sections
|
||||||
75
src/knowledge_rag/embeddings.py
Normal file
75
src/knowledge_rag/embeddings.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
"""Embedding model wrapper using sentence-transformers."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Any
|
||||||
|
|
||||||
|
from llama_index.core.embeddings import BaseEmbedding
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
|
class LocalEmbeddingModel(BaseEmbedding):
|
||||||
|
"""Local embedding model using sentence-transformers.
|
||||||
|
|
||||||
|
Uses a lightweight, high-quality model for semantic similarity.
|
||||||
|
Default model: 'all-MiniLM-L6-v2' - fast and good quality.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_name: str = "all-MiniLM-L6-v2",
|
||||||
|
cache_folder: str | None = None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
# Store model name before super init
|
||||||
|
self._model_name = model_name
|
||||||
|
|
||||||
|
# Use persistent cache directory for Docker, or local cache for development
|
||||||
|
if cache_folder is None:
|
||||||
|
if os.path.exists("/data"):
|
||||||
|
cache_folder = "/data/embeddings_cache"
|
||||||
|
else:
|
||||||
|
cache_folder = None
|
||||||
|
|
||||||
|
# Load model first
|
||||||
|
model = SentenceTransformer(model_name, cache_folder=cache_folder)
|
||||||
|
embed_dim = model.get_sentence_embedding_dimension()
|
||||||
|
|
||||||
|
# Initialize pydantic model with required fields
|
||||||
|
super().__init__(
|
||||||
|
embed_dim=embed_dim,
|
||||||
|
model_name=model_name,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Now set the model after pydantic init
|
||||||
|
object.__setattr__(self, '_model', model)
|
||||||
|
|
||||||
|
def _get_text_embedding(self, text: str) -> List[float]:
|
||||||
|
"""Get embedding for a single text."""
|
||||||
|
return self._model.encode(text, convert_to_numpy=True).tolist()
|
||||||
|
|
||||||
|
async def _aget_text_embedding(self, text: str) -> List[float]:
|
||||||
|
"""Async get embedding - synchronous for local model."""
|
||||||
|
return self._get_text_embedding(text)
|
||||||
|
|
||||||
|
def _get_query_embedding(self, query: str) -> List[float]:
|
||||||
|
"""Get embedding for a query."""
|
||||||
|
return self._model.encode(query, convert_to_numpy=True).tolist()
|
||||||
|
|
||||||
|
async def _aget_query_embedding(self, query: str) -> List[float]:
|
||||||
|
"""Async get query embedding - synchronous for local model."""
|
||||||
|
return self._get_query_embedding(query)
|
||||||
|
|
||||||
|
def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
|
||||||
|
"""Get embeddings for multiple texts."""
|
||||||
|
return self._model.encode(texts, convert_to_numpy=True).tolist()
|
||||||
|
|
||||||
|
async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
|
||||||
|
"""Async get embeddings - synchronous for local model."""
|
||||||
|
return self._get_text_embeddings(texts)
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding_model() -> LocalEmbeddingModel:
|
||||||
|
"""Factory function to create the embedding model."""
|
||||||
|
model_name = os.environ.get("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
|
||||||
|
return LocalEmbeddingModel(model_name=model_name)
|
||||||
300
src/knowledge_rag/server.py
Normal file
300
src/knowledge_rag/server.py
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
"""MCP server for knowledge base RAG system."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from mcp.server import Server
|
||||||
|
from mcp.server.stdio import stdio_server
|
||||||
|
from mcp.types import Tool, TextContent
|
||||||
|
from pydantic import AnyUrl
|
||||||
|
|
||||||
|
from .chunker import MarkdownChunker
|
||||||
|
from .embeddings import get_embedding_model
|
||||||
|
from .vector_store import KnowledgeVectorStore
|
||||||
|
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class KnowledgeMCPServer:
|
||||||
|
"""MCP server for semantic search in Obsidian vault.
|
||||||
|
|
||||||
|
Provides tools to:
|
||||||
|
- Search the knowledge base semantically
|
||||||
|
- Index/update the knowledge base
|
||||||
|
- Get statistics about indexed content
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, vault_path: str | None = None):
|
||||||
|
# Get vault path from environment or use default
|
||||||
|
self.vault_path = vault_path or os.environ.get("VAULT_PATH", "/data/vault")
|
||||||
|
|
||||||
|
# Ensure vault path exists
|
||||||
|
if not Path(self.vault_path).exists():
|
||||||
|
logger.warning(f"Vault path does not exist: {self.vault_path}")
|
||||||
|
|
||||||
|
# Initialize components
|
||||||
|
self.embedding_model = get_embedding_model()
|
||||||
|
self.vector_store = KnowledgeVectorStore(embedding_model=self.embedding_model)
|
||||||
|
self.chunker = MarkdownChunker()
|
||||||
|
|
||||||
|
# Track indexing status
|
||||||
|
self._indexed = False
|
||||||
|
|
||||||
|
# Create MCP server
|
||||||
|
self.server = Server("knowledge-rag")
|
||||||
|
|
||||||
|
# Register handlers
|
||||||
|
self._register_handlers()
|
||||||
|
|
||||||
|
def _register_handlers(self):
|
||||||
|
"""Register MCP request handlers."""
|
||||||
|
|
||||||
|
@self.server.list_tools()
|
||||||
|
async def list_tools() -> list[Tool]:
|
||||||
|
"""List available MCP tools."""
|
||||||
|
return [
|
||||||
|
Tool(
|
||||||
|
name="search_knowledge",
|
||||||
|
description="Semantic search through the knowledge base. "
|
||||||
|
"Uses embeddings to find relevant content based on meaning, "
|
||||||
|
"not just keywords. Best for answering questions or finding "
|
||||||
|
"related concepts.",
|
||||||
|
inputSchema={
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"query": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The search query in natural language",
|
||||||
|
},
|
||||||
|
"top_k": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Number of results to return",
|
||||||
|
"default": 5,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"required": ["query"],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Tool(
|
||||||
|
name="index_knowledge",
|
||||||
|
description="Index or re-index the knowledge base. "
|
||||||
|
"Run this after adding new files to the vault. "
|
||||||
|
"Scans all markdown files and builds the search index.",
|
||||||
|
inputSchema={
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"force": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Force re-index (clear existing index first)",
|
||||||
|
"default": False,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Tool(
|
||||||
|
name="get_knowledge_stats",
|
||||||
|
description="Get statistics about the indexed knowledge base.",
|
||||||
|
inputSchema={
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
@self.server.call_tool()
|
||||||
|
async def call_tool(name: str, arguments: dict | None) -> list[TextContent]:
|
||||||
|
"""Handle tool calls."""
|
||||||
|
if name == "search_knowledge":
|
||||||
|
return await self._search_knowledge(arguments or {})
|
||||||
|
elif name == "index_knowledge":
|
||||||
|
return await self._index_knowledge(arguments or {})
|
||||||
|
elif name == "get_knowledge_stats":
|
||||||
|
return await self._get_stats()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown tool: {name}")
|
||||||
|
|
||||||
|
async def _search_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||||
|
"""Search the knowledge base semantically."""
|
||||||
|
query = arguments.get("query", "")
|
||||||
|
top_k = arguments.get("top_k", 5)
|
||||||
|
|
||||||
|
if not query:
|
||||||
|
return [TextContent(type="text", text="Query cannot be empty.")]
|
||||||
|
|
||||||
|
# Ensure we've indexed
|
||||||
|
if not self._indexed:
|
||||||
|
await self._index_knowledge({})
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Search with embeddings
|
||||||
|
results = self.vector_store.search(
|
||||||
|
query=query,
|
||||||
|
top_k=top_k,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
return [
|
||||||
|
TextContent(
|
||||||
|
type="text",
|
||||||
|
text="No results found. Try indexing your knowledge base first.",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Format results
|
||||||
|
output = []
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
source = result["metadata"].get("file_name", "unknown")
|
||||||
|
heading = result["metadata"].get("heading", "")
|
||||||
|
score = result.get("score", 0)
|
||||||
|
|
||||||
|
text = result["text"][:500] # Truncate long text
|
||||||
|
if len(result["text"]) > 500:
|
||||||
|
text += "..."
|
||||||
|
|
||||||
|
output.append(
|
||||||
|
f"--- Result {i} ---\n"
|
||||||
|
f"Source: {source}"
|
||||||
|
+ (f" > {heading}" if heading else "")
|
||||||
|
+ f"\nRelevance: {score:.2f}\n\n{text}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
return [TextContent(type="text", text="\n".join(output))]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("Search error")
|
||||||
|
return [TextContent(type="text", text=f"Search error: {str(e)}")]
|
||||||
|
|
||||||
|
async def _index_knowledge(self, arguments: dict[str, Any]) -> list[TextContent]:
|
||||||
|
"""Index the knowledge base."""
|
||||||
|
force = arguments.get("force", False)
|
||||||
|
|
||||||
|
vault_path = Path(self.vault_path)
|
||||||
|
|
||||||
|
if not vault_path.exists():
|
||||||
|
return [TextContent(type="text", text=f"Vault path does not exist: {self.vault_path}")]
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Clear existing index if forced
|
||||||
|
if force:
|
||||||
|
logger.info("Force re-indexing...")
|
||||||
|
self.vector_store.clear()
|
||||||
|
chunks = self.chunker.chunk_directory(str(vault_path))
|
||||||
|
new_chunks = chunks
|
||||||
|
else:
|
||||||
|
logger.info("Indexing knowledge base (incremental)...")
|
||||||
|
|
||||||
|
# Chunk all markdown files
|
||||||
|
all_chunks = self.chunker.chunk_directory(str(vault_path))
|
||||||
|
|
||||||
|
if not all_chunks:
|
||||||
|
return [TextContent(type="text", text="No markdown files found in vault.")]
|
||||||
|
|
||||||
|
# Get current sources from the vault
|
||||||
|
current_sources = set()
|
||||||
|
for chunk in all_chunks:
|
||||||
|
source = chunk.metadata.get("source")
|
||||||
|
if source:
|
||||||
|
current_sources.add(source)
|
||||||
|
|
||||||
|
# Get indexed sources and detect deleted files
|
||||||
|
indexed_sources = self.vector_store.get_existing_sources()
|
||||||
|
deleted_sources = indexed_sources - current_sources
|
||||||
|
|
||||||
|
# Remove chunks from deleted files
|
||||||
|
if deleted_sources:
|
||||||
|
logger.info(f"Removing chunks from deleted files: {deleted_sources}")
|
||||||
|
for source in deleted_sources:
|
||||||
|
self.vector_store.remove_chunks_by_source(source)
|
||||||
|
|
||||||
|
# Filter to only new/modified chunks
|
||||||
|
new_chunks = self.vector_store.filter_new_chunks(all_chunks)
|
||||||
|
|
||||||
|
if not new_chunks:
|
||||||
|
# Check if we have any existing index
|
||||||
|
stats = self.vector_store.get_stats()
|
||||||
|
if stats["total_chunks"] > 0:
|
||||||
|
return [
|
||||||
|
TextContent(
|
||||||
|
type="text",
|
||||||
|
text=f"No new or modified files to index.\n"
|
||||||
|
f"Total chunks in index: {stats['total_chunks']}",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# No existing index, fall through to index everything
|
||||||
|
new_chunks = all_chunks
|
||||||
|
|
||||||
|
logger.info(f"Processing {len(new_chunks)} new/modified chunks...")
|
||||||
|
|
||||||
|
# Add to vector store (this embeds them)
|
||||||
|
if new_chunks:
|
||||||
|
self.vector_store.add_nodes(new_chunks, embedding_model=self.embedding_model)
|
||||||
|
|
||||||
|
self._indexed = True
|
||||||
|
|
||||||
|
stats = self.vector_store.get_stats()
|
||||||
|
return [
|
||||||
|
TextContent(
|
||||||
|
type="text",
|
||||||
|
text=f"Successfully indexed {len(new_chunks)} chunks from the knowledge base.\n"
|
||||||
|
f"Total chunks in index: {stats['total_chunks']}",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("Indexing error")
|
||||||
|
return [TextContent(type="text", text=f"Indexing error: {str(e)}")]
|
||||||
|
|
||||||
|
async def _get_stats(self) -> list[TextContent]:
|
||||||
|
"""Get knowledge base statistics."""
|
||||||
|
stats = self.vector_store.get_stats()
|
||||||
|
|
||||||
|
vault_path = Path(self.vault_path)
|
||||||
|
md_files = list(vault_path.rglob("*.md")) if vault_path.exists() else []
|
||||||
|
|
||||||
|
return [
|
||||||
|
TextContent(
|
||||||
|
type="text",
|
||||||
|
text=f"Knowledge Base Statistics:\n"
|
||||||
|
f"- Vault path: {self.vault_path}\n"
|
||||||
|
f"- Markdown files: {len(md_files)}\n"
|
||||||
|
f"- Indexed chunks: {stats['total_chunks']}\n"
|
||||||
|
f"- Index status: {'Ready' if self._indexed else 'Not indexed'}",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
async def run(self):
|
||||||
|
"""Run the MCP server."""
|
||||||
|
logger.info(f"Starting Knowledge RAG MCP Server")
|
||||||
|
logger.info(f"Vault path: {self.vault_path}")
|
||||||
|
|
||||||
|
# Auto-index on startup
|
||||||
|
await self._index_knowledge({})
|
||||||
|
|
||||||
|
# Run stdio server
|
||||||
|
async with stdio_server() as (read_stream, write_stream):
|
||||||
|
await self.server.run(
|
||||||
|
read_stream,
|
||||||
|
write_stream,
|
||||||
|
self.server.create_initialization_options(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Main entry point."""
|
||||||
|
server = KnowledgeMCPServer()
|
||||||
|
await server.run()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
218
src/knowledge_rag/vector_store.py
Normal file
218
src/knowledge_rag/vector_store.py
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
"""ChromaDB vector store wrapper for knowledge base."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import TYPE_CHECKING, Any, List, Optional
|
||||||
|
|
||||||
|
from llama_index.core.schema import TextNode
|
||||||
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||||
|
import chromadb
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from llama_index.core.embeddings import BaseEmbedding
|
||||||
|
|
||||||
|
|
||||||
|
class KnowledgeVectorStore:
|
||||||
|
"""ChromaDB vector store for the knowledge base.
|
||||||
|
|
||||||
|
Handles persistence of embeddings and semantic search.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
persist_dir: str | None = None,
|
||||||
|
collection_name: str = "knowledge_base",
|
||||||
|
embedding_model: "BaseEmbedding | None" = None,
|
||||||
|
):
|
||||||
|
self._collection_name = collection_name
|
||||||
|
self._embedding_model = embedding_model
|
||||||
|
|
||||||
|
# Use Docker path if available, otherwise use local data dir
|
||||||
|
if persist_dir is None:
|
||||||
|
if os.path.exists("/data"):
|
||||||
|
persist_dir = "/data/chroma_db"
|
||||||
|
elif os.environ.get("DATA_PATH"):
|
||||||
|
persist_dir = os.environ.get("DATA_PATH")
|
||||||
|
else:
|
||||||
|
persist_dir = "./data/chroma_db"
|
||||||
|
|
||||||
|
self._persist_dir = persist_dir
|
||||||
|
|
||||||
|
# Ensure persist directory exists
|
||||||
|
os.makedirs(persist_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Initialize ChromaDB client
|
||||||
|
self._client = chromadb.PersistentClient(path=persist_dir)
|
||||||
|
|
||||||
|
# Get or create collection
|
||||||
|
self._collection = self._client.get_or_create_collection(
|
||||||
|
name=collection_name, metadata={"description": "Knowledge base embeddings"}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wrap in LlamaIndex vector store
|
||||||
|
# Pass the chroma_collection directly for PersistentClient
|
||||||
|
self._vector_store = ChromaVectorStore(
|
||||||
|
chroma_collection=self._collection,
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_embedding_model(self, embedding_model: "BaseEmbedding") -> None:
|
||||||
|
"""Set the embedding model for query embedding."""
|
||||||
|
self._embedding_model = embedding_model
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vector_store(self) -> ChromaVectorStore:
|
||||||
|
"""Get the LlamaIndex ChromaVectorStore."""
|
||||||
|
return self._vector_store
|
||||||
|
|
||||||
|
def add_nodes(
|
||||||
|
self, nodes: List[TextNode], embedding_model: "BaseEmbedding | None" = None
|
||||||
|
) -> None:
|
||||||
|
"""Add nodes to the vector store."""
|
||||||
|
from llama_index.core import VectorStoreIndex, StorageContext
|
||||||
|
|
||||||
|
# Use provided embedding model or the stored one
|
||||||
|
model = embedding_model or self._embedding_model
|
||||||
|
|
||||||
|
if model is None:
|
||||||
|
raise ValueError("No embedding model provided")
|
||||||
|
|
||||||
|
# First embed the nodes
|
||||||
|
for node in nodes:
|
||||||
|
node.embedding = model.get_text_embedding(node.text)
|
||||||
|
|
||||||
|
# Then add to vector store
|
||||||
|
self._vector_store.add(nodes)
|
||||||
|
|
||||||
|
def search(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
top_k: int = 5,
|
||||||
|
filter: Optional[dict[str, Any]] = None,
|
||||||
|
) -> List[dict[str, Any]]:
|
||||||
|
"""Semantic search for similar chunks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: The search query
|
||||||
|
top_k: Number of results to return
|
||||||
|
filter: Optional metadata filters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of search results with text and metadata
|
||||||
|
"""
|
||||||
|
from llama_index.core import VectorStoreIndex
|
||||||
|
|
||||||
|
# Use embedding model if provided, otherwise use the one from storage
|
||||||
|
embed_model = self._embedding_model
|
||||||
|
|
||||||
|
index = VectorStoreIndex.from_vector_store(
|
||||||
|
self._vector_store,
|
||||||
|
embed_model=embed_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
query_engine = index.as_retriever(
|
||||||
|
similarity_top_k=top_k,
|
||||||
|
filters=filter,
|
||||||
|
)
|
||||||
|
|
||||||
|
results = query_engine.retrieve(query)
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"text": node.text,
|
||||||
|
"score": node.score,
|
||||||
|
"metadata": node.metadata,
|
||||||
|
}
|
||||||
|
for node in results
|
||||||
|
]
|
||||||
|
|
||||||
|
def clear(self) -> None:
|
||||||
|
"""Clear all embeddings from the store."""
|
||||||
|
self._client.delete_collection(self._collection_name)
|
||||||
|
self._collection = self._client.get_or_create_collection(
|
||||||
|
name=self._collection_name, metadata={"description": "Knowledge base embeddings"}
|
||||||
|
)
|
||||||
|
# Recreate the ChromaVectorStore wrapper with the new collection
|
||||||
|
self._vector_store = ChromaVectorStore(
|
||||||
|
chroma_collection=self._collection,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_stats(self) -> dict[str, Any]:
|
||||||
|
"""Get vector store statistics."""
|
||||||
|
return {
|
||||||
|
"total_chunks": self._collection.count(),
|
||||||
|
"collection_name": self._collection_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_indexed_files(self) -> dict[str, float]:
|
||||||
|
"""Get all indexed files and their modification times.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping source path to file modification time
|
||||||
|
"""
|
||||||
|
indexed_files = {}
|
||||||
|
|
||||||
|
# Get all items from the collection
|
||||||
|
items = self._collection.get()
|
||||||
|
|
||||||
|
if items and items.get("metadatas"):
|
||||||
|
for metadata in items["metadatas"]:
|
||||||
|
source = metadata.get("source")
|
||||||
|
file_mtime = metadata.get("file_mtime")
|
||||||
|
if source and file_mtime is not None:
|
||||||
|
# Store the latest mtime for each source
|
||||||
|
if source not in indexed_files or file_mtime > indexed_files[source]:
|
||||||
|
indexed_files[source] = file_mtime
|
||||||
|
|
||||||
|
return indexed_files
|
||||||
|
|
||||||
|
def get_existing_sources(self) -> set[str]:
|
||||||
|
"""Get set of all indexed source file paths."""
|
||||||
|
return set(self.get_indexed_files().keys())
|
||||||
|
|
||||||
|
def filter_new_chunks(self, nodes: List["TextNode"]) -> List["TextNode"]:
|
||||||
|
"""Filter out chunks that are already indexed.
|
||||||
|
|
||||||
|
Compares source file path and modification time to skip unchanged files.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
nodes: List of TextNode chunks to filter
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of chunks that are new or modified
|
||||||
|
"""
|
||||||
|
indexed_files = self.get_indexed_files()
|
||||||
|
|
||||||
|
new_chunks = []
|
||||||
|
for node in nodes:
|
||||||
|
source = node.metadata.get("source")
|
||||||
|
file_mtime = node.metadata.get("file_mtime")
|
||||||
|
|
||||||
|
if source is None:
|
||||||
|
# Include chunks without source (shouldn't happen)
|
||||||
|
new_chunks.append(node)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if this file has been indexed
|
||||||
|
indexed_mtime = indexed_files.get(source)
|
||||||
|
if indexed_mtime is None:
|
||||||
|
# New file, not in index
|
||||||
|
new_chunks.append(node)
|
||||||
|
elif file_mtime > indexed_mtime:
|
||||||
|
# File has been modified since last index
|
||||||
|
# First remove old chunks for this file
|
||||||
|
self._remove_chunks_by_source(source)
|
||||||
|
new_chunks.append(node)
|
||||||
|
# else: file unchanged, skip it
|
||||||
|
|
||||||
|
return new_chunks
|
||||||
|
|
||||||
|
def _remove_chunks_by_source(self, source: str) -> None:
|
||||||
|
"""Remove all chunks from a specific source file."""
|
||||||
|
# Get IDs of chunks to delete
|
||||||
|
items = self._collection.get(where={"source": source})
|
||||||
|
|
||||||
|
if items and items.get("ids"):
|
||||||
|
self._collection.delete(ids=items["ids"])
|
||||||
|
|
||||||
|
def remove_chunks_by_source(self, source: str) -> None:
|
||||||
|
"""Public method to remove all chunks from a specific source file."""
|
||||||
|
self._remove_chunks_by_source(source)
|
||||||
Reference in New Issue
Block a user