Add knowledge base RAG module for book Q&A
- Create knowledge.py handler with dynamic book selection
- Support list/select/query actions for multiple books
- Implement vector search with cosine similarity
- Add knowledge detection to AI parser config
- Cache embeddings per-book for performance
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -22,6 +22,7 @@ from bot.command_registry import get_handler, list_registered
|
||||
import ai.parser as ai_parser
|
||||
import bot.commands.routines # noqa: F401 - registers handler
|
||||
import bot.commands.medications # noqa: F401 - registers handler
|
||||
import bot.commands.knowledge # noqa: F401 - registers handler
|
||||
|
||||
DISCORD_BOT_TOKEN = os.getenv("DISCORD_BOT_TOKEN")
|
||||
API_URL = os.getenv("API_URL", "http://app:5000")
|
||||
@@ -314,7 +315,9 @@ async def routeCommand(message):
|
||||
|
||||
# Handle shortcuts when in active session
|
||||
if active_session:
|
||||
shortcut_handled = await handleActiveSessionShortcuts(message, session, active_session)
|
||||
shortcut_handled = await handleActiveSessionShortcuts(
|
||||
message, session, active_session
|
||||
)
|
||||
if shortcut_handled:
|
||||
return
|
||||
|
||||
@@ -330,7 +333,9 @@ async def routeCommand(message):
|
||||
total_steps = active_session.get("total_steps", 0)
|
||||
context = f"\n[Context: User is currently in active session for '{routine_name}', on step {current_step} of {total_steps}. They can say 'done', 'skip', 'pause', 'resume', or 'stop'.]"
|
||||
|
||||
parsed = ai_parser.parse(message.content + context, "command_parser", history=history)
|
||||
parsed = ai_parser.parse(
|
||||
message.content + context, "command_parser", history=history
|
||||
)
|
||||
|
||||
if discord_id not in message_history:
|
||||
message_history[discord_id] = []
|
||||
|
||||
300
bot/commands/knowledge.py
Normal file
300
bot/commands/knowledge.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""
|
||||
Knowledge base command handler - RAG-powered Q&A from book embeddings
|
||||
Supports multiple books with user selection
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Optional, Dict
|
||||
from pathlib import Path
|
||||
|
||||
from bot.command_registry import register_module
|
||||
import ai.parser as ai_parser
|
||||
from ai.parser import client
|
||||
|
||||
# Configuration
# Directory containing *.embeddings.json files produced by the embedding generator.
EPUBS_DIRECTORY = os.getenv("KNOWLEDGE_EMBEDDINGS_DIR", "../embedding-generator/epubs")
# Number of top-scoring chunks fed to the chat model as context.
TOP_K_CHUNKS = 5
# NOTE(review): the sentence-transformers model is normally published as
# "all-MiniLM-L12-v2"; the trailing "-l2" here looks like a typo for "-v2" —
# confirm against the embedding provider's model list before changing.
EMBEDDING_MODEL = "sentence-transformers/all-minilm-l12-l2"
CHAT_MODEL = "deepseek/deepseek-v3.2"
# Full suffix of embedding files (two dots: "<book name>.embeddings.json").
EMBEDDING_EXTENSION = ".embeddings.json"

# Cache for loaded embeddings: {file_path: (chunks, embeddings, metadata)}
# Populated lazily by load_knowledge_base; entries live for the process lifetime.
_knowledge_cache: Dict[str, Tuple[List[str], List[List[float]], dict]] = {}
|
||||
|
||||
|
||||
def find_embedding_files() -> List[str]:
    """Return the sorted paths of every embeddings file in EPUBS_DIRECTORY."""
    # Create the directory on first use so the glob below scans a real path
    # instead of silently matching nothing.
    os.makedirs(EPUBS_DIRECTORY, exist_ok=True)
    matches = glob.glob(os.path.join(EPUBS_DIRECTORY, "*" + EMBEDDING_EXTENSION))
    matches.sort()
    return matches
|
||||
|
||||
|
||||
def get_book_name(file_path: str) -> str:
    """Extract a readable book name from an embeddings file path.

    Example: ".../atomic.habits.embeddings.json" -> "Atomic Habits".

    Fix: the previous implementation used ``Path(...).stem``, which strips
    only the final ".json" suffix, so the subsequent
    ``.replace(EMBEDDING_EXTENSION, "")`` never matched and names rendered
    with a trailing " Embeddings". Strip the full extension explicitly.
    """
    base = Path(file_path).name
    if base.endswith(EMBEDDING_EXTENSION):
        base = base[: -len(EMBEDDING_EXTENSION)]
    else:
        # Unexpected file name: fall back to dropping just the last suffix.
        base = Path(base).stem
    return base.replace(".", " ").title()
|
||||
|
||||
|
||||
def load_knowledge_base(
    file_path: str,
) -> Optional[Tuple[List[str], List[List[float]], dict]]:
    """Load and cache a specific embeddings file.

    Args:
        file_path: Path to a "*.embeddings.json" file.

    Returns:
        A ``(chunks, embeddings, metadata)`` tuple, or ``None`` when the file
        is missing, unreadable, or not valid JSON. Callers already treat
        ``None`` as a load error; previously a corrupt file raised an
        uncaught exception inside the async handler instead.
    """
    if file_path in _knowledge_cache:
        return _knowledge_cache[file_path]

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Missing or corrupt file: report via the caller's error path rather
        # than crashing the command handler.
        return None

    chunks = data.get("chunks", [])
    embeddings = data.get("embeddings", [])
    metadata = data.get("metadata", {})

    # Add file_path to metadata for reference
    metadata["_file_path"] = file_path

    _knowledge_cache[file_path] = (chunks, embeddings, metadata)
    return _knowledge_cache[file_path]
|
||||
|
||||
|
||||
def get_query_embedding(query: str) -> List[float]:
    """Turn the user's question into an embedding vector via OpenRouter."""
    embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    return embedding_response.data[0].embedding
|
||||
|
||||
|
||||
def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    """Calculate similarity between two vectors.

    Computes dot(v1, v2) / (|v1| * |v2|). Assumes neither vector is all-zero
    (embedding vectors never are in practice).
    """
    first = np.asarray(vec1)
    second = np.asarray(vec2)
    magnitude_product = np.linalg.norm(first) * np.linalg.norm(second)
    return np.dot(first, second) / magnitude_product
|
||||
|
||||
|
||||
def search_context(
    query_embedding: List[float],
    chunks: List[str],
    embeddings: List[List[float]],
    top_k: int = 5,
) -> Tuple[List[str], List[float]]:
    """Find the most relevant chunks and return them with scores.

    Similarities are computed in one vectorized pass instead of a Python loop
    of per-chunk cosine calls; a stable sort keeps tied chunks in their
    original order, matching the previous implementation.

    Args:
        query_embedding: Embedding vector of the user's question.
        chunks: Text chunks, parallel to *embeddings*.
        embeddings: One embedding vector per chunk.
        top_k: Maximum number of chunks to return.

    Returns:
        ``(top_chunks, top_scores)`` ordered by descending cosine similarity.
    """
    if not embeddings:
        return [], []

    matrix = np.asarray(embeddings, dtype=float)
    query = np.asarray(query_embedding, dtype=float)

    # Cosine similarity of the query against every chunk at once.
    scores = matrix @ query / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(query))

    # Stable sort on negated scores = descending order with original tie order.
    top_indices = np.argsort(-scores, kind="stable")[:top_k]

    top_chunks = [chunks[i] for i in top_indices]
    top_scores = [float(scores[i]) for i in top_indices]
    return top_chunks, top_scores
|
||||
|
||||
|
||||
def generate_answer(query: str, context_chunks: List[str], book_title: str) -> str:
    """Generate an answer using DeepSeek via OpenRouter.

    Args:
        query: The user's question.
        context_chunks: Relevant book excerpts from the vector search.
        book_title: Human-readable title interpolated into the prompt.

    Returns:
        The model's answer, or a user-facing "❌ ..." string on API failure
        (this function never raises).
    """

    # Visually separate the excerpts so the model can tell them apart.
    context_text = "\n\n---\n\n".join(context_chunks)

    system_prompt = f"""You are an expert assistant answering questions about "{book_title}".
Answer based strictly on the provided context. If the answer isn't in the context, say you don't know.
Do not make up information. Provide clear, helpful answers based on the book's content.

Context from {book_title}:
{context_text}"""

    try:
        # Low temperature keeps the answer grounded in the supplied context.
        response = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query},
            ],
            temperature=0.1,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Surface API errors as a message rather than crashing the handler.
        return f"❌ Error generating answer: {e}"
|
||||
|
||||
|
||||
def get_user_selected_book(session) -> Optional[str]:
    """Return the embeddings-file path of the user's chosen book, if any."""
    kb_state = session.get("knowledge_base", {})
    return kb_state.get("selected_book")
|
||||
|
||||
|
||||
def set_user_selected_book(session, file_path: str):
    """Remember *file_path* as this user's chosen book in their session."""
    session.setdefault("knowledge_base", {})["selected_book"] = file_path
|
||||
|
||||
|
||||
def _match_book(book_identifier, embedding_files: List[str]) -> Optional[str]:
    """Resolve a user-supplied book identifier to an embeddings file path.

    Accepts a 1-based list number or a case-insensitive name/path fragment.
    A numeric identifier that is out of range resolves to None (no name
    fallback), matching the original per-branch logic. Returns None when
    nothing matches.
    """
    try:
        book_num = int(book_identifier) - 1
    except (ValueError, TypeError):
        # Not a number: match against the display name or the raw path.
        book_lower = book_identifier.lower()
        for f in embedding_files:
            if book_lower in get_book_name(f).lower() or book_lower in f.lower():
                return f
        return None
    if 0 <= book_num < len(embedding_files):
        return embedding_files[book_num]
    return None


async def handle_knowledge(message, session, parsed):
    """Handle knowledge base queries with dynamic book selection.

    Dispatches on ``parsed["action"]``:
      - "list":   show available books and the current selection.
      - "select": set the user's book by number or name.
      - "query":  answer a question from the selected (or overridden) book.

    The number-or-name resolution previously duplicated in the "select" and
    "query" branches now lives in the _match_book helper.
    """
    action = parsed.get("action", "query")

    if action == "list":
        embedding_files = find_embedding_files()

        if not embedding_files:
            await message.channel.send(
                f"❌ No knowledge bases found in `{EPUBS_DIRECTORY}`"
            )
            return

        lines = [f"{i + 1}. {get_book_name(f)}" for i, f in enumerate(embedding_files)]
        current = get_user_selected_book(session)
        current_text = (
            f"\n\n📖 Currently selected: **{get_book_name(current)}**"
            if current
            else ""
        )

        await message.channel.send(
            f"📚 **Available Knowledge Bases:**\n"
            + "\n".join(lines)
            + current_text
            + "\n\nUse `ask <book number/name> <question>` or `select book <number/name>`"
        )

    elif action == "select":
        book_identifier = parsed.get("book", "")
        embedding_files = find_embedding_files()

        if not embedding_files:
            await message.channel.send("❌ No knowledge bases available.")
            return

        selected_file = _match_book(book_identifier, embedding_files)

        if not selected_file:
            await message.channel.send(
                f"❌ Could not find book '{book_identifier}'. Use `list books` to see available options."
            )
            return

        set_user_selected_book(session, selected_file)
        book_name = get_book_name(selected_file)
        await message.channel.send(f"✅ Selected knowledge base: **{book_name}**")

    elif action == "query":
        query = parsed.get("query", "")
        book_override = parsed.get("book", "")

        if not query:
            await message.channel.send(
                "What would you like to know? (e.g., 'what does the book say about time management?')"
            )
            return

        # Determine which book to use.
        if book_override:
            # User specified a book in the query.
            selected_file = _match_book(book_override, find_embedding_files())
        else:
            # Use the user's selected book, or default to the first available.
            selected_file = get_user_selected_book(session)
            if not selected_file:
                embedding_files = find_embedding_files()
                if embedding_files:
                    selected_file = embedding_files[0]
                    set_user_selected_book(session, selected_file)

        if not selected_file:
            await message.channel.send(
                "❌ No knowledge base available. Please check the embeddings directory."
            )
            return

        # Load knowledge base (cached after the first read).
        kb_data = load_knowledge_base(selected_file)
        if kb_data is None:
            await message.channel.send(
                "❌ Error loading knowledge base. Please check the file path."
            )
            return

        chunks, embeddings, metadata = kb_data
        book_title = metadata.get("title", get_book_name(selected_file))

        await message.channel.send(f"🔍 Searching **{book_title}**...")

        try:
            # Embed the question, retrieve the best chunks, and answer.
            query_emb = get_query_embedding(query)
            relevant_chunks, scores = search_context(
                query_emb, chunks, embeddings, TOP_K_CHUNKS
            )

            answer = generate_answer(query, relevant_chunks, book_title)

            await message.channel.send(f"📚 **Answer:**\n{answer}")

        except Exception as e:
            await message.channel.send(f"❌ Error processing query: {e}")

    else:
        await message.channel.send(
            f"Unknown knowledge action: {action}. Try: list, select, or ask a question."
        )
|
||||
|
||||
|
||||
def validate_knowledge_json(data):
    """Validate parsed JSON for knowledge queries.

    Returns a list of human-readable problems; an empty list means valid.
    Payloads that carry an explicit "error" key pass through as valid.
    """
    if not isinstance(data, dict):
        return ["Response must be a JSON object"]

    # Parser-reported error payloads are considered well-formed as-is.
    if "error" in data:
        return []

    return [] if "action" in data else ["Missing required field: action"]
|
||||
|
||||
|
||||
# Register the module so the command router can dispatch "knowledge" actions
# to handle_knowledge.
register_module("knowledge", handle_knowledge)

# Register the validator the AI parser runs over parsed knowledge JSON.
ai_parser.register_validator("knowledge", validate_knowledge_json)
|
||||
Reference in New Issue
Block a user