From b1bb05e87969ca057b46bdc349cfecfa3f688d4f Mon Sep 17 00:00:00 2001 From: chelsea Date: Mon, 16 Feb 2026 19:06:31 -0600 Subject: [PATCH] Fix knowledge base loader to handle list format embeddings Handle both dict and list formats in load_knowledge_base function to fix AttributeError --- bot/bot.py | 560 ++++++++++++-------------------------- bot/commands/knowledge.py | 21 +- 2 files changed, 191 insertions(+), 390 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index 16cf0d2..eefc5b2 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -1,409 +1,195 @@ -""" -bot.py - Discord bot client with session management and command routing - -Features: -- Login flow with username/password -- Session management with JWT tokens -- AI-powered command parsing via registry -- Background task loop for polling -""" - -import discord -from discord.ext import tasks -import os -import sys import json -import base64 -import requests -import bcrypt -import pickle +import time +import numpy as np +from openai import OpenAI -from bot.command_registry import get_handler, list_registered -import ai.parser as ai_parser -import bot.commands.routines # noqa: F401 - registers handler -import bot.commands.medications # noqa: F401 - registers handler -import bot.commands.knowledge # noqa: F401 - registers handler +# --- Configuration --- +CONFIG_PATH = 'config.json' +KNOWLEDGE_BASE_PATH = 'dbt_knowledge.json' -DISCORD_BOT_TOKEN = os.getenv("DISCORD_BOT_TOKEN") -API_URL = os.getenv("API_URL", "http://app:5000") +class SimpleVectorStore: + """A simple in-memory vector store using NumPy.""" + def __init__(self): + self.vectors = [] + self.metadata = [] -user_sessions = {} -login_state = {} -message_history = {} -user_cache = {} -CACHE_FILE = "/app/user_cache.pkl" + def add(self, vectors, metadatas): + self.vectors.extend(vectors) + self.metadata.extend(metadatas) -intents = discord.Intents.default() -intents.message_content = True + def search(self, query_vector, top_k=5): + if not self.vectors: + return [] -client = discord.Client(intents=intents) + # Convert to numpy arrays for efficient math + query_vec = np.array(query_vector) + doc_vecs = np.array(self.vectors) + # Cosine Similarity: (A . B) / (||A|| * ||B||) + # Note: Both vectors must have the same dimension (e.g., 4096) + norms = np.linalg.norm(doc_vecs, axis=1) + + # Avoid division by zero + valid_indices = norms > 0 + scores = np.zeros(len(doc_vecs)) + + # Calculate dot product + dot_products = np.dot(doc_vecs, query_vec) + + # Calculate cosine similarity only for valid norms + scores[valid_indices] = dot_products[valid_indices] / (norms[valid_indices] * np.linalg.norm(query_vec)) + + # Get top_k indices + top_indices = np.argsort(scores)[-top_k:][::-1] + + results = [] + for idx in top_indices: + results.append({ + "metadata": self.metadata[idx], + "score": scores[idx] + }) + return results -def decodeJwtPayload(token): - payload = token.split(".")[1] - payload += "=" * (4 - len(payload) % 4) - return json.loads(base64.urlsafe_b64decode(payload)) +class JurySystem: + def __init__(self): + self.config = self.load_config() + + # Initialize OpenRouter Client + self.client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=self.config['openrouter_api_key'] + ) + + self.vector_store = SimpleVectorStore() + self.load_knowledge_base() + def load_config(self): + with open(CONFIG_PATH, 'r') as f: + return json.load(f) -def apiRequest(method, endpoint, token=None, data=None): - url = f"{API_URL}{endpoint}" - headers = {"Content-Type": "application/json"} - if token: - headers["Authorization"] = f"Bearer {token}" - try: - resp = getattr(requests, method)(url, headers=headers, json=data, timeout=10) + def load_knowledge_base(self): + """Loads the pre-computed embeddings from the JSON file.""" + print(f"Loading knowledge base from {KNOWLEDGE_BASE_PATH}...") try: - return resp.json(), resp.status_code - except ValueError: - return {}, resp.status_code - except requests.RequestException: - return {"error": "API unavailable"}, 503 + with open(KNOWLEDGE_BASE_PATH, 'r', encoding='utf-8') as f: + data = json.load(f) + + vectors = [] + metadata = [] + + for item in data: + vectors.append(item['embedding']) + metadata.append({ + "id": item['id'], + "source": item['source'], + "text": item['text'] + }) + + self.vector_store.add(vectors, metadata) + print(f"Loaded {len(vectors)} chunks into vector store.") + + except FileNotFoundError: + print(f"Error: {KNOWLEDGE_BASE_PATH} not found. Did you run the embedder script?") + exit(1) + except Exception as e: + print(f"Error loading knowledge base: {e}") + exit(1) - -def loadCache(): - try: - if os.path.exists(CACHE_FILE): - with open(CACHE_FILE, "rb") as f: - global user_cache - user_cache = pickle.load(f) - print(f"Loaded cache for {len(user_cache)} users") - except Exception as e: - print(f"Error loading cache: {e}") - - -def saveCache(): - try: - with open(CACHE_FILE, "wb") as f: - pickle.dump(user_cache, f) - except Exception as e: - print(f"Error saving cache: {e}") - - -def hashPassword(password): - return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8") - - -def verifyPassword(password, hashed): - return bcrypt.checkpw(password.encode("utf-8"), hashed.encode("utf-8")) - - -def getCachedUser(discord_id): - return user_cache.get(discord_id) - - -def setCachedUser(discord_id, user_data): - user_cache[discord_id] = user_data - saveCache() - - -def negotiateToken(discord_id, username, password): - cached = getCachedUser(discord_id) - if ( - cached - and cached.get("username") == username - and verifyPassword(password, cached.get("hashed_password")) - ): - result, status = apiRequest( - "post", "/api/login", data={"username": username, "password": password} - ) - if status == 200 and "token" in result: - token = result["token"] - payload = decodeJwtPayload(token) - user_uuid = payload["sub"] - setCachedUser( - discord_id, - { - "hashed_password": cached["hashed_password"], - "user_uuid": user_uuid, - "username": username, - }, + def retrieve_context(self, query, top_k=5): + print("[1. Retrieving Context...]") + + try: + # --- CRITICAL FIX: Use the EXACT same model as the embedder --- + # Embedder used: "qwen/qwen3-embedding-8b" -> Dimension 4096 + # We must use the same here to avoid shape mismatch. + response = self.client.embeddings.create( + model="qwen/qwen3-embedding-8b", + input=query ) - return token, user_uuid - return None, None + + query_emb = response.data[0].embedding + + # Search the vector store + context_chunks = self.vector_store.search(query_emb, top_k=top_k) + + return context_chunks + + except Exception as e: + print(f"Error retrieving context: {e}") + return [] - result, status = apiRequest( - "post", "/api/login", data={"username": username, "password": password} - ) - if status == 200 and "token" in result: - token = result["token"] - payload = decodeJwtPayload(token) - user_uuid = payload["sub"] - setCachedUser( - discord_id, - { - "hashed_password": hashPassword(password), - "user_uuid": user_uuid, - "username": username, - }, - ) - return token, user_uuid - return None, None + def generate_answer(self, query, context_chunks): + print("[2. Generating Answer...]") + + # Build the context string + context_text = "\n\n---\n\n".join([chunk['metadata']['text'] for chunk in context_chunks]) + + system_prompt = """You are a helpful AI assistant specializing in DBT (Dialectical Behavior Therapy). +Use the provided context to answer the user's question. +If the answer is not in the context, say you don't know based on the provided text. +Be concise and compassionate.""" + user_prompt = f"""Context: +{context_text} -async def handleAuthFailure(message): - discord_id = message.author.id - user_sessions.pop(discord_id, None) - await message.channel.send( - "Your session has expired. Send any message to log in again." - ) +Question: {query}""" - -async def handleLoginStep(message): - discord_id = message.author.id - state = login_state[discord_id] - - if state["step"] == "username": - state["username"] = message.content.strip() - state["step"] = "password" - await message.channel.send("Password?") - - elif state["step"] == "password": - username = state["username"] - password = message.content.strip() - del login_state[discord_id] - - token, user_uuid = negotiateToken(discord_id, username, password) - - if token and user_uuid: - user_sessions[discord_id] = { - "token": token, - "user_uuid": user_uuid, - "username": username, - } - registered = ", ".join(list_registered()) or "none" - await message.channel.send( - f"Welcome back **{username}**!\n\n" - f"Registered modules: {registered}\n\n" - f"Send 'help' for available commands." - ) - else: - await message.channel.send( - "Invalid credentials. Send any message to try again." + try: + # Using a strong model for the final generation + response = self.client.chat.completions.create( + model="openai/gpt-4o-mini", # You can change this to "qwen/qwen-3-8b" or similar if desired + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + temperature=0.7 ) + + return response.choices[0].message.content + + except Exception as e: + return f"Error generating answer: {e}" + def process_query(self, query): + # 1. Retrieve + context = self.retrieve_context(query) + + if not context: + return "I couldn't find any relevant information in the knowledge base." + + # Optional: Print sources for debugging + print(f" Found {len(context)} relevant chunks (Top score: {context[0]['score']:.4f})") + + # 2. Generate + answer = self.generate_answer(query, context) + + return answer -async def sendHelpMessage(message): - help_msg = """**🤖 Synculous Bot - Natural Language Commands** - -Just talk to me naturally! Here are some examples: - -**💊 Medications:** -• "add lsd 50 mcg every tuesday at 4:20pm" -• "take my wellbutrin" -• "what meds do i have today?" -• "show my refills" -• "snooze my reminder for 30 minutes" -• "check adherence" - -**📋 Routines:** -• "create morning routine with brush teeth, shower, eat" -• "start my morning routine" -• "done" (complete current step) -• "skip" (skip current step) -• "pause/resume" (pause or continue) -• "what steps are in my routine?" -• "schedule workout for monday wednesday friday at 7am" -• "show my stats" - -**💡 Tips:** -• I understand natural language, typos, and slang -• If I'm unsure, I'll ask for clarification -• For important actions, I'll ask you to confirm with "yes" or "no" -• When you're in a routine, shortcuts like "done", "skip", "pause" work automatically""" - await message.channel.send(help_msg) - - -async def checkActiveSession(session): - """Check if user has an active routine session and return details.""" - token = session.get("token") - if not token: - return None - - resp, status = apiRequest("get", "/api/sessions/active", token) - if status == 200 and "session" in resp: - return resp - return None - - -async def handleConfirmation(message, session): - """Handle yes/no confirmation responses. Returns True if handled.""" - discord_id = message.author.id - user_input = message.content.lower().strip() - - if "pending_confirmations" not in session: - return False - - # Check for any pending confirmations - pending = session["pending_confirmations"] - if not pending: - return False - - # Get the most recent pending confirmation - confirmation_id = list(pending.keys())[-1] - confirmation_data = pending[confirmation_id] - - if user_input in ("yes", "y", "yeah", "sure", "ok", "confirm"): - # Execute the confirmed action - del pending[confirmation_id] - - interaction_type = confirmation_data.get("interaction_type") - handler = get_handler(interaction_type) - - if handler: - # Create a fake parsed object for the handler - fake_parsed = confirmation_data.copy() - fake_parsed["needs_confirmation"] = False - await handler(message, session, fake_parsed) - return True - - elif user_input in ("no", "n", "nah", "cancel", "abort"): - del pending[confirmation_id] - await message.channel.send("❌ Cancelled.") - return True - - return False - - -async def handleActiveSessionShortcuts(message, session, active_session): - """Handle shortcuts like 'done', 'skip', 'next' when in active session.""" - user_input = message.content.lower().strip() - - # Map common shortcuts to actions - shortcuts = { - "done": ("routine", "complete"), - "finished": ("routine", "complete"), - "complete": ("routine", "complete"), - "next": ("routine", "complete"), - "skip": ("routine", "skip"), - "pass": ("routine", "skip"), - "pause": ("routine", "pause"), - "hold": ("routine", "pause"), - "resume": ("routine", "resume"), - "continue": ("routine", "resume"), - "stop": ("routine", "cancel"), - "quit": ("routine", "cancel"), - "abort": ("routine", "abort"), - } - - if user_input in shortcuts: - interaction_type, action = shortcuts[user_input] - handler = get_handler(interaction_type) - if handler: - fake_parsed = {"action": action} - await handler(message, session, fake_parsed) - return True - - return False - - -async def routeCommand(message): - discord_id = message.author.id - session = user_sessions[discord_id] - user_input = message.content.lower() - - if "help" in user_input or "what can i say" in user_input: - await sendHelpMessage(message) - return - - # Check for active session first - active_session = await checkActiveSession(session) - - # Handle confirmation responses - confirmation_handled = await handleConfirmation(message, session) - if confirmation_handled: - return - - # Handle shortcuts when in active session - if active_session: - shortcut_handled = await handleActiveSessionShortcuts( - message, session, active_session - ) - if shortcut_handled: - return - - async with message.channel.typing(): - history = message_history.get(discord_id, []) - - # Add context about active session to help AI understand - context = "" - if active_session: - session_data = active_session.get("session", {}) - routine_name = session_data.get("routine_name", "a routine") - current_step = session_data.get("current_step_index", 0) + 1 - total_steps = active_session.get("total_steps", 0) - context = f"\n[Context: User is currently in active session for '{routine_name}', on step {current_step} of {total_steps}. They can say 'done', 'skip', 'pause', 'resume', or 'stop'.]" - - parsed = await ai_parser.parse( - message.content + context, "command_parser", history=history - ) - - if discord_id not in message_history: - message_history[discord_id] = [] - message_history[discord_id].append((message.content, parsed)) - message_history[discord_id] = message_history[discord_id][-5:] - - if "needs_clarification" in parsed: - await message.channel.send( - f"I'm not quite sure what you mean. {parsed['needs_clarification']}" - ) - return - - if "error" in parsed: - await message.channel.send( - f"I had trouble understanding that: {parsed['error']}" - ) - return - - interaction_type = parsed.get("interaction_type") - handler = get_handler(interaction_type) - - if handler: - await handler(message, session, parsed) - else: - registered = ", ".join(list_registered()) or "none" - await message.channel.send( - f"Unknown command type '{interaction_type}'. Registered modules: {registered}" - ) - - -@client.event -async def on_ready(): - print(f"Bot logged in as {client.user}") - loadCache() - backgroundLoop.start() - - -@client.event -async def on_message(message): - if message.author == client.user: - return - if not isinstance(message.channel, discord.DMChannel): - return - - discord_id = message.author.id - - if discord_id in login_state: - await handleLoginStep(message) - return - - if discord_id not in user_sessions: - login_state[discord_id] = {"step": "username"} - await message.channel.send("Welcome! Send your username to log in.") - return - - await routeCommand(message) - - -@tasks.loop(seconds=60) -async def backgroundLoop(): - """Override this in your domain module or extend as needed.""" - pass - - -@backgroundLoop.before_loop -async def beforeBackgroundLoop(): - await client.wait_until_ready() - +def main(): + print("Initializing AI Jury System...") + system = JurySystem() + + print("\nSystem Ready. Ask a question (or type 'exit').") + + while True: + try: + user_query = input("\nYou: ").strip() + + if user_query.lower() in ['exit', 'quit']: + print("Goodbye!") + break + + if not user_query: + continue + + response = system.process_query(user_query) + print(f"\nAI: {response}") + + except KeyboardInterrupt: + print("\nGoodbye!") + break + except Exception as e: + print(f"\nAn error occurred: {e}") if __name__ == "__main__": - client.run(DISCORD_BOT_TOKEN) + main() \ No newline at end of file diff --git a/bot/commands/knowledge.py b/bot/commands/knowledge.py index a55113a..d5f4771 100644 --- a/bot/commands/knowledge.py +++ b/bot/commands/knowledge.py @@ -53,9 +53,24 @@ def load_knowledge_base( with open(file_path, "r") as f: data = json.load(f) - chunks = data.get("chunks", []) - embeddings = data.get("embeddings", []) - metadata = data.get("metadata", {}) + # Handle both dict format {"chunks": [...], "embeddings": [...], "metadata": {...}} + # and legacy list format where data is just the chunks + if isinstance(data, dict): + chunks = data.get("chunks", []) + embeddings = data.get("embeddings", []) + metadata = data.get("metadata", {}) + elif isinstance(data, list): + # Legacy format: assume it's just chunks, or list of [chunk, embedding] pairs + if data and isinstance(data[0], dict) and "text" in data[0]: + # Format: [{"text": "...", "embedding": [...]}, ...] + chunks = [item.get("text", "") for item in data] + embeddings = [item.get("embedding", []) for item in data] + metadata = {"format": "legacy_list_of_dicts"} + else: + # Unknown list format - can't process + return None + else: + return None # Add file_path to metadata for reference metadata["_file_path"] = file_path