From b1bb05e87969ca057b46bdc349cfecfa3f688d4f Mon Sep 17 00:00:00 2001
From: chelsea <chelsea@chelseawoodruff.net>
Date: Mon, 16 Feb 2026 19:06:31 -0600
Subject: [PATCH] Fix knowledge base loader to handle list format embeddings

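Handle both dict and list formats in the load_knowledge_base function of
bot/commands/knowledge.py. Previously a knowledge base file whose top-level
JSON value was a list raised AttributeError, because the loader called
data.get(...) on it.

Note: this patch also replaces the Discord client in bot/bot.py with a
standalone command-line retrieval script (SimpleVectorStore / JurySystem).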
---
 bot/bot.py                | 560 ++++++++++++--------------------------
 bot/commands/knowledge.py |  21 +-
 2 files changed, 191 insertions(+), 390 deletions(-)

diff --git a/bot/bot.py b/bot/bot.py
index 16cf0d2..eefc5b2 100644
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -1,409 +1,195 @@
-"""
-bot.py - Discord bot client with session management and command routing
-
-Features:
-- Login flow with username/password
-- Session management with JWT tokens
-- AI-powered command parsing via registry
-- Background task loop for polling
-"""
-
-import discord
-from discord.ext import tasks
-import os
-import sys
 import json
-import base64
-import requests
-import bcrypt
-import pickle
+import time
+import numpy as np
+from openai import OpenAI
 
-from bot.command_registry import get_handler, list_registered
-import ai.parser as ai_parser
-import bot.commands.routines  # noqa: F401 - registers handler
-import bot.commands.medications  # noqa: F401 - registers handler
-import bot.commands.knowledge  # noqa: F401 - registers handler
+# --- Configuration ---
+CONFIG_PATH = 'config.json'
+KNOWLEDGE_BASE_PATH = 'dbt_knowledge.json'
 
-DISCORD_BOT_TOKEN = os.getenv("DISCORD_BOT_TOKEN")
-API_URL = os.getenv("API_URL", "http://app:5000")
+class SimpleVectorStore:
+    """In-memory vector store that ranks entries by cosine similarity (NumPy)."""
+    def __init__(self):
+        self.vectors = []   # embedding vectors, parallel to self.metadata
+        self.metadata = []  # one metadata entry per stored vector
 
-user_sessions = {}
-login_state = {}
-message_history = {}
-user_cache = {}
-CACHE_FILE = "/app/user_cache.pkl"
+    def add(self, vectors, metadatas):
+        """Append vectors and their metadata entries (expected in matching order)."""
+        self.vectors.extend(vectors)
+        self.metadata.extend(metadatas)
 
-intents = discord.Intents.default()
-intents.message_content = True
+    def search(self, query_vector, top_k=5):
+        """Rank stored vectors by cosine similarity to query_vector.
+
+        Returns at most top_k dicts {"metadata": ..., "score": ...}, best
+        first. Zero-norm rows (or a zero-norm query) score 0.0; an empty
+        store or top_k <= 0 yields [].
+        """
+        if not self.vectors or top_k <= 0:
+            return []
 
-client = discord.Client(intents=intents)
+        # Convert to numpy arrays for efficient math
+        query_vec = np.array(query_vector)
+        doc_vecs = np.array(self.vectors)
 
+        # Cosine similarity: (A . B) / (||A|| * ||B||); dimensions must match.
+        norms = np.linalg.norm(doc_vecs, axis=1)
+        query_norm = np.linalg.norm(query_vec)
+
+        # Zero-norm rows, or a zero-norm query, keep score 0 instead of dividing by zero.
+        valid_indices = (norms > 0) & (query_norm > 0)
+        scores = np.zeros(len(doc_vecs))
+        dot_products = np.dot(doc_vecs, query_vec)
+        scores[valid_indices] = dot_products[valid_indices] / (norms[valid_indices] * query_norm)
+
+        # argsort is ascending: take the last top_k indices, then reverse for best-first.
+        top_indices = np.argsort(scores)[-top_k:][::-1]
+
+        return [
+            {"metadata": self.metadata[idx], "score": scores[idx]}
+            for idx in top_indices
+        ]
 
-def decodeJwtPayload(token):
-    payload = token.split(".")[1]
-    payload += "=" * (4 - len(payload) % 4)
-    return json.loads(base64.urlsafe_b64decode(payload))
+class JurySystem:
+    def __init__(self):
+        self.config = self.load_config()
+        
+        # Initialize OpenRouter Client
+        self.client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=self.config['openrouter_api_key']
+        )
+        
+        self.vector_store = SimpleVectorStore()
+        self.load_knowledge_base()
 
+    def load_config(self):
+        with open(CONFIG_PATH, 'r') as f:
+            return json.load(f)
 
-def apiRequest(method, endpoint, token=None, data=None):
-    url = f"{API_URL}{endpoint}"
-    headers = {"Content-Type": "application/json"}
-    if token:
-        headers["Authorization"] = f"Bearer {token}"
-    try:
-        resp = getattr(requests, method)(url, headers=headers, json=data, timeout=10)
+    def load_knowledge_base(self):
+        """Load pre-computed embeddings from KNOWLEDGE_BASE_PATH into the vector store.
+
+        Each item of the top-level JSON array must provide 'embedding', 'id',
+        'source' and 'text'. Exits with status 1 if the file is missing or loading fails.
+        """
+        print(f"Loading knowledge base from {KNOWLEDGE_BASE_PATH}...")
         try:
-            return resp.json(), resp.status_code
-        except ValueError:
-            return {}, resp.status_code
-    except requests.RequestException:
-        return {"error": "API unavailable"}, 503
+            with open(KNOWLEDGE_BASE_PATH, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            vectors = []
+            metadata = []
+            for item in data:
+                vectors.append(item['embedding'])
+                metadata.append({
+                    "id": item['id'],
+                    "source": item['source'],
+                    "text": item['text']
+                })
+            self.vector_store.add(vectors, metadata)
+            print(f"Loaded {len(vectors)} chunks into vector store.")
+        except FileNotFoundError:
+            print(f"Error: {KNOWLEDGE_BASE_PATH} not found. Did you run the embedder script?")
+            raise SystemExit(1)  # exit() is a site-module helper and may be unavailable
+        except Exception as e:
+            print(f"Error loading knowledge base: {e}")
+            raise SystemExit(1)
 
-
-def loadCache():
-    try:
-        if os.path.exists(CACHE_FILE):
-            with open(CACHE_FILE, "rb") as f:
-                global user_cache
-                user_cache = pickle.load(f)
-                print(f"Loaded cache for {len(user_cache)} users")
-    except Exception as e:
-        print(f"Error loading cache: {e}")
-
-
-def saveCache():
-    try:
-        with open(CACHE_FILE, "wb") as f:
-            pickle.dump(user_cache, f)
-    except Exception as e:
-        print(f"Error saving cache: {e}")
-
-
-def hashPassword(password):
-    return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")
-
-
-def verifyPassword(password, hashed):
-    return bcrypt.checkpw(password.encode("utf-8"), hashed.encode("utf-8"))
-
-
-def getCachedUser(discord_id):
-    return user_cache.get(discord_id)
-
-
-def setCachedUser(discord_id, user_data):
-    user_cache[discord_id] = user_data
-    saveCache()
-
-
-def negotiateToken(discord_id, username, password):
-    cached = getCachedUser(discord_id)
-    if (
-        cached
-        and cached.get("username") == username
-        and verifyPassword(password, cached.get("hashed_password"))
-    ):
-        result, status = apiRequest(
-            "post", "/api/login", data={"username": username, "password": password}
-        )
-        if status == 200 and "token" in result:
-            token = result["token"]
-            payload = decodeJwtPayload(token)
-            user_uuid = payload["sub"]
-            setCachedUser(
-                discord_id,
-                {
-                    "hashed_password": cached["hashed_password"],
-                    "user_uuid": user_uuid,
-                    "username": username,
-                },
+    def retrieve_context(self, query, top_k=5):
+        print("[1. Retrieving Context...]")
+        
+        try:
+            # --- CRITICAL FIX: Use the EXACT same model as the embedder ---
+            # Embedder used: "qwen/qwen3-embedding-8b" -> Dimension 4096
+            # We must use the same here to avoid shape mismatch.
+            response = self.client.embeddings.create(
+                model="qwen/qwen3-embedding-8b", 
+                input=query
             )
-            return token, user_uuid
-        return None, None
+            
+            query_emb = response.data[0].embedding
+            
+            # Search the vector store
+            context_chunks = self.vector_store.search(query_emb, top_k=top_k)
+            
+            return context_chunks
+            
+        except Exception as e:
+            print(f"Error retrieving context: {e}")
+            return []
 
-    result, status = apiRequest(
-        "post", "/api/login", data={"username": username, "password": password}
-    )
-    if status == 200 and "token" in result:
-        token = result["token"]
-        payload = decodeJwtPayload(token)
-        user_uuid = payload["sub"]
-        setCachedUser(
-            discord_id,
-            {
-                "hashed_password": hashPassword(password),
-                "user_uuid": user_uuid,
-                "username": username,
-            },
-        )
-        return token, user_uuid
-    return None, None
+    def generate_answer(self, query, context_chunks):
+        print("[2. Generating Answer...]")
+        
+        # Build the context string
+        context_text = "\n\n---\n\n".join([chunk['metadata']['text'] for chunk in context_chunks])
+        
+        system_prompt = """You are a helpful AI assistant specializing in DBT (Dialectical Behavior Therapy). 
+Use the provided context to answer the user's question. 
+If the answer is not in the context, say you don't know based on the provided text.
+Be concise and compassionate."""
 
+        user_prompt = f"""Context:
+{context_text}
 
-async def handleAuthFailure(message):
-    discord_id = message.author.id
-    user_sessions.pop(discord_id, None)
-    await message.channel.send(
-        "Your session has expired. Send any message to log in again."
-    )
+Question: {query}"""
 
-
-async def handleLoginStep(message):
-    discord_id = message.author.id
-    state = login_state[discord_id]
-
-    if state["step"] == "username":
-        state["username"] = message.content.strip()
-        state["step"] = "password"
-        await message.channel.send("Password?")
-
-    elif state["step"] == "password":
-        username = state["username"]
-        password = message.content.strip()
-        del login_state[discord_id]
-
-        token, user_uuid = negotiateToken(discord_id, username, password)
-
-        if token and user_uuid:
-            user_sessions[discord_id] = {
-                "token": token,
-                "user_uuid": user_uuid,
-                "username": username,
-            }
-            registered = ", ".join(list_registered()) or "none"
-            await message.channel.send(
-                f"Welcome back **{username}**!\n\n"
-                f"Registered modules: {registered}\n\n"
-                f"Send 'help' for available commands."
-            )
-        else:
-            await message.channel.send(
-                "Invalid credentials. Send any message to try again."
+        try:
+            # Using a strong model for the final generation
+            response = self.client.chat.completions.create(
+                model="openai/gpt-4o-mini", # You can change this to "qwen/qwen-3-8b" or similar if desired
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt}
+                ],
+                temperature=0.7
             )
+            
+            return response.choices[0].message.content
+            
+        except Exception as e:
+            return f"Error generating answer: {e}"
 
+    def process_query(self, query):
+        # 1. Retrieve
+        context = self.retrieve_context(query)
+        
+        if not context:
+            return "I couldn't find any relevant information in the knowledge base."
+        
+        # Optional: Print sources for debugging
+        print(f"   Found {len(context)} relevant chunks (Top score: {context[0]['score']:.4f})")
+        
+        # 2. Generate
+        answer = self.generate_answer(query, context)
+        
+        return answer
 
-async def sendHelpMessage(message):
-    help_msg = """**🤖 Synculous Bot - Natural Language Commands**
-
-Just talk to me naturally! Here are some examples:
-
-**💊 Medications:**
-• "add lsd 50 mcg every tuesday at 4:20pm"
-• "take my wellbutrin"
-• "what meds do i have today?"
-• "show my refills"
-• "snooze my reminder for 30 minutes"
-• "check adherence"
-
-**📋 Routines:**
-• "create morning routine with brush teeth, shower, eat"
-• "start my morning routine"
-• "done" (complete current step)
-• "skip" (skip current step)
-• "pause/resume" (pause or continue)
-• "what steps are in my routine?"
-• "schedule workout for monday wednesday friday at 7am"
-• "show my stats"
-
-**💡 Tips:**
-• I understand natural language, typos, and slang
-• If I'm unsure, I'll ask for clarification
-• For important actions, I'll ask you to confirm with "yes" or "no"
-• When you're in a routine, shortcuts like "done", "skip", "pause" work automatically"""
-    await message.channel.send(help_msg)
-
-
-async def checkActiveSession(session):
-    """Check if user has an active routine session and return details."""
-    token = session.get("token")
-    if not token:
-        return None
-
-    resp, status = apiRequest("get", "/api/sessions/active", token)
-    if status == 200 and "session" in resp:
-        return resp
-    return None
-
-
-async def handleConfirmation(message, session):
-    """Handle yes/no confirmation responses. Returns True if handled."""
-    discord_id = message.author.id
-    user_input = message.content.lower().strip()
-
-    if "pending_confirmations" not in session:
-        return False
-
-    # Check for any pending confirmations
-    pending = session["pending_confirmations"]
-    if not pending:
-        return False
-
-    # Get the most recent pending confirmation
-    confirmation_id = list(pending.keys())[-1]
-    confirmation_data = pending[confirmation_id]
-
-    if user_input in ("yes", "y", "yeah", "sure", "ok", "confirm"):
-        # Execute the confirmed action
-        del pending[confirmation_id]
-
-        interaction_type = confirmation_data.get("interaction_type")
-        handler = get_handler(interaction_type)
-
-        if handler:
-            # Create a fake parsed object for the handler
-            fake_parsed = confirmation_data.copy()
-            fake_parsed["needs_confirmation"] = False
-            await handler(message, session, fake_parsed)
-        return True
-
-    elif user_input in ("no", "n", "nah", "cancel", "abort"):
-        del pending[confirmation_id]
-        await message.channel.send("❌ Cancelled.")
-        return True
-
-    return False
-
-
-async def handleActiveSessionShortcuts(message, session, active_session):
-    """Handle shortcuts like 'done', 'skip', 'next' when in active session."""
-    user_input = message.content.lower().strip()
-
-    # Map common shortcuts to actions
-    shortcuts = {
-        "done": ("routine", "complete"),
-        "finished": ("routine", "complete"),
-        "complete": ("routine", "complete"),
-        "next": ("routine", "complete"),
-        "skip": ("routine", "skip"),
-        "pass": ("routine", "skip"),
-        "pause": ("routine", "pause"),
-        "hold": ("routine", "pause"),
-        "resume": ("routine", "resume"),
-        "continue": ("routine", "resume"),
-        "stop": ("routine", "cancel"),
-        "quit": ("routine", "cancel"),
-        "abort": ("routine", "abort"),
-    }
-
-    if user_input in shortcuts:
-        interaction_type, action = shortcuts[user_input]
-        handler = get_handler(interaction_type)
-        if handler:
-            fake_parsed = {"action": action}
-            await handler(message, session, fake_parsed)
-            return True
-
-    return False
-
-
-async def routeCommand(message):
-    discord_id = message.author.id
-    session = user_sessions[discord_id]
-    user_input = message.content.lower()
-
-    if "help" in user_input or "what can i say" in user_input:
-        await sendHelpMessage(message)
-        return
-
-    # Check for active session first
-    active_session = await checkActiveSession(session)
-
-    # Handle confirmation responses
-    confirmation_handled = await handleConfirmation(message, session)
-    if confirmation_handled:
-        return
-
-    # Handle shortcuts when in active session
-    if active_session:
-        shortcut_handled = await handleActiveSessionShortcuts(
-            message, session, active_session
-        )
-        if shortcut_handled:
-            return
-
-    async with message.channel.typing():
-        history = message_history.get(discord_id, [])
-
-        # Add context about active session to help AI understand
-        context = ""
-        if active_session:
-            session_data = active_session.get("session", {})
-            routine_name = session_data.get("routine_name", "a routine")
-            current_step = session_data.get("current_step_index", 0) + 1
-            total_steps = active_session.get("total_steps", 0)
-            context = f"\n[Context: User is currently in active session for '{routine_name}', on step {current_step} of {total_steps}. They can say 'done', 'skip', 'pause', 'resume', or 'stop'.]"
-
-        parsed = await ai_parser.parse(
-            message.content + context, "command_parser", history=history
-        )
-
-        if discord_id not in message_history:
-            message_history[discord_id] = []
-        message_history[discord_id].append((message.content, parsed))
-        message_history[discord_id] = message_history[discord_id][-5:]
-
-    if "needs_clarification" in parsed:
-        await message.channel.send(
-            f"I'm not quite sure what you mean. {parsed['needs_clarification']}"
-        )
-        return
-
-    if "error" in parsed:
-        await message.channel.send(
-            f"I had trouble understanding that: {parsed['error']}"
-        )
-        return
-
-    interaction_type = parsed.get("interaction_type")
-    handler = get_handler(interaction_type)
-
-    if handler:
-        await handler(message, session, parsed)
-    else:
-        registered = ", ".join(list_registered()) or "none"
-        await message.channel.send(
-            f"Unknown command type '{interaction_type}'. Registered modules: {registered}"
-        )
-
-
-@client.event
-async def on_ready():
-    print(f"Bot logged in as {client.user}")
-    loadCache()
-    backgroundLoop.start()
-
-
-@client.event
-async def on_message(message):
-    if message.author == client.user:
-        return
-    if not isinstance(message.channel, discord.DMChannel):
-        return
-
-    discord_id = message.author.id
-
-    if discord_id in login_state:
-        await handleLoginStep(message)
-        return
-
-    if discord_id not in user_sessions:
-        login_state[discord_id] = {"step": "username"}
-        await message.channel.send("Welcome! Send your username to log in.")
-        return
-
-    await routeCommand(message)
-
-
-@tasks.loop(seconds=60)
-async def backgroundLoop():
-    """Override this in your domain module or extend as needed."""
-    pass
-
-
-@backgroundLoop.before_loop
-async def beforeBackgroundLoop():
-    await client.wait_until_ready()
-
+def main():
+    """Run an interactive question/answer loop on stdin until exit/quit, Ctrl-C or EOF."""
+    print("Initializing AI Jury System...")
+    system = JurySystem()
+
+    print("\nSystem Ready. Ask a question (or type 'exit').")
+    while True:
+        try:
+            user_query = input("\nYou: ").strip()
+            if user_query.lower() in ['exit', 'quit']:
+                print("Goodbye!")
+                break
+
+            if not user_query:
+                continue
+            response = system.process_query(user_query)
+            print(f"\nAI: {response}")
+        except (KeyboardInterrupt, EOFError):
+            # EOFError: stdin was closed or piped input ran out. input() would raise it
+            # again on every iteration, so leave the loop instead of reporting it forever.
+            print("\nGoodbye!")
+            break
+        except Exception as e:
+            # Keep the session alive when a single query fails.
+            print(f"\nAn error occurred: {e}")
 
 if __name__ == "__main__":
-    client.run(DISCORD_BOT_TOKEN)
+    main()
\ No newline at end of file
diff --git a/bot/commands/knowledge.py b/bot/commands/knowledge.py
index a55113a..d5f4771 100644
--- a/bot/commands/knowledge.py
+++ b/bot/commands/knowledge.py
@@ -53,9 +53,24 @@ def load_knowledge_base(
     with open(file_path, "r") as f:
         data = json.load(f)
 
-    chunks = data.get("chunks", [])
-    embeddings = data.get("embeddings", [])
-    metadata = data.get("metadata", {})
+    # Handle both dict format {"chunks": [...], "embeddings": [...], "metadata": {...}}
+    # and legacy list format where data is just the chunks
+    if isinstance(data, dict):
+        chunks = data.get("chunks", [])
+        embeddings = data.get("embeddings", [])
+        metadata = data.get("metadata", {})
+    elif isinstance(data, list):
+        # Legacy format: assume it's just chunks, or list of [chunk, embedding] pairs
+        if data and isinstance(data[0], dict) and "text" in data[0]:
+            # Format: [{"text": "...", "embedding": [...]}, ...]
+            chunks = [item.get("text", "") for item in data]
+            embeddings = [item.get("embedding", []) for item in data]
+            metadata = {"format": "legacy_list_of_dicts"}
+        else:
+            # Unknown list format - can't process
+            return None
+    else:
+        return None
 
     # Add file_path to metadata for reference
     metadata["_file_path"] = file_path