Files
Synculous-2/regenerate_embeddings.py
2026-02-20 20:04:35 +00:00

57 lines
1.5 KiB
Python

#!/usr/bin/env python3
"""Regenerate DBT embeddings with qwen/qwen3-embedding-8b model (384 dimensions)"""
import json
import os
from openai import OpenAI
import time
# Script body: read the API key from config.json, embed every chunk listed in
# bot/data/dbt_knowledge.text.json through OpenRouter, and write the results
# to bot/data/dbt_knowledge.embeddings.json.

# Load config (JSON is UTF-8 by specification, so do not rely on the locale).
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Initialize OpenAI client with OpenRouter
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=config["openrouter_api_key"],
)

# Load text data; each item is read below via its "id", "source" and "text" keys.
with open("bot/data/dbt_knowledge.text.json", "r", encoding="utf-8") as f:
    text_data = json.load(f)

total_chunks = len(text_data)
print(f"Regenerating embeddings for {total_chunks} chunks...")

# Generate embeddings
embeddings_data = []
failed_indices = []
for i, item in enumerate(text_data):
    try:
        # NOTE(review): the module docstring mentions 384 dimensions, but no
        # `dimensions` argument is passed here, so the vector length is
        # whatever the API returns by default -- confirm which is intended.
        response = client.embeddings.create(
            model="qwen/qwen3-embedding-8b",
            input=item["text"],
        )
        embeddings_data.append({
            "id": item["id"],
            "source": item["source"],
            "text": item["text"],
            "embedding": response.data[0].embedding,
        })
    except Exception as e:
        # Best effort: report the failure, remember it, and keep going so one
        # bad chunk or transient API error does not abort the whole run.
        print(f"Error processing item {i}: {e}")
        failed_indices.append(i)
    else:
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{total_chunks} chunks...")
        # Small delay to avoid rate limits
        time.sleep(0.1)

# Refuse to clobber an existing embeddings file with an empty list (e.g. when
# the API key is invalid and every request failed, or the input was empty).
if not embeddings_data:
    raise SystemExit(
        "No embeddings were generated; "
        "bot/data/dbt_knowledge.embeddings.json was left untouched."
    )

# Save new embeddings
with open("bot/data/dbt_knowledge.embeddings.json", "w", encoding="utf-8") as f:
    json.dump(embeddings_data, f)

if failed_indices:
    # Make partial output visible instead of letting it pass as a full run.
    print(f"Warning: {len(failed_indices)} chunk(s) failed and were skipped: {failed_indices}")

print(f"\nDone! Generated {len(embeddings_data)} embeddings with {len(embeddings_data[0]['embedding'])} dimensions")