"""RAG utilities: retrieve context from a txtai embeddings index and answer with an LLM."""
import os

import litellm
from dotenv import load_dotenv
from txtai import LLM
from txtai.embeddings import Embeddings

# Load environment variables (e.g. the OpenRouter API key litellm reads) from a
# local .env file before any completion call is made.
load_dotenv()
def retrieve(embeddings: Embeddings, query: str, limit: int = 3) -> list[dict]:
    """Run a semantic search against the index.

    Args:
        embeddings: txtai ``Embeddings`` instance to query.
        query: natural-language search string.
        limit: maximum number of hits to return (default 3).

    Returns:
        Whatever ``embeddings.search`` yields — result dicts with scores.
    """
    hits = embeddings.search(query, limit)
    return hits
def ask_wllm(embeddings: Embeddings, question: str, model: str = "openrouter/minimax/minimax-m2.5:free", limit: int = 3) -> str:
    """Answer *question* via RAG: fetch context from the index, then ask an LLM.

    Args:
        embeddings: txtai ``Embeddings`` index to search for context.
        question: the user's question.
        model: litellm model identifier routed through OpenRouter.
        limit: how many search hits to stuff into the prompt context.

    Returns:
        The LLM's answer text, grounded in (and restricted to) the
        retrieved documents.
    """
    hits = retrieve(embeddings, question, limit)
    # Each hit carries a "text" field; stitch them into one context blob.
    context = "\n\n".join(hit["text"] for hit in hits)

    system_msg = {
        "role": "system",
        "content": "Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.",
    }
    user_msg = {
        "role": "user",
        "content": f"Context from my documents:\n{context}\n\nQuestion: {question}",
    }

    response = litellm.completion(model=model, messages=[system_msg, user_msg])
    return response.choices[0].message.content