Files
kg-scr/kg_ocr/embeddings/indexer.py

24 lines
627 B
Python

from txtai.embeddings import Embeddings
def create_and_index(
    data: list[str],
    model: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> Embeddings:
    """Build an ``Embeddings`` instance for ``model`` and index ``data`` in it.

    Args:
        data: Texts handed to ``Embeddings.index``.
        model: Value used for the ``"path"`` entry of the embeddings
            configuration.

    Returns:
        The ``Embeddings`` instance after ``index(data)`` has been called.
    """
    # NOTE(review): "content" presumably makes txtai keep the original text
    # alongside the vectors, and "hybrid" + "scoring" presumably combine
    # dense similarity with BM25 keyword scoring -- confirm against the
    # txtai configuration docs for the pinned version.
    settings = {
        "path": model,
        "content": True,
        "hybrid": True,
        "scoring": "bm25",
    }

    index = Embeddings(settings)
    index.index(data)
    return index
def query_embedding(
    embeddings: Embeddings,
    query: str,
    limit: int = 100,
) -> list[str]:
    """Run ``query`` against ``embeddings`` and collect each hit's text.

    Args:
        embeddings: Index to search.
        query: Query string passed to ``embeddings.search``.
        limit: Second positional argument to ``embeddings.search``
            (defaults to 100).

    Returns:
        The ``"text"`` entry of every search result, in the order the
        search returned them.
    """
    texts: list[str] = []
    # Each hit is subscripted with "text", so results are expected to be
    # mapping-like (presumably because the index stores content -- verify
    # against how the index was built).
    for hit in embeddings.search(query, limit):
        texts.append(hit["text"])
    return texts