Added scaffold structure for the project
This commit is contained in:
3
kg_ocr/embeddings/__init__.py
Normal file
3
kg_ocr/embeddings/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .indexer import create_and_index, query_embedding
|
||||
|
||||
__all__ = ["create_and_index", "query_embedding"]
|
||||
1
kg_ocr/embeddings/config_loader.py
Normal file
1
kg_ocr/embeddings/config_loader.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: in the future, prefer loading settings from a config file, especially for graph traversal
|
||||
23
kg_ocr/embeddings/indexer.py
Normal file
23
kg_ocr/embeddings/indexer.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from txtai.embeddings import Embeddings
|
||||
|
||||
|
||||
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build an ``Embeddings`` instance and index ``data`` with it.

    Args:
        data: Texts to index; passed unchanged to ``Embeddings.index``.
        model: Value used for the ``"path"`` entry of the embeddings
            configuration.

    Returns:
        The ``Embeddings`` instance after ``index(data)`` has been called.
    """
    # Settings handed to the Embeddings constructor: the model path plus
    # the content, hybrid and bm25 scoring options.
    config = {
        "path": model,
        "content": True,
        "hybrid": True,
        "scoring": "bm25",
    }
    index = Embeddings(config)
    index.index(data)
    return index
|
||||
|
||||
|
||||
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Run ``query`` against ``embeddings`` and collect the hit texts.

    Args:
        embeddings: Index to search; its ``search(query, limit)`` method is
            called once.
        query: Search string forwarded to ``search``.
        limit: Second argument forwarded to ``search`` (defaults to 100).

    Returns:
        The ``"text"`` value of every result, in the order ``search``
        returned them.
    """
    texts = []
    # Each result is indexed by the "text" key, so results must be
    # mapping-like objects that carry the stored text.
    for hit in embeddings.search(query, limit):
        texts.append(hit["text"])
    return texts
|
||||
Reference in New Issue
Block a user