Added scaffold structure for the project

This commit is contained in:
2026-03-24 14:58:58 +01:00
parent 95655686f9
commit cbfcf1e315
26 changed files with 6655 additions and 312 deletions

4
kg_ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .ocr import get_screenshots, extract_text
from .embeddings import create_and_index, query_embedding
# Names re-exported by this package's ``__init__``.
__all__ = [
    "get_screenshots",
    "extract_text",
    "create_and_index",
    "query_embedding",
]

0
kg_ocr/cli/__init__.py Normal file
View File

View File

@@ -0,0 +1,3 @@
# TODO: add an entry point to setup.py

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1,3 @@
from .indexer import create_and_index, query_embedding
# Names re-exported from ``.indexer`` by this package's ``__init__``.
__all__ = [
    "create_and_index",
    "query_embedding",
]

View File

@@ -0,0 +1 @@
# TODO: In the future, prefer a config here, especially for graph traversal.

View File

@@ -0,0 +1,23 @@
from txtai.embeddings import Embeddings
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build an ``Embeddings`` instance and index ``data`` with it.

    Args:
        data: Texts handed to ``Embeddings.index``.
        model: Value used for the ``"path"`` entry of the embeddings config.

    Returns:
        The ``Embeddings`` instance after ``index(data)`` has been called.
    """
    # NOTE(review): "content" presumably makes search results carry the
    # original text, and "hybrid"/"scoring" presumably enable BM25 alongside
    # the dense vectors -- confirm against the txtai configuration docs.
    settings = {
        "path": model,
        "content": True,
        "hybrid": True,
        "scoring": "bm25",
    }
    index = Embeddings(settings)
    index.index(data)
    return index
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Run ``query`` against ``embeddings`` and return the ``"text"`` of each hit.

    Args:
        embeddings: Object whose ``search(query, limit)`` is called.
        query: Search string passed through unchanged.
        limit: Second positional argument to ``search``.

    Returns:
        The ``"text"`` entry of every search result, in result order.
    """
    texts: list[str] = []
    for hit in embeddings.search(query, limit):
        # Each hit is indexed by key, so results must be mapping-like.
        texts.append(hit["text"])
    return texts

View File

View File

0
kg_ocr/graph/__init__.py Normal file
View File

1
kg_ocr/graph/analyzer.py Normal file
View File

@@ -0,0 +1 @@
# TODO: anomaly detection also belongs in this module.

0
kg_ocr/graph/builder.py Normal file
View File

4
kg_ocr/ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .extractor import get_screenshots
from .batch_processor import extract_text
# Names re-exported by this package's ``__init__``.
__all__ = [
    "get_screenshots",
    "extract_text",
]

View File

@@ -0,0 +1,7 @@
from PIL import Image
import pytesseract
def extract_text(images: list[str]) -> list[str]:
    """OCR a list of image paths into text.

    Args:
        images: Paths of the image files to read.

    Returns:
        One OCR result string per input path, in input order. An empty
        input yields an empty list.
    """
    texts = []
    for img in images:
        # Open inside ``with`` so the underlying file handle is released as
        # soon as this image has been processed, instead of staying open
        # until the Image object is garbage-collected.
        with Image.open(img) as image:
            texts.append(pytesseract.image_to_string(image))
    return texts

15
kg_ocr/ocr/constants.py Normal file
View File

@@ -0,0 +1,15 @@
from pathlib import Path
import platform
# Default directory searched for screenshots, keyed by OS name
# (looked up with ``platform.system()`` by the screenshot finder).
def_paths = dict(
    Darwin=Path.home().joinpath("Desktop"),
    Windows=Path.home().joinpath("Pictures", "Screenshots"),
    Linux=Path.home().joinpath("Pictures"),
)
# Glob patterns for screenshot file names, keyed by OS name
# (looked up with ``platform.system()`` by the screenshot finder).
sc_pathpatterns = dict(
    Darwin=["SCR*.png", "Screenshot*.png"],
    Windows=["Screenshot*.png"],
    Linux=["Screenshot*.png", "scrot*.png", "screenshot*.png"],
)

17
kg_ocr/ocr/extractor.py Normal file
View File

@@ -0,0 +1,17 @@
import platform
from pathlib import Path
from typing import Optional
from .constants import def_paths, sc_pathpatterns
def get_screenshots(path: Optional[str | Path] = None) -> list[str]:
    """Find screenshot files for the current OS.

    Args:
        path: Directory to search (non-recursively). A leading ``~`` is
            expanded to the user's home directory. When ``None``, the
            per-OS default from ``def_paths`` is used, falling back to the
            home directory for an OS that has no entry.

    Returns:
        Sorted, de-duplicated absolute paths (as strings) of the regular
        files in the directory that match the per-OS patterns from
        ``sc_pathpatterns`` (``["SCR*.png"]`` for an OS with no entry).
    """
    system = platform.system()
    if path is None:
        path = def_paths.get(system, Path.home())
    # expanduser() lets callers pass "~/Desktop"; without it the "~" would be
    # treated as a literal directory name and nothing would match.
    directory = Path(path).expanduser()
    patterns = sc_pathpatterns.get(system, ["SCR*.png"])

    # A set absorbs files matched by more than one pattern.
    found: set[str] = set()
    for pattern in patterns:
        found.update(
            str(entry.absolute())
            for entry in directory.glob(pattern)
            # Skip directories that merely happen to match the name pattern;
            # they cannot be opened as images downstream.
            if entry.is_file()
        )
    return sorted(found)