Added scaffold structure for the project
This commit is contained in:
4
kg_ocr/__init__.py
Normal file
4
kg_ocr/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .ocr import get_screenshots, extract_text
|
||||
from .embeddings import create_and_index, query_embedding
|
||||
|
||||
# Names re-exported at the package root from the .ocr and .embeddings subpackages.
__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"]
|
||||
0
kg_ocr/cli/__init__.py
Normal file
0
kg_ocr/cli/__init__.py
Normal file
3
kg_ocr/cli/build_graph.py
Normal file
3
kg_ocr/cli/build_graph.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# to do
|
||||
|
||||
# TODO: add an entry point for this command to setup.py
|
||||
1
kg_ocr/cli/export_graph.py
Normal file
1
kg_ocr/cli/export_graph.py
Normal file
@@ -0,0 +1 @@
|
||||
# to do
|
||||
1
kg_ocr/cli/process_screenshots.py
Normal file
1
kg_ocr/cli/process_screenshots.py
Normal file
@@ -0,0 +1 @@
|
||||
# to do
|
||||
3
kg_ocr/embeddings/__init__.py
Normal file
3
kg_ocr/embeddings/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .indexer import create_and_index, query_embedding
|
||||
|
||||
# Names re-exported from .indexer as this subpackage's public API.
__all__ = ["create_and_index", "query_embedding"]
|
||||
1
kg_ocr/embeddings/config_loader.py
Normal file
1
kg_ocr/embeddings/config_loader.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: in the future, prefer loading settings from a config file, especially for graph traversal.
|
||||
23
kg_ocr/embeddings/indexer.py
Normal file
23
kg_ocr/embeddings/indexer.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from txtai.embeddings import Embeddings
|
||||
|
||||
|
||||
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build an ``Embeddings`` instance, index *data* with it, and return it.

    *model* is passed as the ``path`` setting. The instance is also
    configured with ``content`` and ``hybrid`` enabled and ``scoring`` set
    to ``"bm25"``.
    """
    settings = dict(path=model, content=True, hybrid=True, scoring="bm25")
    index = Embeddings(settings)
    index.index(data)
    return index
|
||||
|
||||
|
||||
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Search *embeddings* for *query* and return each hit's ``"text"`` value.

    *limit* is forwarded unchanged to ``embeddings.search``.
    """
    texts = []
    for hit in embeddings.search(query, limit):
        texts.append(hit["text"])
    return texts
|
||||
0
kg_ocr/export/__init__.py
Normal file
0
kg_ocr/export/__init__.py
Normal file
0
kg_ocr/export/neo4j_exporter.py
Normal file
0
kg_ocr/export/neo4j_exporter.py
Normal file
0
kg_ocr/graph/__init__.py
Normal file
0
kg_ocr/graph/__init__.py
Normal file
1
kg_ocr/graph/analyzer.py
Normal file
1
kg_ocr/graph/analyzer.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: anomaly detection should also live in this module.
|
||||
0
kg_ocr/graph/builder.py
Normal file
0
kg_ocr/graph/builder.py
Normal file
4
kg_ocr/ocr/__init__.py
Normal file
4
kg_ocr/ocr/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .extractor import get_screenshots
|
||||
from .batch_processor import extract_text
|
||||
|
||||
# Names re-exported from .extractor and .batch_processor as this subpackage's public API.
__all__ = ["get_screenshots", "extract_text"]
|
||||
7
kg_ocr/ocr/batch_processor.py
Normal file
7
kg_ocr/ocr/batch_processor.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
|
||||
|
||||
def extract_text(images: list[str]) -> list[str]:
    """OCR a list of image paths into text.

    Args:
        images: Paths of the image files to read.

    Returns:
        One OCR result string per input path, in the same order. An empty
        input list yields an empty list.
    """
    texts: list[str] = []
    for image_path in images:
        # Open each image in a context manager so its file handle is
        # released as soon as OCR finishes, instead of leaking one handle
        # per screenshot across a large batch.
        with Image.open(image_path) as image:
            texts.append(pytesseract.image_to_string(image))
    return texts
|
||||
15
kg_ocr/ocr/constants.py
Normal file
15
kg_ocr/ocr/constants.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from pathlib import Path
|
||||
|
||||
import platform
|
||||
|
||||
# Default screenshot directory per OS, keyed by the name platform.system() reports.
def_paths = dict(
    Darwin=Path.home().joinpath("Desktop"),
    Windows=Path.home().joinpath("Pictures", "Screenshots"),
    Linux=Path.home().joinpath("Pictures"),
)
|
||||
|
||||
# Glob patterns for screenshot file names per OS, keyed by the name platform.system() reports.
sc_pathpatterns = dict(
    Darwin=["SCR*.png", "Screenshot*.png"],
    Windows=["Screenshot*.png"],
    Linux=["Screenshot*.png", "scrot*.png", "screenshot*.png"],
)
|
||||
17
kg_ocr/ocr/extractor.py
Normal file
17
kg_ocr/ocr/extractor.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from .constants import def_paths, sc_pathpatterns
|
||||
|
||||
|
||||
def get_screenshots(path: Optional[str | Path] = None) -> list[str]:
    """Find screenshot files for the current OS.

    Args:
        path: Directory to search. When ``None``, the entry for the current
            OS in ``def_paths`` is used, falling back to the home directory
            if the OS has no entry.

    Returns:
        Sorted, de-duplicated absolute paths (as strings) of the files
        directly inside the directory whose names match one of the current
        OS's patterns in ``sc_pathpatterns`` (``"SCR*.png"`` if the OS has
        no entry).
    """
    system = platform.system()  # resolved once and reused for both lookups
    if path is None:
        path = def_paths.get(system, Path.home())
    directory = Path(path)
    patterns = sc_pathpatterns.get(system, ["SCR*.png"])

    # A set is used because one file can match more than one pattern.
    matches: set[str] = set()
    for pattern in patterns:
        matches.update(
            str(entry.absolute())
            for entry in directory.glob(pattern)
            # Keep files only: a directory whose name happens to match a
            # pattern is not a screenshot.
            if entry.is_file()
        )
    return sorted(matches)
|
||||
Reference in New Issue
Block a user