Add scaffolding structure for the project

This commit is contained in:
2026-03-24 14:58:58 +01:00
parent 95655686f9
commit cbfcf1e315
26 changed files with 6655 additions and 312 deletions

81
.gitignore vendored Normal file
View File

@@ -0,0 +1,81 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Virtual environments
venv/
env/
ENV/
.venv
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# OS
.DS_Store
Thumbs.db
# Project specific
data/

View File

@@ -1,312 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e58ed372",
"metadata": {},
"outputs": [],
"source": [
"import platform\n",
"from pathlib import Path\n",
"import pytesseract\n",
"from PIL import Image\n",
"from txtai.embeddings import Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10a7eff9",
"metadata": {},
"outputs": [],
"source": [
"def_paths = {\n",
" \"Darwin\": Path.home() / \"Desktop\",\n",
" \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
" \"Linux\": Path.home() / \"Pictures\",\n",
"}\n",
"\n",
"sc_pathpatterns = {\n",
" \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
" \"Windows\": [\"Screenshot*.png\"],\n",
" \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
"}\n",
"\n",
"def get_screenshots(path: str | Path | None = None):\n",
" if path is None:\n",
" path = def_paths.get(platform.system(), Path.home())\n",
" os_name = platform.system()\n",
" patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"]) # assume mac\n",
" path = Path(path)\n",
" results = []\n",
" for pattern in patterns:\n",
" # results.extend(path.glob(pattern))\n",
" results.extend(str(f.absolute()) for f in path.glob(pattern))\n",
" \n",
" return sorted(set(results))\n",
"def extract_text(path, limit: int = 50):\n",
" text_scr = []\n",
" for img in get_screenshots(path):\n",
" image = Image.open(img) \n",
" text = pytesseract.image_to_string(image)\n",
" text_scr.append(text)\n",
" return text_scr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73d6386",
"metadata": {},
"outputs": [],
"source": [
"txt = extract_text(\"/Users/Aman/Pictures\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a613a361",
"metadata": {},
"outputs": [],
"source": [
"# create embeddings\n",
"embeddings = Embeddings({\n",
" \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"content\": True\n",
"})\n",
"\n",
"# do indexing\n",
"embeddings.index(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7982d00e",
"metadata": {},
"outputs": [],
"source": [
"#print(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f94de70",
"metadata": {},
"outputs": [],
"source": [
"# embeddings search\n",
"print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
"print(\"-\" * 50)\n",
"\n",
"for query in [\"genome\"]:\n",
" results = embeddings.search(query, 100)\n",
" for r in results:\n",
" print(r[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa9b189d",
"metadata": {},
"outputs": [],
"source": [
"#results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb5ee81e",
"metadata": {},
"outputs": [],
"source": [
"#embeddings.save(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c263aee0",
"metadata": {},
"outputs": [],
"source": [
"embeddings = Embeddings()\n",
"embeddings.load(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c81e27",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENROUTER_API_KEY\"] = \"REDACTED\"  # SECURITY: a real key was committed here; revoke/rotate it and load it from the environment instead"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9519cf2",
"metadata": {},
"outputs": [],
"source": [
"from txtai import LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98164787",
"metadata": {},
"outputs": [],
"source": [
"# REDACTED: leaked OpenRouter API key removed — revoke/rotate it"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58bce2ae",
"metadata": {},
"outputs": [],
"source": [
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
"OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e20bf7e",
"metadata": {},
"outputs": [],
"source": [
"messages = \"What is Hi-C and how does it work?\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41f0f066",
"metadata": {},
"outputs": [],
"source": [
"import litellm\n",
"\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8caf0ff4",
"metadata": {},
"outputs": [],
"source": [
"#print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "879c7011",
"metadata": {},
"outputs": [],
"source": [
"# Just the answer\n",
"print(\"Answer:\", response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f7af13",
"metadata": {},
"outputs": [],
"source": [
"# The reasoning/thinking\n",
"print(\"Reasoning:\", response.choices[0].message.reasoning_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e29bc4c",
"metadata": {},
"outputs": [],
"source": [
"# Token usage\n",
"print(\"Tokens used:\", response.usage.total_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c5ca3c7",
"metadata": {},
"outputs": [],
"source": [
"# do embedding search\n",
"question = \"How do population size fluctuations affect effective population size?\"\n",
"results = embeddings.search(question, 3)\n",
"context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n",
"\n",
"# verify\n",
"print(\"Retrieved from docs\")\n",
"for r in results:\n",
" print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
" print()\n",
"\n",
"# send with context\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
" }\n",
" ]\n",
")\n",
"print(\"\\nllm ans\")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

4
kg_ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""Public package API for kg_ocr: screenshot discovery, OCR, and embedding search helpers."""
from .ocr import get_screenshots, extract_text
from .embeddings import create_and_index, query_embedding
__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"]

0
kg_ocr/cli/__init__.py Normal file
View File

View File

@@ -0,0 +1,3 @@
# to do
# add entrypoint to setup.py

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1,3 @@
"""Embedding subpackage: re-exports the indexing and query helpers."""
from .indexer import create_and_index, query_embedding
__all__ = ["create_and_index", "query_embedding"]

View File

@@ -0,0 +1 @@
# in future prefer a config, especially for graph traversal

View File

@@ -0,0 +1,23 @@
from txtai.embeddings import Embeddings
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build a populated txtai embeddings index over the given texts.

    Args:
        data: Texts to index.
        model: Sentence-transformers model path used for encoding.

    Returns:
        The indexed ``Embeddings`` instance, ready for ``search``.
    """
    config = {
        "path": model,
        "content": True,    # store the original text so searches can return it
        "hybrid": True,     # presumably combines dense + sparse retrieval — see txtai docs
        "scoring": "bm25",
    }
    index = Embeddings(config)
    index.index(data)
    return index
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Search the index and return the matched texts.

    Args:
        embeddings: A populated txtai ``Embeddings`` index (content enabled).
        query: Natural-language search string.
        limit: Maximum number of hits to request.

    Returns:
        The ``"text"`` field of each hit, in result order.
    """
    hits = embeddings.search(query, limit)
    return [hit["text"] for hit in hits]

View File

View File

0
kg_ocr/graph/__init__.py Normal file
View File

1
kg_ocr/graph/analyzer.py Normal file
View File

@@ -0,0 +1 @@
# also anomaly detection here

0
kg_ocr/graph/builder.py Normal file
View File

4
kg_ocr/ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""OCR subpackage: screenshot discovery and batch text extraction."""
from .extractor import get_screenshots
from .batch_processor import extract_text
__all__ = ["get_screenshots", "extract_text"]

View File

@@ -0,0 +1,7 @@
from PIL import Image
import pytesseract
def extract_text(images: list[str]) -> list[str]:
    """OCR a list of image paths into text.

    Args:
        images: Paths to image files readable by PIL.

    Returns:
        One OCR'd string per input image, in the same order.
    """
    texts: list[str] = []
    for img_path in images:
        # Context manager closes each file handle promptly; the previous
        # list-comprehension form left every opened image for the GC to close.
        with Image.open(img_path) as image:
            texts.append(pytesseract.image_to_string(image))
    return texts

15
kg_ocr/ocr/constants.py Normal file
View File

@@ -0,0 +1,15 @@
from pathlib import Path
import platform
# Home directory resolved once at import time and shared by all entries.
_HOME = Path.home()

# Default screenshot directory per OS, keyed by platform.system() values.
def_paths = {
    "Darwin": _HOME / "Desktop",
    "Windows": _HOME / "Pictures" / "Screenshots",
    "Linux": _HOME / "Pictures",
}

# Filename glob patterns that match screenshots on each platform.
sc_pathpatterns = {
    "Darwin": ["SCR*.png", "Screenshot*.png"],
    "Windows": ["Screenshot*.png"],
    "Linux": ["Screenshot*.png", "scrot*.png", "screenshot*.png"],
}

17
kg_ocr/ocr/extractor.py Normal file
View File

@@ -0,0 +1,17 @@
import platform
from pathlib import Path
from typing import Optional
from .constants import def_paths, sc_pathpatterns
def get_screenshots(path: str | Path | None = None) -> list[str]:
    """Find screenshot files for the current OS.

    Args:
        path: Directory to scan. Defaults to the OS-specific screenshot
            folder from ``def_paths``, falling back to the home directory.

    Returns:
        Sorted, de-duplicated absolute paths of matching files.
    """
    os_name = platform.system()  # query once; previously called twice
    if path is None:
        path = def_paths.get(os_name, Path.home())
    directory = Path(path)
    # Unknown platforms fall back to the macOS-style "SCR*.png" pattern.
    patterns = sc_pathpatterns.get(os_name, ["SCR*.png"])
    matches: set[str] = set()
    for pattern in patterns:
        matches.update(str(f.absolute()) for f in directory.glob(pattern))
    return sorted(matches)

File diff suppressed because it is too large Load Diff

2827
notebooks/03_flow.ipynb Normal file

File diff suppressed because it is too large Load Diff

13
setup.py Normal file
View File

@@ -0,0 +1,13 @@
"""Packaging configuration for the kg_ocr project."""
from setuptools import setup, find_packages

setup(
    name="kg_ocr",
    # Explicit version: without one, setuptools warns and publishes 0.0.0.
    version="0.1.0",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=[
        "pytesseract",
        "Pillow",
        "txtai",
        "sentence-transformers",
    ],
)

0
tests/test_graph.py Normal file
View File

0
tests/test_indexer.py Normal file
View File

0
tests/test_ocr.py Normal file
View File