Add scaffolding structure for the project

This commit is contained in:
2026-03-24 14:58:58 +01:00
parent 95655686f9
commit cbfcf1e315
26 changed files with 6655 additions and 312 deletions

81
.gitignore vendored Normal file
View File

@@ -0,0 +1,81 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Virtual environments
venv/
env/
ENV/
.venv
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# OS
.DS_Store
Thumbs.db
# Project specific
data/

View File

@@ -1,312 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e58ed372",
"metadata": {},
"outputs": [],
"source": [
"import platform\n",
"from pathlib import Path\n",
"import pytesseract\n",
"from PIL import Image\n",
"from txtai.embeddings import Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10a7eff9",
"metadata": {},
"outputs": [],
"source": [
"def_paths = {\n",
" \"Darwin\": Path.home() / \"Desktop\",\n",
" \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
" \"Linux\": Path.home() / \"Pictures\",\n",
"}\n",
"\n",
"sc_pathpatterns = {\n",
" \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
" \"Windows\": [\"Screenshot*.png\"],\n",
" \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
"}\n",
"\n",
"def get_screenshots(path: str | Path | None = None):\n",
" if path is None:\n",
" path = def_paths.get(platform.system(), Path.home())\n",
" os_name = platform.system()\n",
" patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"]) # assume mac\n",
" path = Path(path)\n",
" results = []\n",
" for pattern in patterns:\n",
" # results.extend(path.glob(pattern))\n",
" results.extend(str(f.absolute()) for f in path.glob(pattern))\n",
" \n",
" return sorted(set(results))\n",
"def extract_text(path, limit: int = 50):\n",
" text_scr = []\n",
" for img in get_screenshots(path):\n",
" image = Image.open(img) \n",
" text = pytesseract.image_to_string(image)\n",
" text_scr.append(text)\n",
" return text_scr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73d6386",
"metadata": {},
"outputs": [],
"source": [
"txt = extract_text(\"/Users/Aman/Pictures\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a613a361",
"metadata": {},
"outputs": [],
"source": [
"# create embeddings\n",
"embeddings = Embeddings({\n",
" \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"content\": True\n",
"})\n",
"\n",
"# do indexing\n",
"embeddings.index(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7982d00e",
"metadata": {},
"outputs": [],
"source": [
"#print(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f94de70",
"metadata": {},
"outputs": [],
"source": [
"# embeddings search\n",
"print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
"print(\"-\" * 50)\n",
"\n",
"for query in [\"genome\"]:\n",
" results = embeddings.search(query, 100)\n",
" for r in results:\n",
" print(r[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa9b189d",
"metadata": {},
"outputs": [],
"source": [
"#results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb5ee81e",
"metadata": {},
"outputs": [],
"source": [
"#embeddings.save(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c263aee0",
"metadata": {},
"outputs": [],
"source": [
"embeddings = Embeddings()\n",
"embeddings.load(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c81e27",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"OPENROUTER_API_KEY\"] = \"REDACTED\"  # SECURITY: a real key was committed here; revoke/rotate it and load it from the environment instead"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9519cf2",
"metadata": {},
"outputs": [],
"source": [
"from txtai import LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98164787",
"metadata": {},
"outputs": [],
"source": [
"# REDACTED: leaked OpenRouter API key removed — revoke/rotate it"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58bce2ae",
"metadata": {},
"outputs": [],
"source": [
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
"OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e20bf7e",
"metadata": {},
"outputs": [],
"source": [
"messages = \"What is Hi-C and how does it work?\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41f0f066",
"metadata": {},
"outputs": [],
"source": [
"import litellm\n",
"\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8caf0ff4",
"metadata": {},
"outputs": [],
"source": [
"#print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "879c7011",
"metadata": {},
"outputs": [],
"source": [
"# Just the answer\n",
"print(\"Answer:\", response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f7af13",
"metadata": {},
"outputs": [],
"source": [
"# The reasoning/thinking\n",
"print(\"Reasoning:\", response.choices[0].message.reasoning_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e29bc4c",
"metadata": {},
"outputs": [],
"source": [
"# Token usage\n",
"print(\"Tokens used:\", response.usage.total_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c5ca3c7",
"metadata": {},
"outputs": [],
"source": [
"# do embedding search\n",
"question = \"How do population size fluctuations affect effective population size?\"\n",
"results = embeddings.search(question, 3)\n",
"context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n",
"\n",
"# verify\n",
"print(\"Retrieved from docs\")\n",
"for r in results:\n",
" print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
" print()\n",
"\n",
"# send with context\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
" }\n",
" ]\n",
")\n",
"print(\"\\nllm ans\")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

4
kg_ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""Public package API for kg_ocr: screenshot discovery, OCR, and embedding search helpers."""
from .ocr import get_screenshots, extract_text
from .embeddings import create_and_index, query_embedding
__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"]

0
kg_ocr/cli/__init__.py Normal file
View File

View File

@@ -0,0 +1,3 @@
# to do
# add entrypoint to setup.py

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1 @@
# to do

View File

@@ -0,0 +1,3 @@
"""Embedding subpackage: re-exports the indexing and query helpers."""
from .indexer import create_and_index, query_embedding
__all__ = ["create_and_index", "query_embedding"]

View File

@@ -0,0 +1 @@
# in future prefer a config, especially for graph traversal

View File

@@ -0,0 +1,23 @@
from txtai.embeddings import Embeddings
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build a populated txtai embeddings index over the given texts.

    Args:
        data: Texts to index.
        model: Sentence-transformers model path used for encoding.

    Returns:
        The indexed ``Embeddings`` instance, ready for ``search``.
    """
    config = {
        "path": model,
        "content": True,    # store the original text so searches can return it
        "hybrid": True,     # presumably combines dense + sparse retrieval — see txtai docs
        "scoring": "bm25",
    }
    index = Embeddings(config)
    index.index(data)
    return index
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Search the index and return the matched texts.

    Args:
        embeddings: A populated txtai ``Embeddings`` index (content enabled).
        query: Natural-language search string.
        limit: Maximum number of hits to request.

    Returns:
        The ``"text"`` field of each hit, in result order.
    """
    hits = embeddings.search(query, limit)
    return [hit["text"] for hit in hits]

View File

View File

0
kg_ocr/graph/__init__.py Normal file
View File

1
kg_ocr/graph/analyzer.py Normal file
View File

@@ -0,0 +1 @@
# also anomaly detection here

0
kg_ocr/graph/builder.py Normal file
View File

4
kg_ocr/ocr/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""OCR subpackage: screenshot discovery and batch text extraction."""
from .extractor import get_screenshots
from .batch_processor import extract_text
__all__ = ["get_screenshots", "extract_text"]

View File

@@ -0,0 +1,7 @@
from PIL import Image
import pytesseract
def extract_text(images: list[str]) -> list[str]:
    """OCR a list of image paths into text.

    Args:
        images: Paths to image files readable by PIL.

    Returns:
        One OCR'd string per input image, in the same order.
    """
    texts: list[str] = []
    for img_path in images:
        # Context manager closes each file handle promptly; the previous
        # list-comprehension form left every opened image for the GC to close.
        with Image.open(img_path) as image:
            texts.append(pytesseract.image_to_string(image))
    return texts

15
kg_ocr/ocr/constants.py Normal file
View File

@@ -0,0 +1,15 @@
from pathlib import Path
import platform
# Home directory resolved once at import time and shared by all entries.
_HOME = Path.home()

# Default screenshot directory per OS, keyed by platform.system() values.
def_paths = {
    "Darwin": _HOME / "Desktop",
    "Windows": _HOME / "Pictures" / "Screenshots",
    "Linux": _HOME / "Pictures",
}

# Filename glob patterns that match screenshots on each platform.
sc_pathpatterns = {
    "Darwin": ["SCR*.png", "Screenshot*.png"],
    "Windows": ["Screenshot*.png"],
    "Linux": ["Screenshot*.png", "scrot*.png", "screenshot*.png"],
}

17
kg_ocr/ocr/extractor.py Normal file
View File

@@ -0,0 +1,17 @@
import platform
from pathlib import Path
from typing import Optional
from .constants import def_paths, sc_pathpatterns
def get_screenshots(path: str | Path | None = None) -> list[str]:
    """Find screenshot files for the current OS.

    Args:
        path: Directory to scan. Defaults to the OS-specific screenshot
            folder from ``def_paths``, falling back to the home directory.

    Returns:
        Sorted, de-duplicated absolute paths of matching files.
    """
    os_name = platform.system()  # query once; previously called twice
    if path is None:
        path = def_paths.get(os_name, Path.home())
    directory = Path(path)
    # Unknown platforms fall back to the macOS-style "SCR*.png" pattern.
    patterns = sc_pathpatterns.get(os_name, ["SCR*.png"])
    matches: set[str] = set()
    for pattern in patterns:
        matches.update(str(f.absolute()) for f in directory.glob(pattern))
    return sorted(matches)

File diff suppressed because it is too large Load Diff

2827
notebooks/03_flow.ipynb Normal file

File diff suppressed because it is too large Load Diff

13
setup.py Normal file
View File

@@ -0,0 +1,13 @@
"""Packaging configuration for the kg_ocr project."""
from setuptools import setup, find_packages

setup(
    name="kg_ocr",
    # Explicit version: without one, setuptools warns and publishes 0.0.0.
    version="0.1.0",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=[
        "pytesseract",
        "Pillow",
        "txtai",
        "sentence-transformers",
    ],
)

0
tests/test_graph.py Normal file
View File

0
tests/test_indexer.py Normal file
View File

0
tests/test_ocr.py Normal file
View File