Added project scaffolding: kg_ocr package structure, .gitignore, and setup.py
This commit is contained in:
81
.gitignore
vendored
Normal file
81
.gitignore
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# PyInstaller
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# IDEs
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Project specific
|
||||
data/
|
||||
312
functions.ipynb
312
functions.ipynb
@@ -1,312 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e58ed372",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import platform\n",
|
||||
"from pathlib import Path\n",
|
||||
"import pytesseract\n",
|
||||
"from PIL import Image\n",
|
||||
"from txtai.embeddings import Embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10a7eff9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def_paths = {\n",
|
||||
" \"Darwin\": Path.home() / \"Desktop\",\n",
|
||||
" \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
|
||||
" \"Linux\": Path.home() / \"Pictures\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"sc_pathpatterns = {\n",
|
||||
" \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
|
||||
" \"Windows\": [\"Screenshot*.png\"],\n",
|
||||
" \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def get_screenshots(path: str | Path | None = None):\n",
|
||||
" if path is None:\n",
|
||||
" path = def_paths.get(platform.system(), Path.home())\n",
|
||||
" os_name = platform.system()\n",
|
||||
" patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"]) # assume mac\n",
|
||||
" path = Path(path)\n",
|
||||
" results = []\n",
|
||||
" for pattern in patterns:\n",
|
||||
" # results.extend(path.glob(pattern))\n",
|
||||
" results.extend(str(f.absolute()) for f in path.glob(pattern))\n",
|
||||
" \n",
|
||||
" return sorted(set(results))\n",
|
||||
"def extract_text(path, limit: int = 50):\n",
|
||||
" text_scr = []\n",
|
||||
" for img in get_screenshots(path):\n",
|
||||
" image = Image.open(img) \n",
|
||||
" text = pytesseract.image_to_string(image)\n",
|
||||
" text_scr.append(text)\n",
|
||||
" return text_scr"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e73d6386",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"txt = extract_text(\"/Users/Aman/Pictures\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a613a361",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create embeddings\n",
|
||||
"embeddings = Embeddings({\n",
|
||||
" \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
|
||||
" \"content\": True\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"# do indexing\n",
|
||||
"embeddings.index(txt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7982d00e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#print(txt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6f94de70",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# embeddings search\n",
|
||||
"print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
|
||||
"print(\"-\" * 50)\n",
|
||||
"\n",
|
||||
"for query in [\"genome\"]:\n",
|
||||
" results = embeddings.search(query, 100)\n",
|
||||
" for r in results:\n",
|
||||
" print(r[\"text\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa9b189d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cb5ee81e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#embeddings.save(\"index\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c263aee0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = Embeddings()\n",
|
||||
"embeddings.load(\"index\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10c81e27",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ[\"OPENROUTER_API_KEY\"] = \"REDACTED\"  # SECURITY: a real API key was committed here in plain text -- revoke it immediately and load secrets from the environment instead"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e9519cf2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from txtai import LLM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "98164787",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# [REDACTED API key -- this key was committed in plain text and must be revoked; never commit secrets]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "58bce2ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
|
||||
"OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8e20bf7e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = \"What is Hi-C and how does it work?\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "41f0f066",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import litellm\n",
|
||||
"\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
|
||||
" ]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8caf0ff4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#print(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "879c7011",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Just the answer\n",
|
||||
"print(\"Answer:\", response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2f7af13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# The reasoning/thinking\n",
|
||||
"print(\"Reasoning:\", response.choices[0].message.reasoning_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0e29bc4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Token usage\n",
|
||||
"print(\"Tokens used:\", response.usage.total_tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c5ca3c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# do embedding search\n",
|
||||
"question = \"How do population size fluctuations affect effective population size?\"\n",
|
||||
"results = embeddings.search(question, 3)\n",
|
||||
"context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n",
|
||||
"\n",
|
||||
"# verify\n",
|
||||
"print(\"Retrieved from docs\")\n",
|
||||
"for r in results:\n",
|
||||
" print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"# send with context\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"print(\"\\nllm ans\")\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
4
kg_ocr/__init__.py
Normal file
4
kg_ocr/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .ocr import get_screenshots, extract_text
|
||||
from .embeddings import create_and_index, query_embedding
|
||||
|
||||
__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"]
|
||||
0
kg_ocr/cli/__init__.py
Normal file
0
kg_ocr/cli/__init__.py
Normal file
3
kg_ocr/cli/build_graph.py
Normal file
3
kg_ocr/cli/build_graph.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# TODO: implement the knowledge-graph build CLI (entity extraction + graph construction).
|
||||
|
||||
# TODO: register this script as a console_scripts entry point in setup.py.
|
||||
1
kg_ocr/cli/export_graph.py
Normal file
1
kg_ocr/cli/export_graph.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: implement graph export (backed by kg_ocr/export/neo4j_exporter.py).
|
||||
1
kg_ocr/cli/process_screenshots.py
Normal file
1
kg_ocr/cli/process_screenshots.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: implement the screenshot-processing CLI (wrap get_screenshots + extract_text).
|
||||
3
kg_ocr/embeddings/__init__.py
Normal file
3
kg_ocr/embeddings/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .indexer import create_and_index, query_embedding
|
||||
|
||||
__all__ = ["create_and_index", "query_embedding"]
|
||||
1
kg_ocr/embeddings/config_loader.py
Normal file
1
kg_ocr/embeddings/config_loader.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: load embedding and graph-traversal settings from a config file rather than hard-coding them.
|
||||
23
kg_ocr/embeddings/indexer.py
Normal file
23
kg_ocr/embeddings/indexer.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from txtai.embeddings import Embeddings
|
||||
|
||||
|
||||
def create_and_index(
    data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Embeddings:
    """Build and populate a hybrid txtai index over the given texts.

    Args:
        data: Documents to index.
        model: Sentence-transformer model path used for dense vectors.

    Returns:
        The populated ``Embeddings`` instance, ready for ``search``.
    """
    config = {
        "path": model,
        "content": True,    # store the original text so searches can return it
        "hybrid": True,     # combine dense vectors with sparse keyword scoring
        "scoring": "bm25",
    }
    index = Embeddings(config)
    index.index(data)
    return index
||||
|
||||
|
||||
def query_embedding(
    embeddings: Embeddings, query: str, limit: int = 100
) -> list[str]:
    """Search an index and return only the matched document texts.

    Args:
        embeddings: A populated txtai ``Embeddings`` index (content enabled).
        query: Natural-language search string.
        limit: Maximum number of hits to return.

    Returns:
        The ``"text"`` field of each hit, best matches first.
    """
    return [hit["text"] for hit in embeddings.search(query, limit)]
|
||||
0
kg_ocr/export/__init__.py
Normal file
0
kg_ocr/export/__init__.py
Normal file
0
kg_ocr/export/neo4j_exporter.py
Normal file
0
kg_ocr/export/neo4j_exporter.py
Normal file
0
kg_ocr/graph/__init__.py
Normal file
0
kg_ocr/graph/__init__.py
Normal file
1
kg_ocr/graph/analyzer.py
Normal file
1
kg_ocr/graph/analyzer.py
Normal file
@@ -0,0 +1 @@
|
||||
# TODO: implement graph analysis; anomaly detection over the graph also belongs in this module.
|
||||
0
kg_ocr/graph/builder.py
Normal file
0
kg_ocr/graph/builder.py
Normal file
4
kg_ocr/ocr/__init__.py
Normal file
4
kg_ocr/ocr/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .extractor import get_screenshots
|
||||
from .batch_processor import extract_text
|
||||
|
||||
__all__ = ["get_screenshots", "extract_text"]
|
||||
7
kg_ocr/ocr/batch_processor.py
Normal file
7
kg_ocr/ocr/batch_processor.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
|
||||
|
||||
def extract_text(images: list[str]) -> list[str]:
    """OCR a list of image paths into text.

    Args:
        images: Filesystem paths to image files.

    Returns:
        One OCR'd string per input path, in the same order.
    """
    texts = []
    for img_path in images:
        # Image.open is lazy and keeps the underlying file handle open;
        # use a context manager so each handle is closed promptly instead
        # of leaking one per image in a large batch.
        with Image.open(img_path) as image:
            texts.append(pytesseract.image_to_string(image))
    return texts
|
||||
15
kg_ocr/ocr/constants.py
Normal file
15
kg_ocr/ocr/constants.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from pathlib import Path
|
||||
|
||||
import platform
|
||||
|
||||
# Resolve the home directory once; both tables below are keyed by the
# value of platform.system() ("Darwin" / "Windows" / "Linux").
_HOME = Path.home()

# Default directory to scan for screenshots on each operating system.
def_paths = {
    "Darwin": _HOME / "Desktop",
    "Windows": _HOME / "Pictures" / "Screenshots",
    "Linux": _HOME / "Pictures",
}

# Filename glob patterns that screenshot tools use on each operating system.
sc_pathpatterns = {
    "Darwin": ["SCR*.png", "Screenshot*.png"],
    "Windows": ["Screenshot*.png"],
    "Linux": ["Screenshot*.png", "scrot*.png", "screenshot*.png"],
}
|
||||
17
kg_ocr/ocr/extractor.py
Normal file
17
kg_ocr/ocr/extractor.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from .constants import def_paths, sc_pathpatterns
|
||||
|
||||
|
||||
def get_screenshots(path: Optional[str | Path] = None) -> list[str]:
    """Locate screenshot files for the current operating system.

    Args:
        path: Directory to scan. When None, the per-OS default from
            ``def_paths`` is used, falling back to the home directory.

    Returns:
        Sorted, de-duplicated absolute paths of every file matching the
        OS-specific screenshot glob patterns.
    """
    os_name = platform.system()
    if path is None:
        path = def_paths.get(os_name, Path.home())
    base = Path(path)
    # Fall back to macOS-style names for unrecognized platforms.
    globs = sc_pathpatterns.get(os_name, ["SCR*.png"])
    found = {
        str(hit.absolute())
        for pattern in globs
        for hit in base.glob(pattern)
    }
    return sorted(found)
|
||||
3654
notebooks/02_functions_legacy.ipynb
Normal file
3654
notebooks/02_functions_legacy.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
2827
notebooks/03_flow.ipynb
Normal file
2827
notebooks/03_flow.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
13
setup.py
Normal file
13
setup.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
    name="kg_ocr",
    # Without an explicit version, setuptools builds the package as 0.0.0
    # and emits deprecation warnings; pin a starting version.
    version="0.1.0",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=[
        "pytesseract",
        "Pillow",
        "txtai",
        "sentence-transformers",
    ],
)
|
||||
0
tests/test_graph.py
Normal file
0
tests/test_graph.py
Normal file
0
tests/test_indexer.py
Normal file
0
tests/test_indexer.py
Normal file
0
tests/test_ocr.py
Normal file
0
tests/test_ocr.py
Normal file
Reference in New Issue
Block a user