Files
kg-scr/functions.ipynb
2026-03-21 19:31:11 +01:00

313 lines
7.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e58ed372",
"metadata": {},
"outputs": [],
"source": [
"import platform\n",
"from pathlib import Path\n",
"import pytesseract\n",
"from PIL import Image\n",
"from txtai.embeddings import Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10a7eff9",
"metadata": {},
"outputs": [],
"source": [
"def_paths = {\n",
"    \"Darwin\": Path.home() / \"Desktop\",\n",
"    \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
"    \"Linux\": Path.home() / \"Pictures\",\n",
"}\n",
"\n",
"sc_pathpatterns = {\n",
"    \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
"    \"Windows\": [\"Screenshot*.png\"],\n",
"    \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
"}\n",
"\n",
"def get_screenshots(path: str | Path | None = None) -> list[str]:\n",
"    \"\"\"Return sorted, de-duplicated absolute paths of screenshots under `path`.\n",
"\n",
"    If `path` is None, fall back to the conventional screenshot folder for\n",
"    the current OS (or the home directory when the OS is unrecognized).\n",
"    \"\"\"\n",
"    os_name = platform.system()  # look up once; drives both default path and patterns\n",
"    if path is None:\n",
"        path = def_paths.get(os_name, Path.home())\n",
"    patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"])  # assume mac-style names\n",
"    path = Path(path)\n",
"    results = set()  # set from the start — no duplicates to strip later\n",
"    for pattern in patterns:\n",
"        results.update(str(f.absolute()) for f in path.glob(pattern))\n",
"    return sorted(results)\n",
"\n",
"def extract_text(path: str | Path | None = None, limit: int = 50) -> list[str]:\n",
"    \"\"\"OCR up to `limit` screenshots found under `path`; returns extracted strings.\"\"\"\n",
"    text_scr = []\n",
"    for img in get_screenshots(path)[:limit]:  # bug fix: `limit` was previously ignored\n",
"        with Image.open(img) as image:  # context manager closes the file handle\n",
"            text_scr.append(pytesseract.image_to_string(image))\n",
"    return text_scr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73d6386",
"metadata": {},
"outputs": [],
"source": [
"# Avoid hardcoded absolute local paths; build from the user's home directory\n",
"# (same location on the original machine, portable everywhere else).\n",
"SCREENSHOT_DIR = Path.home() / \"Pictures\"\n",
"txt = extract_text(SCREENSHOT_DIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a613a361",
"metadata": {},
"outputs": [],
"source": [
"# create embeddings\n",
"embeddings = Embeddings({\n",
" \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"content\": True\n",
"})\n",
"\n",
"# do indexing\n",
"embeddings.index(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7982d00e",
"metadata": {},
"outputs": [],
"source": [
"#print(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f94de70",
"metadata": {},
"outputs": [],
"source": [
"# embeddings search\n",
"print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
"print(\"-\" * 50)\n",
"\n",
"for query in [\"genome\"]:\n",
" results = embeddings.search(query, 100)\n",
" for r in results:\n",
" print(r[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa9b189d",
"metadata": {},
"outputs": [],
"source": [
"#results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb5ee81e",
"metadata": {},
"outputs": [],
"source": [
"#embeddings.save(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c263aee0",
"metadata": {},
"outputs": [],
"source": [
"# Reload a previously saved index only if it exists on disk; the save step\n",
"# above is commented out, so loading unconditionally would fail on a fresh\n",
"# Restart & Run All. Otherwise keep the in-memory index built earlier.\n",
"if Path(\"index\").exists():\n",
"    embeddings = Embeddings()\n",
"    embeddings.load(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c81e27",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"# SECURITY: never hardcode credentials in a notebook (the previous key was\n",
"# committed in plain text and must be rotated). Read from the environment\n",
"# and prompt interactively only when the variable is missing.\n",
"if not os.environ.get(\"OPENROUTER_API_KEY\"):\n",
"    os.environ[\"OPENROUTER_API_KEY\"] = getpass(\"OpenRouter API key: \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9519cf2",
"metadata": {},
"outputs": [],
"source": [
"from txtai import LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98164787",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: a live API key was removed from this cell — rotate that key and keep secrets out of committed notebooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58bce2ae",
"metadata": {},
"outputs": [],
"source": [
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
"OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e20bf7e",
"metadata": {},
"outputs": [],
"source": [
"messages = \"What is Hi-C and how does it work?\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41f0f066",
"metadata": {},
"outputs": [],
"source": [
"import litellm\n",
"\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8caf0ff4",
"metadata": {},
"outputs": [],
"source": [
"#print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "879c7011",
"metadata": {},
"outputs": [],
"source": [
"# Just the answer\n",
"print(\"Answer:\", response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f7af13",
"metadata": {},
"outputs": [],
"source": [
"# The reasoning/thinking — not every model/provider populates this field,\n",
"# so use getattr with a default to avoid an AttributeError on re-run.\n",
"print(\"Reasoning:\", getattr(response.choices[0].message, \"reasoning_content\", None))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e29bc4c",
"metadata": {},
"outputs": [],
"source": [
"# Token usage\n",
"print(\"Tokens used:\", response.usage.total_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c5ca3c7",
"metadata": {},
"outputs": [],
"source": [
"# do embedding search\n",
"question = \"How do population size fluctuations affect effective population size?\"\n",
"results = embeddings.search(question, 3)\n",
"context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n",
"\n",
"# verify\n",
"print(\"Retrieved from docs\")\n",
"for r in results:\n",
" print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
" print()\n",
"\n",
"# send with context\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
" }\n",
" ]\n",
")\n",
"print(\"\\nllm ans\")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}