Files
kg-scr/functions.ipynb
2026-03-21 19:31:11 +01:00

313 lines
7.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e58ed372",
"metadata": {},
"outputs": [],
"source": [
"import platform\n",
"from pathlib import Path\n",
"import pytesseract\n",
"from PIL import Image\n",
"from txtai.embeddings import Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10a7eff9",
"metadata": {},
"outputs": [],
"source": [
"def_paths = {\n",
"    \"Darwin\": Path.home() / \"Desktop\",\n",
"    \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
"    \"Linux\": Path.home() / \"Pictures\",\n",
"}\n",
"\n",
"sc_pathpatterns = {\n",
"    \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
"    \"Windows\": [\"Screenshot*.png\"],\n",
"    \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
"}\n",
"\n",
"def get_screenshots(path: str | Path | None = None) -> list[str]:\n",
"    \"\"\"Return sorted, de-duplicated absolute paths of screenshots under `path`.\n",
"\n",
"    If `path` is None, fall back to the conventional screenshot folder for\n",
"    the current OS (or the home directory when the OS is unrecognized).\n",
"    \"\"\"\n",
"    os_name = platform.system()  # look up once; drives both default path and patterns\n",
"    if path is None:\n",
"        path = def_paths.get(os_name, Path.home())\n",
"    patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"])  # assume mac-style names\n",
"    path = Path(path)\n",
"    results = set()  # set from the start — no duplicates to strip later\n",
"    for pattern in patterns:\n",
"        results.update(str(f.absolute()) for f in path.glob(pattern))\n",
"    return sorted(results)\n",
"\n",
"def extract_text(path: str | Path | None = None, limit: int = 50) -> list[str]:\n",
"    \"\"\"OCR up to `limit` screenshots found under `path`; returns extracted strings.\"\"\"\n",
"    text_scr = []\n",
"    for img in get_screenshots(path)[:limit]:  # bug fix: `limit` was previously ignored\n",
"        with Image.open(img) as image:  # context manager closes the file handle\n",
"            text_scr.append(pytesseract.image_to_string(image))\n",
"    return text_scr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e73d6386",
"metadata": {},
"outputs": [],
"source": [
"# Avoid hardcoded absolute local paths; build from the user's home directory\n",
"# (same location on the original machine, portable everywhere else).\n",
"SCREENSHOT_DIR = Path.home() / \"Pictures\"\n",
"txt = extract_text(SCREENSHOT_DIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a613a361",
"metadata": {},
"outputs": [],
"source": [
"# create embeddings\n",
"embeddings = Embeddings({\n",
" \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"content\": True\n",
"})\n",
"\n",
"# do indexing\n",
"embeddings.index(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7982d00e",
"metadata": {},
"outputs": [],
"source": [
"#print(txt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f94de70",
"metadata": {},
"outputs": [],
"source": [
"# embeddings search\n",
"print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
"print(\"-\" * 50)\n",
"\n",
"for query in [\"genome\"]:\n",
" results = embeddings.search(query, 100)\n",
" for r in results:\n",
" print(r[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa9b189d",
"metadata": {},
"outputs": [],
"source": [
"#results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb5ee81e",
"metadata": {},
"outputs": [],
"source": [
"#embeddings.save(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c263aee0",
"metadata": {},
"outputs": [],
"source": [
"# Reload a previously saved index only if it exists on disk; the save step\n",
"# above is commented out, so loading unconditionally would fail on a fresh\n",
"# Restart & Run All. Otherwise keep the in-memory index built earlier.\n",
"if Path(\"index\").exists():\n",
"    embeddings = Embeddings()\n",
"    embeddings.load(\"index\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10c81e27",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"# SECURITY: never hardcode credentials in a notebook (the previous key was\n",
"# committed in plain text and must be rotated). Read from the environment\n",
"# and prompt interactively only when the variable is missing.\n",
"if not os.environ.get(\"OPENROUTER_API_KEY\"):\n",
"    os.environ[\"OPENROUTER_API_KEY\"] = getpass(\"OpenRouter API key: \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9519cf2",
"metadata": {},
"outputs": [],
"source": [
"from txtai import LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98164787",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: a live API key was removed from this cell — rotate that key and keep secrets out of committed notebooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58bce2ae",
"metadata": {},
"outputs": [],
"source": [
"OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
"OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e20bf7e",
"metadata": {},
"outputs": [],
"source": [
"messages = \"What is Hi-C and how does it work?\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41f0f066",
"metadata": {},
"outputs": [],
"source": [
"import litellm\n",
"\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8caf0ff4",
"metadata": {},
"outputs": [],
"source": [
"#print(response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "879c7011",
"metadata": {},
"outputs": [],
"source": [
"# Just the answer\n",
"print(\"Answer:\", response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f7af13",
"metadata": {},
"outputs": [],
"source": [
"# The reasoning/thinking — not every model/provider populates this field,\n",
"# so use getattr with a default to avoid an AttributeError on re-run.\n",
"print(\"Reasoning:\", getattr(response.choices[0].message, \"reasoning_content\", None))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e29bc4c",
"metadata": {},
"outputs": [],
"source": [
"# Token usage\n",
"print(\"Tokens used:\", response.usage.total_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c5ca3c7",
"metadata": {},
"outputs": [],
"source": [
"# do embedding search\n",
"question = \"How do population size fluctuations affect effective population size?\"\n",
"results = embeddings.search(question, 3)\n",
"context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n",
"\n",
"# verify\n",
"print(\"Retrieved from docs\")\n",
"for r in results:\n",
" print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
" print()\n",
"\n",
"# send with context\n",
"response = litellm.completion(\n",
" model=\"openrouter/minimax/minimax-m2.5:free\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
" }\n",
" ]\n",
")\n",
"print(\"\\nllm ans\")\n",
"print(response.choices[0].message.content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}