{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e58ed372",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one top cell: stdlib first, then third-party.\n",
    "import os\n",
    "import platform\n",
    "from getpass import getpass\n",
    "from pathlib import Path\n",
    "\n",
    "import pytesseract\n",
    "from PIL import Image\n",
    "from txtai.embeddings import Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10a7eff9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Default screenshot directory per OS.\n",
    "def_paths = {\n",
    "    \"Darwin\": Path.home() / \"Desktop\",\n",
    "    \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n",
    "    \"Linux\": Path.home() / \"Pictures\",\n",
    "}\n",
    "\n",
    "# Screenshot filename glob patterns per OS.\n",
    "sc_pathpatterns = {\n",
    "    \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n",
    "    \"Windows\": [\"Screenshot*.png\"],\n",
    "    \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n",
    "}\n",
    "\n",
    "def get_screenshots(path: str | Path | None = None) -> list[str]:\n",
    "    \"\"\"Return sorted, de-duplicated absolute paths of screenshots under ``path``.\n",
    "\n",
    "    Falls back to the OS-specific default directory when ``path`` is None.\n",
    "    \"\"\"\n",
    "    os_name = platform.system()  # look up once; drives both default path and patterns\n",
    "    if path is None:\n",
    "        path = def_paths.get(os_name, Path.home())\n",
    "    patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"])  # assume mac-style names\n",
    "    path = Path(path)\n",
    "    results = []\n",
    "    for pattern in patterns:\n",
    "        results.extend(str(f.absolute()) for f in path.glob(pattern))\n",
    "    return sorted(set(results))\n",
    "\n",
    "def extract_text(path: str | Path | None = None, limit: int = 50) -> list[str]:\n",
    "    \"\"\"OCR up to ``limit`` screenshots under ``path`` and return one string per image.\"\"\"\n",
    "    text_scr = []\n",
    "    # BUG FIX: ``limit`` was accepted but never applied before.\n",
    "    for img in get_screenshots(path)[:limit]:\n",
    "        with Image.open(img) as image:  # close the file handle after OCR\n",
    "            text_scr.append(pytesseract.image_to_string(image))\n",
    "    return text_scr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e73d6386",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the OS-aware default directory instead of a hardcoded absolute path.\n",
    "txt = extract_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a613a361",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create embeddings\n",
    "embeddings = Embeddings({\n",
    "    \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "    \"content\": True\n",
    "})\n",
    "\n",
    "# do indexing\n",
    "embeddings.index(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7982d00e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f94de70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# embeddings search\n",
    "print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
    "print(\"-\" * 50)\n",
    "\n",
    "for query in [\"genome\"]:\n",
    "    results = embeddings.search(query, 100)\n",
    "    for r in results:\n",
    "        print(r[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa9b189d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb5ee81e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#embeddings.save(\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c263aee0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload a previously saved index only if one exists; otherwise keep the\n",
    "# in-memory index built above (the save cell is optional/commented out,\n",
    "# so an unconditional load would fail on Restart & Run All).\n",
    "if Path(\"index\").exists():\n",
    "    embeddings = Embeddings()\n",
    "    embeddings.load(\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10c81e27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SECURITY: never hardcode API keys in a notebook -- read from the\n",
    "# environment, prompting interactively only when the variable is unset.\n",
    "# (The previously committed key should be revoked/rotated.)\n",
    "if \"OPENROUTER_API_KEY\" not in os.environ:\n",
    "    os.environ[\"OPENROUTER_API_KEY\"] = getpass(\"OpenRouter API key: \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9519cf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from txtai import LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58bce2ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
    "OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e20bf7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = \"What is Hi-C and how does it work?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41f0f066",
   "metadata": {},
   "outputs": [],
   "source": [
    "import litellm\n",
    "\n",
    "response = litellm.completion(\n",
    "    model=\"openrouter/minimax/minimax-m2.5:free\",\n",
    "    messages=[\n",
    "        {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8caf0ff4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "879c7011",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just the answer\n",
    "print(\"Answer:\", response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2f7af13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The reasoning/thinking -- not every model/provider returns this field,\n",
    "# so fall back to None instead of raising AttributeError.\n",
    "print(\"Reasoning:\", getattr(response.choices[0].message, \"reasoning_content\", None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e29bc4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token usage\n",
    "print(\"Tokens used:\", response.usage.total_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c5ca3c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# do embedding search\n",
    "question = \"How do population size fluctuations affect effective population size?\"\n",
    "results = embeddings.search(question, 3)\n",
    "context = \"\\n\\n\".join([r[\"text\"] for r in results])  # pass to llm\n",
    "\n",
    "# verify\n",
    "print(\"Retrieved from docs\")\n",
    "for r in results:\n",
    "    print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n",
    "    print()\n",
    "\n",
    "# send with context\n",
    "response = litellm.completion(\n",
    "    model=\"openrouter/minimax/minimax-m2.5:free\",\n",
    "    messages=[\n",
    "        {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
    "        },\n",
    "        {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
    "        }\n",
    "    ]\n",
    ")\n",
    "print(\"\\nllm ans\")\n",
    "print(response.choices[0].message.content)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}