Added LLM and RAG queries

2026-03-24 20:15:22 +01:00
parent cbfcf1e315
commit 012549b4bc
12 changed files with 278 additions and 2981 deletions
--- a/.env.template
+++ b/.env.template
@@ -0,0 +1,2 @@
 # API key for OpenRouter requests (required). Copy this file to .env and fill it in.
 OPENROUTER_API_KEY=
 # Optional endpoint override. LiteLLM reads OPENROUTER_API_BASE (not
 # OPENROUTER_BASE_URL); leave empty to use LiteLLM's default endpoint.
 OPENROUTER_API_BASE=
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,4 @@ Thumbs.db
 # Project specific
 data/
 .env
--- a/kg_ocr/__init__.py
+++ b/kg_ocr/__init__.py
@@ -1,4 +1,5 @@
 from .ocr import get_screenshots, extract_text
-from .embeddings import create_and_index, query_embedding
+from .embeddings import create_and_index
 from .rag import retrieve, ask_wllm
-__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"]
+__all__ = ["get_screenshots", "extract_text", "create_and_index", "retrieve", "ask_wllm"]
--- a/kg_ocr/embeddings/__init__.py
+++ b/kg_ocr/embeddings/__init__.py
@@ -1,3 +1,3 @@
-from .indexer import create_and_index, query_embedding
+from .indexer import create_and_index
-__all__ = ["create_and_index", "query_embedding"]
+__all__ = ["create_and_index"]
--- a/kg_ocr/embeddings/indexer.py
+++ b/kg_ocr/embeddings/indexer.py
@@ -15,9 +15,4 @@ def create_and_index(
    return embeddings
-def query_embedding(
+
    embeddings: Embeddings, query: str, limit: int = 100
 ) -> list[str]:
    """Search embeddings and return matching texts."""
    results = embeddings.search(query, limit)
    return [r["text"] for r in results]
--- a/kg_ocr/rag/__init__.py
+++ b/kg_ocr/rag/__init__.py
@@ -0,0 +1,3 @@
 from .query import retrieve, ask_wllm
 __all__ = ["retrieve", "ask_wllm"]
--- a/kg_ocr/rag/query.py
+++ b/kg_ocr/rag/query.py
@@ -0,0 +1,32 @@
 from txtai.embeddings import Embeddings
 from txtai import LLM
 import litellm
 from dotenv import load_dotenv
 import os
 # Runs at import time: python-dotenv loads variables from a .env file into the
 # process environment before any function in this module is called.
 # NOTE(review): presumably this is how OPENROUTER_API_KEY reaches LiteLLM — confirm.
 load_dotenv()
 def retrieve(embeddings: Embeddings, query: str, limit: int = 3) -> list[dict]:
     """Run ``query`` against ``embeddings`` and hand back the raw search hits.

     This is a thin wrapper: ``query`` and ``limit`` are forwarded to
     ``embeddings.search`` and its return value is passed through unchanged.

     Args:
         embeddings: Index object exposing ``search(query, limit)``.
         query: Search text.
         limit: Maximum number of hits to request.

     Returns:
         Whatever ``embeddings.search`` returns for the given arguments.
     """
     hits = embeddings.search(query, limit)
     return hits
 def ask_wllm(
     embeddings: Embeddings,
     question: str,
     model: str = "openrouter/minimax/minimax-m2.5:free",
     limit: int = 3,
 ) -> str:
     """RAG: retrieve context from embeddings, then answer with an LLM.

     Args:
         embeddings: Index to search. Each search result must be a mapping
             with a ``"text"`` key, since that field is used as context.
         question: The user's question; used both as the search query and
             as the question placed in the prompt.
         model: Model identifier forwarded to ``litellm.completion``.
         limit: Maximum number of search results to use as context.

     Returns:
         The message content of the first completion choice. Returns
         ``"not in my documents"`` without calling the model when the search
         yields no results, and an empty string when the model's message has
         no content.
     """
     results = retrieve(embeddings, question, limit)
     if not results:
         # With no retrieved passages the prompt would carry an empty context;
         # the system prompt already prescribes this answer for that case, so
         # skip the (billable) completion request entirely.
         return "not in my documents"

     context = "\n\n".join(r["text"] for r in results)
     response = litellm.completion(
         model=model,
         messages=[
             {
                 "role": "system",
                 "content": "Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'."
             },
             {
                 "role": "user",
                 "content": f"Context from my documents:\n{context}\n\nQuestion: {question}"
             }
         ]
     )
     # ``content`` is optional on the response message; keep the declared
     # ``str`` return type instead of leaking ``None`` to callers.
     return response.choices[0].message.content or ""
--- a/notebooks/01_ocr_sc.ipynb
+++ b/notebooks/01_ocr_sc.ipynb
--- a/notebooks/02_functions_legacy.ipynb
+++ b/notebooks/02_functions_legacy.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
   "id": "e58ed372",
   "metadata": {},
   "outputs": [],
@@ -11,34 +11,16 @@
    "from pathlib import Path\n",
    "import pytesseract\n",
    "from PIL import Image\n",
-    "from txtai.embeddings import Embeddings"
+    "from txtai.embeddings import Embeddings\n",
    "from txtai import LLM\n",
    "import litellm\n",
    "from dotenv import load_dotenv\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "id": "5f2d75d9",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'kg_scr'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkg_scr\u001b[39;00m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'kg_scr'"
     ]
    }
   ],
   "source": [
    "import kg_scr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "10a7eff9",
   "metadata": {},
   "outputs": [],
@@ -89,19 +71,51 @@
    "def query_embedding(embeddings: Embeddings, query: str, limit: int = 100) -> list[str]:\n",
    "    \"\"\"Search embeddings and return matching texts\"\"\"\n",
    "    results = embeddings.search(query, limit)\n",
-    "    return [r[\"text\"] for r in results]"
+    "    return [r[\"text\"] for r in results]\n",
    "\n",
    "def retrieve(embeddings: Embeddings, query: str, limit: int = 3) -> list[dict]:\n",
    "    \"\"\"Search embeddings and return results with scores\"\"\"\n",
    "    return embeddings.search(query, limit)\n",
    "\n",
    "def ask_wllm(embeddings: Embeddings, question: str, model: str = \"openrouter/minimax/minimax-m2.5:free\", limit: int = 3) -> str:\n",
    "    \"\"\"RAG: retrieve context from embeddings, then answer with an LLM.\"\"\"\n",
    "    results = retrieve(embeddings, question, limit)\n",
    "    context = \"\\n\\n\".join([r[\"text\"] for r in results])\n",
    "\n",
    "    response = litellm.completion(\n",
    "        model=model,\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n",
    "            },\n",
    "            {\n",
    "                \"role\": \"user\",\n",
    "                \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n",
    "            }\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
   "id": "e73d6386",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "da81a1463d0f4d5694edbfd412e52763",
+       "model_id": "41d3c1089d7346229f5c9ff7b31068a1",
       "version_major": 2,
       "version_minor": 0
      },
@@ -3466,76 +3480,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
-   "id": "a613a361",
+   "id": "81a8265a",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "# # create embeddings\n",
+    "load_dotenv()"
    "# embeddings = Embeddings({\n",
    "#     \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "#     \"content\": True,\n",
    "#     \"graph\": True,\n",
    "#     \"hybrid\": True, \n",
    "#     \"scoring\": True\n",
    "# })\n",
    "\n",
    "# # do indexing\n",
    "# embeddings.index(txt)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "id": "6f94de70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# embeddings search\n",
    "print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n",
    "print(\"-\" * 50)\n",
    "\n",
    "for query in [\"genome\"]:\n",
    "    results = embeddings.search(query, 100)\n",
    "    for r in results:\n",
    "      print(r[\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10c81e27",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"OPENROUTER_API_KEY\"] = \"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\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9519cf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from txtai import LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58bce2ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n",
    "OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e20bf7e",
   "metadata": {},
   "outputs": [],
@@ -3545,60 +3511,57 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "id": "41f0f066",
   "metadata": {},
   "outputs": [],
   "source": [
    "import litellm\n",
    "\n",
    "response = litellm.completion(\n",
    "    model=\"openrouter/minimax/minimax-m2.5:free\",\n",
    "    messages=[\n",
    "        {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "879c7011",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just the answer\n",
    "print(\"Answer:\", response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2f7af13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The reasoning/thinking\n",
    "print(\"Reasoning:\", response.choices[0].message.reasoning_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e29bc4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token usage\n",
    "print(\"Tokens used:\", response.usage.total_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c5ca3c7",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieved from docs\n",
      "[Score: 0.745] This slide explains how rapid fluctuations in population size influence the effective population size\n",
      "(N-), a key parameter in population genetics. Un...\n",
      "\n",
      "[Score: 0.726] This slide focuses on the effect of slow fluctuations in population size on the effective\n",
      "population size (V.) and emphasizes the conditions under whi...\n",
      "\n",
      "[Score: 0.640] Variable population size\n",
      "\n",
      "Beyond the Standard Neutral Model\n",
      "\n",
      "Slow fluctuations\n",
      "in population size : = =\n",
      "\n",
      "4 Need:\n",
      "A, 7 T << min[N, |\n",
      "\n",
      "...\n",
      "\n",
      "\n",
      "llm ans\n",
      "# Impact of Population Size Fluctuations on Effective Population Size\n",
      "\n",
      "Based on the provided documents, population size fluctuations significantly reduce the effective population size (Ne), primarily because Ne reflects the **harmonic mean** of population sizes over time rather than the arithmetic mean.\n",
      "\n",
      "## Key Effects:\n",
      "\n",
      "1. **Disproportionate Impact of Small Populations**: The harmonic mean is heavily influenced by periods of small population size. For example, if a population fluctuates between N and N/4, the effective population size becomes Ne = 2N—significantly smaller than the actual average population size [1].\n",
      "\n",
      "2. **Increased Coalescence**: Smaller populations have higher probabilities of coalescence, which reduces genetic diversity and lowers Ne [1].\n",
      "\n",
      "3. **Even Brief Reductions Matter**: Even short periods of population decline can greatly reduce Ne, emphasizing that temporary bottlenecks have lasting genetic consequences [1].\n",
      "\n",
      "## Rapid vs. Slow Fluctuations:\n",
      "\n",
      "- **Rapid fluctuations**: The harmonic mean formula accurately represents Ne, and population size changes are abrupt [1].\n",
      "\n",
      "- **Slow fluctuations**: When the observation time (T') is much smaller than the minimum population size (min[Nt]), the population appears relatively stable, and the harmonic mean formula may not accurately represent Ne over longer periods [2]. For the harmonic mean calculation to be meaningful, the time scale of observation must be significantly smaller than the scale of population size changes [2].\n",
      "\n",
      "**Bottom line**: Fluctuations in population size reduce genetic diversity by lowering Ne, affecting coalescence rates and increasing the impact of genetic drift [1].\n"
     ]
    }
   ],
   "source": [
    "# do embedding search\n",
    "question = \"How do population size fluctuations affect effective population size?\"\n",
--- a/notebooks/03_flow.ipynb
+++ b/notebooks/03_flow.ipynb
--- a/notebooks/03_testing.ipynb
+++ b/notebooks/03_testing.ipynb
@@ -0,0 +1,125 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "508336f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from kg_ocr import get_screenshots, extract_text, create_and_index, retrieve, ask_wllm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "11055f85",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "20e3c61fe191485da79a32e823ccd1ec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[1mBertModel LOAD REPORT\u001b[0m from: sentence-transformers/all-MiniLM-L6-v2\n",
      "Key                     | Status     |  | \n",
      "------------------------+------------+--+-\n",
      "embeddings.position_ids | UNEXPECTED |  | \n",
      "\n",
      "\u001b[3mNotes:\n",
      "- UNEXPECTED\u001b[3m\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.739] This slide focuses on the effect of slow fluctuations in population size on the effective\n",
      "population size (V.) and emphasizes the conditions under whi...\n",
      "[0.722] This slide explains how rapid fluctuations in population size influence the effective population size\n",
      "(N-), a key parameter in population genetics. Un...\n",
      "[0.712] Variable population size\n",
      "\n",
      "Beyond the Standard Neutral Model\n",
      "\n",
      "Slow fluctuations\n",
      "in population size : = =\n",
      "\n",
      "4 Need:\n",
      "A, 7 T << min[N, |\n",
      "\n",
      "...\n",
      "## How Population Size Fluctuations Affect Effective Population Size\n",
      "\n",
      "Based on the provided documents, population size fluctuations significantly impact effective population size (Ne) in the following ways:\n",
      "\n",
      "**1. Harmonic Mean Effect**\n",
      "Unlike the arithmetic mean, Ne reflects the *harmonic mean* of population sizes over time. This is disproportionately affected by periods of small population size. For example, if a population fluctuates between N and N/4, the effective population size becomes N/2—\"significantly smaller than the actual average population size\" (first slide).\n",
      "\n",
      "**2. Reduction from Small Population Periods**\n",
      "Smaller populations have higher probabilities of coalescence, which reduces genetic diversity. Even brief reductions in population size can greatly lower Ne. This is because \"smaller populations have higher probabilities of coalescence, reducing genetic diversity\" (first slide).\n",
      "\n",
      "**3. Time Scale Matters**\n",
      "The effect depends on whether fluctuations are rapid or slow:\n",
      "- **Rapid fluctuations**: The harmonic mean formula accurately represents Ne over the fluctuation period.\n",
      "- **Slow fluctuations**: Occur when the time period of interest (T') is \"much shorter than the minimum population size (min[Nt]) across the fluctuation cycle\" (second slide). In this case, population size appears relatively stable, and \"the harmonic mean formula may not accurately represent the effective population size over longer periods\" (second slide).\n",
      "\n",
      "The key takeaway is that for meaningful harmonic mean calculation, \"the time scale of observation (T') must be significantly smaller than the scale of population size changes\" (second slide).\n"
     ]
    }
   ],
   "source": [
    "screenshots = get_screenshots(\"/Users/Aman/Pictures\")\n",
    "texts = extract_text(screenshots)\n",
    "embeddings = create_and_index(texts)\n",
    "results = retrieve(embeddings, \"population size fluctuations\")\n",
    "for r in results:\n",
    "    print(f\"[{r['score']:.3f}] {r['text'][:150]}...\")\n",
    "\n",
    "answer = ask_wllm(embeddings, \"How do population size fluctuations affect effective population size?\")\n",
    "print(answer)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/setup.py
+++ b/setup.py
@@ -9,5 +9,7 @@ setup(
        "Pillow",
        "txtai",
        "sentence-transformers",
        "litellm",
        "python-dotenv"
    ],
 )
		`@@ -0,0 +1,2 @@`
							`OPENROUTER_API_KEY=`
							`OPENROUTER_BASE_URL =`
`@@ -1,3 +1,3 @@`
	`from .indexer import create_and_index, query_embedding`	`from .indexer import create_and_index`

	`__all__ = ["create_and_index", "query_embedding"]`	`__all__ = ["create_and_index"]`
		`@@ -0,0 +1,3 @@`
							`from .query import retrieve, ask_wllm`

							`__all__ = ["retrieve", "ask_wllm"]`