From cbfcf1e315657730abf3d4ea1ceaf0c56b232136 Mon Sep 17 00:00:00 2001 From: Aman Nalakath Date: Tue, 24 Mar 2026 14:58:58 +0100 Subject: [PATCH] added scaffold struct for the prj --- .gitignore | 81 + functions.ipynb | 312 -- kg_ocr/__init__.py | 4 + kg_ocr/cli/__init__.py | 0 kg_ocr/cli/build_graph.py | 3 + kg_ocr/cli/export_graph.py | 1 + kg_ocr/cli/process_screenshots.py | 1 + kg_ocr/embeddings/__init__.py | 3 + kg_ocr/embeddings/config_loader.py | 1 + kg_ocr/embeddings/indexer.py | 23 + kg_ocr/export/__init__.py | 0 kg_ocr/export/neo4j_exporter.py | 0 kg_ocr/graph/__init__.py | 0 kg_ocr/graph/analyzer.py | 1 + kg_ocr/graph/builder.py | 0 kg_ocr/ocr/__init__.py | 4 + kg_ocr/ocr/batch_processor.py | 7 + kg_ocr/ocr/constants.py | 15 + kg_ocr/ocr/extractor.py | 17 + ocr_sc.ipynb => notebooks/01_ocr_sc.ipynb | 0 notebooks/02_functions_legacy.ipynb | 3654 +++++++++++++++++++++ notebooks/03_flow.ipynb | 2827 ++++++++++++++++ setup.py | 13 + tests/test_graph.py | 0 tests/test_indexer.py | 0 tests/test_ocr.py | 0 26 files changed, 6655 insertions(+), 312 deletions(-) create mode 100644 .gitignore delete mode 100644 functions.ipynb create mode 100644 kg_ocr/__init__.py create mode 100644 kg_ocr/cli/__init__.py create mode 100644 kg_ocr/cli/build_graph.py create mode 100644 kg_ocr/cli/export_graph.py create mode 100644 kg_ocr/cli/process_screenshots.py create mode 100644 kg_ocr/embeddings/__init__.py create mode 100644 kg_ocr/embeddings/config_loader.py create mode 100644 kg_ocr/embeddings/indexer.py create mode 100644 kg_ocr/export/__init__.py create mode 100644 kg_ocr/export/neo4j_exporter.py create mode 100644 kg_ocr/graph/__init__.py create mode 100644 kg_ocr/graph/analyzer.py create mode 100644 kg_ocr/graph/builder.py create mode 100644 kg_ocr/ocr/__init__.py create mode 100644 kg_ocr/ocr/batch_processor.py create mode 100644 kg_ocr/ocr/constants.py create mode 100644 kg_ocr/ocr/extractor.py rename ocr_sc.ipynb => notebooks/01_ocr_sc.ipynb (100%) create mode 100644 notebooks/02_functions_legacy.ipynb create mode 100644 notebooks/03_flow.ipynb create mode 100644 setup.py create mode 100644 tests/test_graph.py create mode 100644 tests/test_indexer.py create mode 100644 tests/test_ocr.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a80c48c --- /dev/null +++ b/.gitignore @@ -0,0 +1,81 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# OS +.DS_Store +Thumbs.db + +# Project specific +data/ diff --git a/functions.ipynb b/functions.ipynb deleted file mode 100644 index 81092bc..0000000 --- a/functions.ipynb +++ /dev/null @@ -1,312 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e58ed372", - "metadata": {}, - "outputs": [], - "source": [ - "import platform\n", - "from pathlib import Path\n", - "import pytesseract\n", - "from PIL import Image\n", - "from txtai.embeddings import Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10a7eff9", - "metadata": {}, - "outputs": [], - "source": [ - "def_paths = {\n", - " \"Darwin\": Path.home() / \"Desktop\",\n", - " \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n", - " \"Linux\": Path.home() / \"Pictures\",\n", - "}\n", - "\n", - "sc_pathpatterns = {\n", - " \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n", - " \"Windows\": [\"Screenshot*.png\"],\n", - " \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n", - "}\n", - "\n", - "def get_screenshots(path: str | Path | None = None):\n", - " if path is None:\n", - " path = def_paths.get(platform.system(), Path.home())\n", - " os_name = platform.system()\n", - " patterns = sc_pathpatterns.get(os_name, [\"SCR*.png\"]) # assume mac\n", - " path = Path(path)\n", - " results = []\n", - " for pattern in patterns:\n", - " # results.extend(path.glob(pattern))\n", - " results.extend(str(f.absolute()) for f in path.glob(pattern))\n", - " \n", - " return sorted(set(results))\n", - "def extract_text(path, limit: int = 50):\n", - " text_scr = []\n", - " for img in get_screenshots(path):\n", - " image = Image.open(img) \n", - " text = pytesseract.image_to_string(image)\n", - " text_scr.append(text)\n", - " return text_scr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e73d6386", - "metadata": {}, - "outputs": [], - "source": [ - "txt = extract_text(\"/Users/Aman/Pictures\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a613a361", - "metadata": {}, - "outputs": [], - "source": [ - "# create embeddings\n", - "embeddings = Embeddings({\n", - " \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n", - " \"content\": True\n", - "})\n", - "\n", - "# do indexing\n", - "embeddings.index(txt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7982d00e", - "metadata": {}, - "outputs": [], - "source": [ - "#print(txt)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f94de70", - "metadata": {}, - "outputs": [], - "source": [ - "# embeddings search\n", - "print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n", - "print(\"-\" * 50)\n", - "\n", - "for query in [\"genome\"]:\n", - " results = embeddings.search(query, 100)\n", - " for r in results:\n", - " print(r[\"text\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa9b189d", - "metadata": {}, - "outputs": [], - "source": [ - "#results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb5ee81e", - "metadata": {}, - "outputs": [], - "source": [ - "#embeddings.save(\"index\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c263aee0", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = Embeddings()\n", - "embeddings.load(\"index\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10c81e27", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"OPENROUTER_API_KEY\"] = \"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\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9519cf2", - "metadata": {}, - "outputs": [], - "source": [ - "from txtai import LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98164787", - "metadata": {}, - "outputs": [], - "source": [ - "#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" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58bce2ae", - "metadata": {}, - "outputs": [], - "source": [ - "OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n", - "OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e20bf7e", - "metadata": {}, - "outputs": [], - "source": [ - "messages = \"What is Hi-C and how does it work?\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41f0f066", - "metadata": {}, - "outputs": [], - "source": [ - "import litellm\n", - "\n", - "response = litellm.completion(\n", - " model=\"openrouter/minimax/minimax-m2.5:free\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8caf0ff4", - "metadata": {}, - "outputs": [], - "source": [ - "#print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "879c7011", - "metadata": {}, - "outputs": [], - "source": [ - "# Just the answer\n", - "print(\"Answer:\", response.choices[0].message.content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b2f7af13", - "metadata": {}, - "outputs": [], - "source": [ - "# The reasoning/thinking\n", - "print(\"Reasoning:\", response.choices[0].message.reasoning_content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e29bc4c", - "metadata": {}, - "outputs": [], - "source": [ - "# Token usage\n", - "print(\"Tokens used:\", response.usage.total_tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c5ca3c7", - "metadata": {}, - "outputs": [], - "source": [ - "# do embedding search\n", - "question = \"How do population size fluctuations affect effective population size?\"\n", - "results = embeddings.search(question, 3)\n", - "context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n", - "\n", - "# verify\n", - "print(\"Retrieved from docs\")\n", - "for r in results:\n", - " print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n", - " print()\n", - "\n", - "# send with context\n", - "response = litellm.completion(\n", - " model=\"openrouter/minimax/minimax-m2.5:free\",\n", - " messages=[\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n", - " },\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n", - " }\n", - " ]\n", - ")\n", - "print(\"\\nllm ans\")\n", - "print(response.choices[0].message.content)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/kg_ocr/__init__.py b/kg_ocr/__init__.py new file mode 100644 index 0000000..31a199f --- /dev/null +++ b/kg_ocr/__init__.py @@ -0,0 +1,4 @@ +from .ocr import get_screenshots, extract_text +from .embeddings import create_and_index, query_embedding + +__all__ = ["get_screenshots", "extract_text", "create_and_index", "query_embedding"] diff --git a/kg_ocr/cli/__init__.py b/kg_ocr/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_ocr/cli/build_graph.py b/kg_ocr/cli/build_graph.py new file mode 100644 index 0000000..70db6b2 --- /dev/null +++ b/kg_ocr/cli/build_graph.py @@ -0,0 +1,3 @@ +# to do + +# add entrypoint to setuppy \ No newline at end of file diff --git a/kg_ocr/cli/export_graph.py b/kg_ocr/cli/export_graph.py new file mode 100644 index 0000000..d5364a9 --- /dev/null +++ b/kg_ocr/cli/export_graph.py @@ -0,0 +1 @@ +# to do \ No newline at end of file diff --git a/kg_ocr/cli/process_screenshots.py b/kg_ocr/cli/process_screenshots.py new file mode 100644 index 0000000..d5364a9 --- /dev/null +++ b/kg_ocr/cli/process_screenshots.py @@ -0,0 +1 @@ +# to do \ No newline at end of file diff --git a/kg_ocr/embeddings/__init__.py b/kg_ocr/embeddings/__init__.py new file mode 100644 index 0000000..1f0fb56 --- /dev/null +++ b/kg_ocr/embeddings/__init__.py @@ -0,0 +1,3 @@ +from .indexer import create_and_index, query_embedding + +__all__ = ["create_and_index", "query_embedding"] diff --git a/kg_ocr/embeddings/config_loader.py b/kg_ocr/embeddings/config_loader.py new file mode 100644 index 0000000..0ce7cef --- /dev/null +++ b/kg_ocr/embeddings/config_loader.py @@ -0,0 +1 @@ +# in future prefer a config, especially for graph traversal \ No newline at end of file diff --git a/kg_ocr/embeddings/indexer.py b/kg_ocr/embeddings/indexer.py new file mode 100644 index 0000000..ea964f8 --- /dev/null +++ b/kg_ocr/embeddings/indexer.py @@ -0,0 +1,23 @@ +from txtai.embeddings import Embeddings + + +def create_and_index( + data: list[str], model: str = "sentence-transformers/all-MiniLM-L6-v2" +) -> Embeddings: + """Create and index embeddings from text.""" + embeddings = Embeddings({ + "path": model, + "content": True, + "hybrid": True, + "scoring": "bm25", + }) + embeddings.index(data) + return embeddings + + +def query_embedding( + embeddings: Embeddings, query: str, limit: int = 100 +) -> list[str]: + """Search embeddings and return matching texts.""" + results = embeddings.search(query, limit) + return [r["text"] for r in results] diff --git a/kg_ocr/export/__init__.py b/kg_ocr/export/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_ocr/export/neo4j_exporter.py b/kg_ocr/export/neo4j_exporter.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_ocr/graph/__init__.py b/kg_ocr/graph/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_ocr/graph/analyzer.py b/kg_ocr/graph/analyzer.py new file mode 100644 index 0000000..f74afe7 --- /dev/null +++ b/kg_ocr/graph/analyzer.py @@ -0,0 +1 @@ +# also anomaly detection here \ No newline at end of file diff --git a/kg_ocr/graph/builder.py b/kg_ocr/graph/builder.py new file mode 100644 index 0000000..e69de29 diff --git a/kg_ocr/ocr/__init__.py b/kg_ocr/ocr/__init__.py new file mode 100644 index 0000000..f20aca4 --- /dev/null +++ b/kg_ocr/ocr/__init__.py @@ -0,0 +1,4 @@ +from .extractor import get_screenshots +from .batch_processor import extract_text + +__all__ = ["get_screenshots", "extract_text"] diff --git a/kg_ocr/ocr/batch_processor.py b/kg_ocr/ocr/batch_processor.py new file mode 100644 index 0000000..88934f9 --- /dev/null +++ b/kg_ocr/ocr/batch_processor.py @@ -0,0 +1,7 @@ +from PIL import Image +import pytesseract + + +def extract_text(images: list[str]) -> list[str]: + """OCR a list of image paths into text.""" + return [pytesseract.image_to_string(Image.open(img)) for img in images] diff --git a/kg_ocr/ocr/constants.py b/kg_ocr/ocr/constants.py new file mode 100644 index 0000000..ab6ee02 --- /dev/null +++ b/kg_ocr/ocr/constants.py @@ -0,0 +1,15 @@ +from pathlib import Path + +import platform + +def_paths = { + "Darwin": Path.home() / "Desktop", + "Windows": Path.home() / "Pictures" / "Screenshots", + "Linux": Path.home() / "Pictures", +} + +sc_pathpatterns = { + "Darwin": ["SCR*.png", "Screenshot*.png"], + "Windows": ["Screenshot*.png"], + "Linux": ["Screenshot*.png", "scrot*.png", "screenshot*.png"], +} diff --git a/kg_ocr/ocr/extractor.py b/kg_ocr/ocr/extractor.py new file mode 100644 index 0000000..0d55512 --- /dev/null +++ b/kg_ocr/ocr/extractor.py @@ -0,0 +1,17 @@ +import platform +from pathlib import Path +from typing import Optional + +from .constants import def_paths, sc_pathpatterns + + +def get_screenshots(path: Optional[str | Path] = None) -> list[str]: + """Find screenshot files for the current OS.""" + if path is None: + path = def_paths.get(platform.system(), Path.home()) + path = Path(path) + patterns = sc_pathpatterns.get(platform.system(), ["SCR*.png"]) + results = [] + for pattern in patterns: + results.extend(str(f.absolute()) for f in path.glob(pattern)) + return sorted(set(results)) diff --git a/ocr_sc.ipynb b/notebooks/01_ocr_sc.ipynb similarity index 100% rename from ocr_sc.ipynb rename to notebooks/01_ocr_sc.ipynb diff --git a/notebooks/02_functions_legacy.ipynb b/notebooks/02_functions_legacy.ipynb new file mode 100644 index 0000000..b788d6f --- /dev/null +++ b/notebooks/02_functions_legacy.ipynb @@ -0,0 +1,3654 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e58ed372", + "metadata": {}, + "outputs": [], + "source": [ + "import platform\n", + "from pathlib import Path\n", + "import pytesseract\n", + "from PIL import Image\n", + "from txtai.embeddings import Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5f2d75d9", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'kg_scr'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkg_scr\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'kg_scr'" + ] + } + ], + "source": [ + "import kg_scr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10a7eff9", + "metadata": {}, + "outputs": [], + "source": [ + "# default paths, sys agnostic\n", + "def_paths = {\n", + " \"Darwin\": Path.home() / \"Desktop\",\n", + " \"Windows\": Path.home() / \"Pictures\" / \"Screenshots\",\n", + " \"Linux\": Path.home() / \"Pictures\",\n", + "}\n", + "\n", + "# os spec. default filenames when it comes to screen shots\n", + "sc_pathpatterns = {\n", + " \"Darwin\": [\"SCR*.png\", \"Screenshot*.png\"],\n", + " \"Windows\": [\"Screenshot*.png\"],\n", + " \"Linux\": [\"Screenshot*.png\", \"scrot*.png\", \"screenshot*.png\"],\n", + "}\n", + "\n", + "# functions\n", + "\n", + "def get_screenshots(path: str | Path | None = None) -> list[str]:\n", + " \"\"\"Find screenshot files for the current OS\"\"\"\n", + " if path is None:\n", + " path = def_paths.get(platform.system(), Path.home())\n", + " path = Path(path)\n", + " patterns = sc_pathpatterns.get(platform.system(), [\"SCR*.png\"])\n", + " results = []\n", + " for pattern in patterns:\n", + " results.extend(str(f.absolute()) for f in path.glob(pattern))\n", + " return sorted(set(results))\n", + "\n", + "def extract_text(images: list[str]) -> list[str]:\n", + " \"\"\"OCR a list of image paths into text\"\"\"\n", + " return [pytesseract.image_to_string(Image.open(img)) for img in images]\n", + "\n", + "def create_and_index(data: list[str], model=\"sentence-transformers/all-MiniLM-L6-v2\") -> Embeddings:\n", + " \"\"\"Create and index embeddings from text\"\"\"\n", + " embeddings = Embeddings({\n", + " \"path\": model,\n", + " \"content\": True,\n", + " # \"graph\": True,\n", + " \"hybrid\": True,\n", + " \"scoring\": \"bm25\",\n", + " })\n", + " embeddings.index(data)\n", + " return embeddings\n", + "\n", + "def query_embedding(embeddings: Embeddings, query: str, limit: int = 100) -> list[str]:\n", + " \"\"\"Search embeddings and return matching texts\"\"\"\n", + " results = embeddings.search(query, limit)\n", + " return [r[\"text\"] for r in results]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e73d6386", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da81a1463d0f4d5694edbfd412e52763", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading weights: 0%| | 0/103 [00:00 Genome Biology > Article\n", + "\n", + "HiC-Pro: an optimized and flexible pipeline\n", + "for Hi-C data processing\n", + "\n", + "Software | Openaccess | Published: 01 December 2015\n", + "Volume 16, articlenumber 259, (2015) Cite this article\n", + "\n", + "Download PDF @ You have full access to this open access article\n", + "\n", + "Nicolas Servant 4, Nelle Varoquaux, Bryan R. Lajoie, Eric Viara, Chong-Jian Chen, Jean-Philippe Vert,\n", + "Edith Heard, Job Dekker & Emmanuel Barillot\n", + "\n", + "S) 65k Accesses f) 1404 Citations & 19 Altmetric & 3 Mentions Exploreall metrics >\n", + "\n", + "Abstract\n", + "\n", + "\n", + "o-Yof = AinTiating: H1iv—-rro-master/scripts/srce/cutsite_trimming.cpp\n", + "5.037 creating: HiC-Pro-master/test-op/\n", + "\n", + "5.038 inflating: HiC-Pro-master/test-op/config_test_as.txt\n", + "\n", + "5.038 inflating: HiC-Pro-master/test-op/config_test_cap.txt\n", + "\n", + "5.038 inflating: HiC-Pro-master/test-op/config_test_dnase.txt\n", + "\n", + "5.038 inflating: HiC-Pro-master/test-op/config_test_latest.txt\n", + "5.038 inflating: HiC-Pro-master/test-op/run-test-op.sh\n", + "\n", + "5.038 finishing deferred symbolic links:\n", + "\n", + "5.038 HiC-Pro-master/doc/themes/paris/logos -> ../../_static/logos/\n", + "5.095 Make sure internet connection works for your shell prompt under current user's privilege ...\n", + "5.096 Starting HiC-Pro installation !\n", + "\n", + "5.122 Exit - Error : Configuration file not found\n", + "\n", + "41 # Install HiC-Pro\n", + "\n", + "42 | >>> RUN cd /opt && \\\n", + "\n", + "43 | >>> wget https://github.com/nservant/HiC-Pro/archive/master.zip -O hicpro_latest.zip && \\\n", + "\n", + "44 | >>> unzip hicpro_latest.zip && \\\n", + "\n", + "45 | >>> cd HiC-Pro-master/scripts/install && \\\n", + "\n", + "46 | >>> bash install_dependencies.sh -c config-install.txt -p /opt/hicpro -o /opt/hicpro/HiC-Pro_3.1.@ -q && \\\n", + "47 | >>> cd /opt/HiC-Pro-master && \\\n", + "\n", + "48 | >>> make install && \\\n", + "\n", + "49 | >>> 1n -s /opt/hicpro/bin/HiC-Pro /usr/local/bin/HiC-Pro && \\\n", + "\n", + "5@ | >>> rm -rf /opt/hicpro_latest.zip /opt/HiC-Pro-master\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "corrected Hi-C counts\n", + "\n", + "10!\n", + "\n", + "10°\n", + "\n", + "107?\n", + "\n", + "10°\n", + "genomic distance\n", + "\n", + "—— data_mcool.h5\n", + "\n", + "> Decay curve\n", + "\n", + "> First converted into .h5\n", + "format\n", + "\n", + "> HiCExplorer—-\n", + "\n", + "hicPlotDistVsCounts()\n", + "> Data quality and\n", + "\n", + "comparison\n", + "\n", + "4.524 HiC-Pro-master/doc/themes/paris/logos -> ../../_static/logos/\n", + "\n", + "4.575 Make sure internet connection works for your shell prompt under current user's privilege ...\n", + "\n", + "4.575 Starting HiC-Pro installation !\n", + "\n", + "4.976 Checking dependencies\n", + "\n", + "4.976 - Python libraries ...0K\n", + "\n", + "6.765 — R installation ...0K\n", + "\n", + "9.515 - Bowtie2 installation ...0K\n", + "\n", + "9.531 - Samtools installation ...0K\n", + "\n", + "9.590\n", + "\n", + "9.598 Checking HiC-Pro configuration\n", + "\n", + "9.758 - Configuration for TORQUE/PBS system ...0K\n", + "\n", + "9.758\n", + "\n", + "9.758 done !\n", + "\n", + "9.844 (g++ -Wall -02 -std=c++@x -o build_matrix /opt/HiC-Pro-master/scripts/src/build_matrix.cpp; mv build_matrix /opt/HiC-Pro-master/scripts)\n", + "16.47 (g++ -Wall -02 -std=c++@x -o cutsite_trimming /opt/HiC-Pro-master/scripts/src/cutsite_trimming.cpp; mv cutsite_trimming /opt/HiC-Pro-master/scripts)\n", + "19.24 realpath: /opt/hicpro/HiC-Pro_3.1.@: No such file or directory\n", + "\n", + "19.25 cp -Ri /opt/HiC-Pro-master /opt/hicpro/HiC-Pro_3.1.0\n", + "\n", + "19.26 cp: cannot create directory '/opt/hicpro/HiC-Pro_3.1.@': No such file or directory\n", + "\n", + "19.27 make: *** [Makefile:78: cp] Error 1\n", + "\n", + "Dockerfile:42\n", + "\n", + "# Install HiC-Pro\n", + "\n", + "41 |\n", + "42 | >>> RUN cd /opt && \\\n", + "43 | >>> wget https://github.com/nservant/HiC-Pro/archive/master.zip -O hicpro_latest.zip && \\\n", + "44 | >>> unzip hicpro_latest.zip && \\\n", + "45 | >>> cd HiC-Pro-master && \\\n", + "46 | >>> bash scripts/install/install_dependencies.sh -c config-install.txt -p /opt/hicpro -o /opt/hicpro/HiC-Pro_3.1.0 -q && \\\n", + "47 | >>> make install && \\\n", + "48 | >>> 1n -s /opt/hicpro/bin/HiC-Pro /usr/local/bin/HiC-Pro && \\\n", + "49 | >>> rm -rf /opt/hicpro_latest.zip /opt/HiC-Pro-master\n", + "|\n", + "\n", + "5@\n", + "\n", + "\n", + "Visualization: HiGlass\n", + "\n", + "HICCUPs juicer_tools:\n", + "\n", + "bedpe file\n", + "\n", + "¥\n", + "\n", + "Enrichmnet: Juicer\n", + "APA,\n", + "TADs: Arrowhead\n", + "\n", + "Juicer\n", + "\n", + "v\n", + "\n", + "Visualization: JuiceBox\n", + "Analysis: HiC Straw\n", + "\n", + "Trimmomatic, FostQC\n", + "\n", + "HIC-Pro\n", + "(Current)\n", + "\n", + "validpairs file\n", + "\n", + "¥\n", + "\n", + "Analysis: Cooler\n", + "library python\n", + "\n", + ">\n", + "\n", + "FitHiC2 loop caller\n", + "\n", + "Enrichment:\n", + "coolpup.py\n", + "\n", + "HiC - Pro Juicer\n", + "\n", + "Parailel Computing Hi-C Fragment\n", + "A Sequenced Alignment and Duplicate Map creation\n", + "Hi-C Reads Chimera Handling Merge Sort removal\n", + "a on\n", + "==\" a —— RI R2\n", + "Sequencing © ———— SSS EES ESS\n", + "Ey SSS SSS . > .\n", + "\n", + "ae a ee : -.\n", + "\n", + "\n", + "Visualization: HiGlass,\n", + "JuicaBox\n", + "\n", + "HICCUPS juicer_tools:\n", + "\n", + "-bedpe file\n", + "\n", + "Enrichmnet Juicer\n", + "\n", + "APA,\n", + "TADS: Arrowhead\n", + "\n", + "Juicer\n", + "\n", + "‘Timmomatic, FastQC\n", + "\n", + "Hic-Pro,\n", + "\n", + "tbedpe ~—————>_GenomicLinks\n", + "\n", + "Visualization: JuiceBox\n", + "Anolysis: Hic Straw\n", + "\n", + "Juicer\n", + "dump\n", + "\n", + "Hic-Pro -\n", + "build_matrix\n", + "\n", + "Individual Matrices <——\n", + "\n", + "Analysis: Cooler\n", + "liorary python,\n", + "\n", + "> FitHiC2 loop caller\n", + "\n", + "Enrichment:\n", + "‘coolpup.ey\n", + "\n", + "Visualization: HiGloss\n", + "\n", + "Visualization: HiGlass\n", + "\n", + "HICCUPS juicer_tools:\n", + "\n", + "bbedpe file\n", + "\n", + "Enrichmnet: Juicer\n", + "APA,\n", + "‘TADs: Arrowhead\n", + "\n", + "Juicer\n", + "\n", + "Visualization: JuiceBox\n", + "Analysis: HIC Straw\n", + "\n", + "Tiimmomatic, FastQc\n", + "\n", + "Hic-Pro\n", + "(Current)\n", + "\n", + "validpairs file\n", + "\n", + "Analysis: Cooler\n", + "library python\n", + "\n", + ">\n", + "\n", + "FithiC2 loop caller\n", + "\n", + "Enrichment:\n", + "coolpup.py\n", + "\n", + "corrected Hi-C counts\n", + "\n", + "10!\n", + "\n", + "10°\n", + "\n", + "107}\n", + "\n", + "104\n", + "\n", + "10°\n", + "genomic distance\n", + "\n", + "10®\n", + "\n", + "—— data_mcool.h5\n", + "\n", + "\n", + "Aman\n", + "_——\n", + "\n", + "Parallel Computing Hi-C Fragment\n", + "\n", + "——\n", + "\n", + "—_—_—\n", + "—\n", + "—4 Sequencing ——\n", + "\n", + "Singleton\n", + "Low MAPQ\n", + "\n", + "Dumped Pairs\n", + "\n", + "\n", + "706883\n", + "706884\n", + "706886\n", + "706885\n", + "706887\n", + "706888\n", + "706890\n", + "706891\n", + "706892\n", + "706889\n", + "706875\n", + "706873\n", + "706876\n", + "706874\n", + "1\n", + "\n", + "1321\n", + "\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "root\n", + "\n", + "messagebu\n", + "\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "20\n", + "\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "2866M\n", + "23440\n", + "23448\n", + "19992\n", + "19992\n", + "\n", + "164M\n", + "\n", + "9456\n", + "\n", + "2391M\n", + "2393M\n", + "2391M\n", + "2393M\n", + "2393M\n", + "2391M\n", + "2391M\n", + "2393M\n", + "2391M\n", + "2393M\n", + "7344\n", + "7344\n", + "2252\n", + "2136\n", + "11788\n", + "3364\n", + "\n", + "2412\n", + "2324\n", + "2412\n", + "2324\n", + "2324\n", + "2412\n", + "2412\n", + "2324\n", + "2412\n", + "2324\n", + "2372\n", + "2372\n", + "1720\n", + "1604\n", + "6216\n", + "1908\n", + "\n", + "NANNNNHNDDDDDDDDANN\n", + "\n", + "400.\n", + "400.\n", + "10@.\n", + "10@.\n", + "10@.\n", + "10@.\n", + "10@.\n", + "10@.\n", + "10@.\n", + "\n", + "DOAAD\n", + "\n", + "PLCTCTDPSORPRRPRRPRREBRBR\n", + "POSCTCT®VVDDVD0D0090 0\n", + "\n", + "VPVTVTAAAD\n", + "\n", + "45h22:\n", + "45h22:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "11h20:\n", + "Q4.\n", + "13.\n", + "40.\n", + "44.\n", + ":1@.\n", + "18.\n", + "\n", + "51:\n", + "50:\n", + "39:\n", + "39:\n", + "\n", + "10:\n", + "\n", + "46\n", + "32\n", + "32\n", + "32\n", + "32\n", + "32\n", + "31\n", + "31\n", + "32\n", + "81\n", + "12\n", + "59\n", + "96\n", + "44\n", + "96\n", + "\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "/usr/\n", + "\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "oca\n", + "\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.\n", + "perl /usr/local/anaconda/envs/HiC-Pro_\n", + "\n", + "VPV®VVVVVVOVO\n", + "\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "-0/\n", + "\n", + "v3.\n", + "\n", + "perl /usr/local/anaconda/envs/HiC-Pro_v3.\n", + "/anaconda/envs/HiC-Pro_v3.0.0/\n", + "/anaconda/envs/HiC-Pro_v3.0.0/\n", + "/lib/systemd/systemd --system --deserialize 33\n", + "@dbus—daemon --system —-address=systemd:\n", + "\n", + "/usr/\n", + "/usr/\n", + "\n", + "oca\n", + "oca\n", + "\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "bin/bowtie2-align-s --wrapper\n", + "\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "basic-®\n", + "\n", + "--very-sensitive\n", + "—-very-sensitive\n", + "--very-sensitive\n", + "--very-sensitive\n", + "--very-sensitive\n", + "--very-sensitive\n", + "--very-sensitive\n", + "--very-sensitive\n", + "—-very-sensitive\n", + "—-very-sensitive\n", + "\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "30\n", + "\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "--score-min\n", + "—-score-min\n", + "\n", + "DAARBAAADH\n", + "NNNNNNNNN\n", + "\n", + "Pere rere\n", + "SoooKoKOOOO\n", + "\n", + "--end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "—-end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "--end-to-end\n", + "\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "—-reo\n", + "\n", + "@.0/bin/bowtie2 --very-sensitive -L 3@ --score-min L,-@.6,-@.2 --end-to- end --reorder --un bowtie_resu\n", + "@.0/bin/bowtie2 --very-sensitive -L 3@ --score-min L,-@.6,-@.2 --end-to-end --reorder --un bowtie_resu\n", + "\n", + "bin/samtools view -F 4 -bS —\n", + "bin/samtools view -F 4 -bS —\n", + "\n", + "--nofork --nopidfile --systemd-activation --syslog-only\n", + "\n", + "Fig x: Visualization in Juicebox for two HiC datasets\n", + "\n", + "The 10*10 chromosomes full contact matrix was visualized in Juicebox GUI app by importing files\n", + "locally. The left panel shows the matrix from the cis-regulatory elements in Maize study and the one\n", + "on the right is from (7). Even though the raw hic sequencing data was trimmed correctly the second\n", + "dataset showed poor quality as is evident from the figure. The noise was high and HiCCUPs couldn't\n", + "find loops correctly.\n", + "\n", + "-hic & .cooV.mcool:; Binary formats for Hi-C data\n", + "> Compressed contact matrices at multiple resolutions\n", + "Genomic intervals for binned data\n", + "\n", + ">\n", + "> Interaction frequencies between loci\n", + "> Supports multiple bin sizes & corrections in one file\n", + "\n", + "\n", + "GQAAAGP RPP PPP PPP PRP PPP PPP PP RS\n", + "\n", + "«/16\n", + "-717\n", + "-717\n", + "-717\n", + "-718\n", + "-718\n", + "-718\n", + "-718\n", + "-718\n", + "-719\n", + "-719\n", + "-719\n", + "-719\n", + "-720\n", + "-720\n", + "-720\n", + "-720\n", + "+721\n", + "- 786\n", + "-814\n", + "+917\n", + "-969\n", + "-969\n", + "-340\n", + "-341\n", + "-342\n", + "-343\n", + "-346\n", + "\n", + "inflating: hiC-Pro-master/scripts/onlarget.py\n", + "\n", + "inflating: HiC-Pro-master/scripts/plot_hic_contacts.R\n", + "\n", + "inflating: HiC-Pro-master/scripts/plot_hic_fragment.R\n", + "\n", + "inflating: HiC-Pro-master/scripts/plot_mapping_portion.R\n", + "\n", + "inflating: HiC-Pro-master/scripts/plot_pairing_portion.R\n", + "\n", + "inflating: HiC-Pro-master/scripts/split_valid_interactions.py\n", + "\n", + "creating: HiC-Pro-master/scripts/src/\n", + "\n", + "extracting: HiC-Pro-master/scripts/src/README\n", + "\n", + "inflating: HiC-Pro-master/scripts/src/build_matrix.cpp\n", + "\n", + "inflating: HiC-Pro-master/scripts/src/cutsite_trimming.cpp\n", + "\n", + "creating: HiC-Pro-master/test-op/\n", + "\n", + "inflating: HiC-Pro-master/test-op/config_test_as.txt\n", + "\n", + "inflating: HiC-Pro-master/test-op/config_test_cap.txt\n", + "\n", + "inflating: HiC-Pro-master/test-op/config_test_dnase.txt\n", + "\n", + "inflating: HiC-Pro-master/test-op/config_test_latest.txt\n", + "\n", + "inflating: HiC-Pro-master/test-op/run-test-op.sh\n", + "finishing deferred symbolic links:\n", + "\n", + "HiC-Pro-master/doc/themes/paris/logos -> ../../_static/logos/\n", + "make -f ./scripts/install/Makefile CONFIG_SYS=./config-install.txt prefix=/opt/hicpro\n", + "make[1]: Entering directory '/opt/HiC-Pro-master'\n", + "./scripts/install/install_dependencies.sh -c ./config-install.txt -p /opt/hicpro -o /opt/hicpro/HiC-Pro_3.1.@ -q\n", + "Make sure internet connection works for your shell prompt under current user's privilege ...\n", + "Starting HiC-Pro installation !\n", + "Directory /opt/hicpro does not exist!\n", + "Exit - Error - unable to install/check dependancies !\n", + "make[1]: **x* [scripts/install/Makefile:41: configure] Error 1\n", + "make[1]: Leaving directory '/opt/HiC-Pro-master'\n", + "make: **x* [Makefile:38: configure] Error 2\n", + "\n", + "40 | # Install HiC-Pro\n", + "\n", + "41 | >>> RUN cd /opt && \\\n", + "\n", + "42 | >>> wget https://github.com/nservant/HiC-Pro/archive/master.zip -O hicpro_latest.zip && \\\n", + "\n", + "43 | >>> unzip hicpro_latest.zip && \\\n", + "\n", + "44 | >>> cd HiC-Pro-master && \\\n", + "\n", + "45 | >>> make configure prefix=/opt/hicpro && \\\n", + "\n", + "46 | >>> make install && \\\n", + "\n", + "47 | >>> 1n -s /opt/hicpro/bin/HiC-Pro /usr/local/bin/HiC-Pro && \\\n", + "\n", + "48 | >>> rm -rf /opt/hicpro_latest.zip /opt/HiC-Pro-master\n", + "\n", + "49 |\n", + "ERROR: failed to solve: process \"/bin/sh -c cd /opt && wget https://github.com/nservant/HiC-Pro/archive/master.zip -O hicpro_latest.zip && unzip hicpro_latest.zip && cd HiC-Pro-master && make\n", + "configure prefix=/opt/hicpro && make install && ln -s /opt/hicpro/bin/HiC-Pro /usr/local/bin/HiC-Pro && rm -rf /opt/hicpro_latest.zip /opt/HiC-Pro-master\" did not complete successfully: exit code:\n", + "\n", + "2\n", + "\n", + "A Sequenced\n", + "Hi-C Reads\n", + "\n", + "Alignment and\n", + "Chimera Handling Merge Sort\n", + "\n", + "SS Sass = SS\n", + "—_ oo i\n", + "—\n", + "SSS oo\n", + "a\n", + "\n", + "oe OT\n", + "\n", + "Duplicate\n", + "removal\n", + "\n", + "Map creation\n", + "\n", + "i\n", + "—————\n", + "\n", + "\n", + "-hic & .cool/.mcool: Binary formats for Hi-C data\n", + "Compressed contact matrices at multiple resolutions\n", + "\n", + "Genomic intervals for binned data\n", + "Interaction frequencies between loci\n", + "Supports multiple bin sizes & corrections in one file\n", + "\n", + "\n", + "Overall Interpretation\n", + "\n", + "e The data show a good proportion of valid Hi-C contacts (17.40%), but a large number of reads\n", + "(64.87%) are excluded due to low quality (MAPQ). This could be due to sequence complexity,\n", + "\n", + "genome alignment issues, or technical problems during sequencing.\n", + "\n", + "e The balance in pair types and dominance of intra-chromosomal contacts indicate proper\n", + "\n", + "library preparation and plausible results for downstream analysis.\n", + "\n", + "e Long-range contacts provide meaningful insights into chromatin organization and can be\n", + "\n", + "used for modeling chromosomal structure.\n", + "\n", + "workflow_aman\n", + "\n", + "a i\n", + "a\n", + "ToDo\n", + "\n", + "hic hic2cool cool\n", + "\n", + "plot\n", + "\n", + "matrix (exported from\n", + "\n", + "juicerbon) Python script Plot\n", + "\n", + "\n", + "QW 6B github.com/kuikui-C/DconnLoop W © Search Startpage\n", + "\n", + "(1) README o\n", + "\n", + "pip install matplotlib\n", + "conda install hicexplorer\n", + "conda activate DconnLoop\n", + "\n", + "Usage\n", + "\n", + "The input data used can be downloaded in the supplementary materials of the paper. The input contact maps use\n", + "the cool file format, which, if needed, can be converted and normalized using the HiCExplorer's hicConvertFormat\n", + "command.\n", + "\n", + "HiC to cool\n", + "\n", + "hicConvertFormat -m ./ENCFFQ97SKJ.hic --inputFormat hic --outputFormat cool -o ./ENCFF@97SKJ.c oO\n", + "hicConvertFormat -m ./ENCFFQ97SKJ_10000.cool --inputFormat cool —-outputFormat cool -o ./ENCFF\n", + "\n", + "Generate positive and negative samples\n", + "\n", + "python PosNeg_Samp_Gen.py -p ./input/gm12878/Ra02014—GM12878-MboI-allreps—filtered.1@kb.cool — oO\n", + "\n", + "Training\n", + "\n", + "python leave_one_train.py -d ./PosNeg_samp/ -g 1,2,3 —b 256 -lr @.001 -e 3@ -w 0.0005 -c ./mod oO\n", + "\n", + "Testing\n", + "\n", + "python leave_one_test.py -d ./PosNeg_samp/ -g 1,2,3 -c ./model/ -f ./model/chri5-record_test. oO\n", + "\n", + "Score\n", + "\n", + "python score_chromosome.py -p ./input/gm12878/Ra02014—GM12878-MboI-allreps—filtered.1@kb.cool oO\n", + "\n", + "Cluster\n", + "\n", + "python cluster.py -d 5 -i ./scores/chr15.bed -r 10000 -m 0.97 -p 75 -e 10 -o ./cluster/chr15-L oO\n", + "\n", + "@ Terminal Shell Edit View Window Help SU GB O+ 8 © & WD ® F Q B® SatFeb15 12:24\n", + "\n", + "ee@ aman — aman@unicorn: ~/fihic_bias — ssh -L 9005:localhost:9005 aman@10.162.143.69 — 208x63\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 500000 -f @.15 -p 1.5 -i 12 -d 250000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 500000 -f @.15 -p 2 -i 12 -d 250000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 500000 -f @.15 -p 1 -i 12 -d 250000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 100000 -f @.2 -p 2 -i 12 -d 250000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 1000000 -f @.2 -p 2 -i 12 -d 250000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.0@.jar hiccups --cpu —-threads 16 -r 25000 -f @.2 -p 2 -i 12 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@0kb/\n", + "\n", + "-20.00.jar hiccups --cpu —-threads 16 -r 50000 -f @.2 -p 2 -i 12 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@0kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 25000 -f @.25 -p 2 -i 14 -d 25000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "-20.00.jar hiccups --cpu --threads 16 -r 50000 -f @.25 -p 2 -i 14 -d 50000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_5@@kb/\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_too -20.00.jar hiccups --cpu --threads 16 -r 5000,10000,25000 -f 0.30 -p 1.5 -i 10 -d 50000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_results/\n", + "java -jar ~/juicer/CPU/common/juicer_too -20.00.jar hiccups --cpu --threads 16 -r 5000, 25000 -f @.3@ -p 1 -i 10 -d 50000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_results/\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 25000 -f @.3@ -p 1 -i 10 -d 50000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_results/\n", + "\n", + "nano /home/aman/hiccups_results/enriched_pixels_25000.bedpe\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -m 512 -c all -r 5000,10000 -k KR -f .1,.1 -p 4,2 -i 7,5 -t @.02,1.5,1.75,2 -d 20000,20000,50000 /mnt/storage3/aman/hicpro2jui\n", + "cebox/data.allValidPairs.hic ~/hiccups_optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -m 512 -c all -r 5000,10000 -k KR -f .1,.1 -p 4,2 -i 7,5 -t @.02,1.5,1.75,2 -d 20000, 25000,50000 /mnt/storage3/aman/hicpro2jui\n", + "cebox/data.allValidPairs.hic ~/hiccups_optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -m 512 -c all -r 5000,10000 -k KR -f .1,.1 -p 4,2 -i 7,5 -t @.02,1.5,1.75,2 -d 20000,50000 /mnt/storage3/aman/hicpro2juicebox/\n", + "data.allValidPairs.hic ~/hiccups_optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 5000,10000 -k KR -f .1 -p 4 -i 7 -t @.02,1.5,1.75,2 -d 20000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hi\n", + "ccups_optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 5000,10000 -f .1 -p 4 -i 7 -t @.02,1.5,1.75,2 -d 20000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_\n", + "optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 5000,10000 -f 2 -p 4 -i 7 -t @.02,1.5,1.75,2 -d 20000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccups_o\n", + "ptimized_results/\n", + "\n", + "cd ~/hiccups_optimized_results/\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 5000,10000 -f @.2 -p 4 -i 7 -t 0.02,1.5,1.75,2 -d 20000 /mnt/storage3/aman/hicpro2juicebox/data.allValidPairs.hic ~/hiccup\n", + "_optimized_results/\n", + "\n", + "~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups -h\n", + "\n", + "~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups\n", + "\n", + "cat ~/.bash_history | hiccups\n", + "\n", + "cat ~/.bash_history | grep hiccups\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 10000 -i /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_10kb/\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -i /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_1@kb/\n", + "\n", + "mkdir hiccups2_1@kb\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 10000 -i /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_10kb/\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 10000 /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_10kb/\n", + "\n", + "1s -lh ~/hiccups2_1@kb/\n", + "\n", + "we -l ~/hiccups2_10kb/fdr_thresholds_10000\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 10000 /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_10kb/\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 /mnt/storage3/aman/data.allValidPairs.hic ~/hiccups2_10kb/\n", + "\n", + "1s ~/hiccups2_10kb/\n", + "\n", + "we -l1 ~/hiccups2_10kb/*\n", + "\n", + "cat ~/hiccups2_10kb/fdr_thresholds_5000\n", + "\n", + "1s -ltrh ~/hiccups2_10kb/\n", + "\n", + "(base) aman@unicorn:~/fihic_bias$ java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 1000@ -i /mnt/storage3/aman/data.allValidPairs.hic ~/hiccupsfinal_10kb/\n", + "\n", + "WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n", + "\n", + "WARN [2025-@2-15T11:23:46,503] [Globals.java:138] [main] Development mode is enabled\n", + "\n", + "Usage: juicer_tools hiccups [-m matrixSize] [-k normalization (NONE/VC/VC_SQRT/KR)] [-c chromosome(s)] [-r resolution(s)] [--restrict] [-f fdr] [-p peak width] [-i window] [-t thresholds] [-d centroid dista\n", + "neces] [specified_loop_list]\n", + "\n", + "(base) aman@unicorn:~/fihic_bias$ java -jar ~/juicer/CPU/common/juicer_tools.2.20.0@.jar hiccups --cpu --threads 16 -r 10000 /mnt/storage3/aman/data.allValidPairs.hic ~/hiccupsfinal_10kb/\n", + "\n", + "WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n", + "\n", + "WARN [2025-@2-15T11:24:14,443] [Globals.java:138] [main] Development mode is enabled\n", + "\n", + "Reading file: /mnt/storage3/aman/data.allValidPairs.hic\n", + "\n", + "Using the following configurations for HiCCUPS:\n", + "\n", + "Config res: 10000 peak: 2 window: 5 fdr: 10% radius: 20000\n", + "\n", + "WARNING - You are using the CPU version of HiCCUPS.\n", + "\n", + "The GPU version of HiCCUPS is the official version and has been tested extensively.\n", + "\n", + "The CPU version only searches for loops within 8MB (by default) of the diagonal and is still experimental.\n", + "\n", + "Using 16 CPU thread(s) for primary task\n", + "\n", + "Warning Hi-C map may be too sparse to find many loops via HiCCUPS.\n", + "\n", + "Running HiCCUPS for resolution 10000\n", + "\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "java -jar ~/juicer/CPU/common/juicer_too\n", + "\n", + "DHOHDHHHHHHHD\n", + "NNNNNNNNNNN\n", + "\n", + "\n", + "Upregulated Downregulated\n", + "\n", + "H3K27me’ signal(cp) at upregulated genes HoKzrmed slgnalcotrt at uprepuaes genes nes\n", + "3x27 me3 signal HaK2763 signal\n", + "\n", + "‘ene astce (9) ‘ge dace\n", + "\n", + "In [54]: import fanc\n", + "import fanc.peaks\n", + "import fanc.plotting as fancplot\n", + "\n", + "import logging\n", + "logging. basicConfig(level=logging. INFO, format=\"%(asctime)s %(levelname)s %(message)s\")\n", + "\n", + "hic_data = fanc. load('/mnt/storage3/aman/wdbasejuicer_new/aligned/inter_3@.hic')\n", + "loop_caller = fanc.RaoPeakCaller()\n", + "\n", + "/home/aman/. lLocal/lib/python3.10/site-packages/fanc/compatibility/juicer.py:330: UserWarning: No resolution chosen\n", + "for Juicer Hic - using 2500000bp. Specify a custom resolution using <.hic file>@\n", + "warnings.warn(\"No resolution chosen for Juicer Hic - using {}bp. \"\n", + "/home/aman/. lLocal/lib/python3.10/site-packages/fanc/compatibility/juicer.py:353: UserWarning: Support for Juicer .h\n", + "ic v9 is still in beta. Please report any issues to https://github.com/vaquerizas lab/fanc/issues/92\n", + "warnings.warn(f\"Support for Juicer .hic v{self.version} is still in beta. \"\n", + "\n", + "ne...@broadinstitute.org Jan 18, 2019, 8:16:32PM y+ roN\n", + "to AS, 3D Genomics\n", + "\n", + "Hello,\n", + "You can just run HiCCUPS or Arrowhead on the hic file using the latest jar: https://github.com/aidenlab/juicer/wiki/Download\n", + "There is extensive documentation here: https://github.com/aidenlab/juicer/wiki/CPU-HiCCUPS\n", + "\n", + "Please note: 300 million reads is not enough to reliably call loops. We also do not recommend domain calling at this depth. The ENCODE standard for loop\n", + "calling is 2 billion reads.\n", + "\n", + "Best\n", + "Neva\n", + "\n", + "You received this message because you are subscribed to the Google Groups \"3D Genomics\" group.\n", + "\n", + "To unsubscribe from this group and stop receiving emails from it, send an email to 3d-genomics...@googlegroups.com.\n", + "To view this discussion on the web visit https://groups.google.com/d/msgid/3d-genomics/761 6da19-9387-4c46-99d6-\n", + "ef852e2b0170%40googlegroups.com.\n", + "\n", + "For more options, visit https://groups.google.com/d/optout.\n", + "\n", + "Neva Cherniavsky Durand, Ph.D.\n", + "Staff Scientist, Aiden Lab\n", + "www.aidenlab.org\n", + "\n", + "> ValidPairs file from HiC-Pro used\n", + "as pre-input. 78M entries. Format:\n", + "\n", + "chri start1l endl chr2 start2 end2 readID strand1 strand2\n", + "\n", + "> .bedpe format (input):\n", + "\n", + "chri start1l endl chr2 start2 end2\n", + "\n", + "> Output csv\n", + "format:\n", + "\n", + "chr sl el chr s2 e2 prob interacted\n", + "\n", + "> 50000 entry bedpe file - 11249\n", + "with interacted score 1\n", + "\n", + "\n", + "Hi-C Signal\n", + "\n", + "25\n", + "\n", + "N\n", + "°\n", + "\n", + "rR\n", + "uw\n", + "\n", + "10\n", + "\n", + "Interaction Decay\n", + "\n", + "—— Row Sum (interactions by position)\n", + "—— Column Sum (Interactions by position)\n", + "\n", + "2 3 4 5 6\n", + "Position Relative to Anchor\n", + "\n", + "EXPLORER\n", + "\n", + "\\ AMAN [SSH: SCC]\n", + "\n", + "> hic-pro-git\n", + "\n", + "> mustache-git\n", + "\n", + "chrom.sizes\n", + "\n", + "cool_balance.sh\n", + "GEO2457_5kb_mustache_loops.bedpe\n", + "GEO2457_5kb.cool\n", + "GEO2457_dots_5kb.bedpe\n", + "GEO2457_expected_1kb.tsv\n", + "GEO2457_expected_5kb.tsv\n", + "GEO2457_v2.mcool\n", + "\n", + "GEO2457.hic\n", + "GEO2459_5kb_mustache_loops.bedpe\n", + "GEO2459_5kb.cool\n", + "GEO2459_expected_5kb.tsv\n", + "GEO2459_v2_expected_cis.tsv\n", + "GEO2459_v2.mcool\n", + "\n", + "GEO2459.hic\n", + "\n", + "$ hic2cool_aman.sh\n", + "\n", + "$ test.sh\n", + "\n", + "mY 6 oO DB\n", + "\n", + "6]\n", + "\n", + "@\n", + "\n", + "hy «D “OUTLINE\n", + "\n", + "PP aman [SSH: SCC]\n", + "\n", + "Show All Commands\n", + "Go to File\n", + "\n", + "Find in Files\n", + "\n", + "Toggle Full Screen\n", + "\n", + "Show Settings\n", + "\n", + "Veeb dls MOODLE E-mail Help\n", + "\n", + "Tah, Catalogue Dashboard My courses Q Dp Aman Shamil Nalakath © aa\n", + "\n", + "Bioinformatics Il MOOC: View: Overview report\n", + "\n", + "Bioinformatics Il information 2024 Course Participants Grades\n", + "\n", + "General Introduction to Bioinformatics II - Ol...\n", + "\n", + "Introduction to the course\n", + "\n", + "Course info (link to study information syste... Overview report\n", + "\n", + "Teacher's announcements\n", + "\n", + "Course participant's forum (ask questions fr... Aman Shamil Nalakath\n", + "\n", + "Project Work 1 - Genome project plan\n", + "\n", + "Grade\n", + "\n", + "Bioinformatics group project example from ... Course name\n", + "\n", + "Week 1 Bioinformatics II MOOC 97.00\n", + "\n", + "Week 1 general discussion\n", + "\n", + "Lecture 1 A - Introduction\n", + "\n", + "Video: Lecture 1 A - Introduction\n", + "\n", + "Students introduction and aims (DL 12.09. 2...\n", + "\n", + "How to (seriously) read a scientific paper\n", + "\n", + "Article 1\n", + "\n", + "Lecture 1B - Setting up HPC Access\n", + "\n", + "Video: Lecture 1 B - Setting up HPC access TAL\n", + "\n", + "Meet and greet\n", + "\n", + "Get the mobile app\n", + "\n", + "Coursework 1 on Article 1: Aspects of geno... Policies\n", + "\n", + "\n", + "Leaf Hi-C K4me3 HiChIP K27me3 HiChIP\n", + "\n", + "face mar rapa mat\n", + "\n", + "eQTL-gene\n", + "links >20 kb\n", + "\n", + "shuffled pairs :\n", + "\n", + "\n", + "Sequencing\n", + "\n", + "Sequenced Reads: 547812856\n", + "\n", + "Duplication and Complexity (% Sequenced Reads)\n", + "\n", + "Analysis of Unique Reads (% Sequenced Reads / % Unique Reads)\n", + "\n", + "Intra-fragment Reads: 34,307,600\n", + "\n", + "Below MAPQ Threshold: 355,353,763 (64.87% / 73.27%)\n", + "\n", + "Hi-C Contacts: 95,311,495 (17.40% / 19.65%)\n", + "3' Bias (Long Range): 97% - 3%\n", + "\n", + "Pair Type % (L-I-O-R): 25% - 25% - 25% - 25%\n", + "\n", + "Analysis of Hi-C Contacts (% Sequenced Reads / % Unique Reads)\n", + "\n", + "Inter-chromosomal: 22,195,088 (4.05% / 4.58%)\n", + "Intra-chromosomal: 73,116,407 (13.35% / 15.08%)\n", + "Long Range (>20Kb): 35,425,148 (6.47% / 7.30%)\n", + "\n", + "Solving environment: ...working... INFO conda.cc\n", + "INFO conda.conda_libmamba_solver.solver:_solve_é\n", + "{\n", + "\n", + "\"INSTALL\": [\n", + "\n", + "\"hicexplorer\"\n", + "\n", + "]\n", + "}\n", + "info libmamba Parsing MatchSpec hicexplorer\n", + "info libmamba Parsing MatchSpec hicexplorer\n", + "info libmamba Adding job: hicexplorer\n", + "\n", + ".\n", + "\n", + "@ MainWindow Mon Nov 4 21:17\n", + "\n", + "eee [Juicebox 2.17.00] Hi-C Map <9>: inter.hic\n", + "\n", + "File View Bookmarks Assembly Dev\n", + "Chromosomes\n", + "\n", + "All All Be\n", + "\n", + "Show\n", + "\n", + "Normalization (Obs | Ctrl) Color Range\n", + "2 I Tr\n", + "\n", + "3773\n", + "\n", + "Observed None None\n", + "\n", + "I I I I I I It\n", + "2.5MB 500 KB 100KB 25KB 5KB 1KB 200BP\n", + "\n", + "LayerO << oO\n", + "\n", + "Show Annotation Panel J\n", + "\n", + "\n", + "GSM3398051: HiC maize Leaf-HiC rep2; Zea mays; Hi-C\n", + "\n", + "1 ILLUMINA (NextSeq 500) run: 528.9M spots, 80.4G bases, 30.8Gb downloads\n", + "Accession: SRX4727418\n", + "\n", + "GSM3398050: HiC maize Leaf-HiC rep1; Zea mays; Hi-C\n", + "\n", + "1 ILLUMINA (NextSeq 500) run: 89.8M spots, 13.7G bases, 4.5Gb downloads\n", + "Accession: SRX4727417\n", + "\n", + "(mustache_aman) [papantonis1@gwdu1@1 aman]$ awk '$1 == $9 {print $1}' GE02457_dots_5kb.bedpe | sort | unig -c && wc -1 GE02457_dots_5kb.bedpe\n", + "842 chri\n", + "413 chr1e\n", + "465 chri1\n", + "442 chr12\n", + "244 chri3\n", + "254 chri4\n", + "234 chris\n", + "174 chri16\n", + "248 chr17\n", + "196 chri8\n", + "122 chri9\n", + "817 chr2\n", + "196 chr2e\n", + "\n", + "81 chr21\n", + "78 chr22\n", + "731 chr3\n", + "\n", + "594 chr4\n", + "609 chr5\n", + "631 chr6é\n", + "478 chr7\n", + "\n", + "505 chr8&\n", + "349 chr9\n", + "\n", + "184 chrx\n", + "\n", + "8888 GE02457_dots_5kb.bedpe\n", + "\n", + "@ Mainwindow @®@@6OeOr+ezek@ee =) FS Q SS MonDec30 1\n", + "\n", + "[ Rem ) [Juicebox 2.17.00] Hi-C Map <9>: data.allValidPairs.hic\n", + "\n", + "File View Bookmarks Assembly Dev\n", + "Chromosomes\n", + "\n", + "6 @ « Ge\n", + "\n", + "Normalization (Obs | Ctrl) Resolution (BP) Color Range\n", + "\n", + "6:113,950,001-114,000,000\n", + "\n", + "100 MB\n", + "\n", + "merge...

[> oO\n", + "\n", + "10000... <—@\n", + "LayerO <<\n", + "\n", + "Show Annotation Panel\n", + "\n", + "\n", + "% TADS\n", + "\n", + "[papantonis1@gwdu101 mustache_results]$\n", + "BIN1_CHR BIN1_START BIN1_END\n", + "chr 5510000 5515000 chr1 5610000\n", + "chr1 5505000 5510000 chr1 5745000\n", + "chr1 5635000 5640000 chr1 5745000\n", + "chr1 7665000 7670000 chr1 7750000\n", + "chr1 7985000 7990000 chr1 8325000\n", + "chri1 7990000 7995000 chr1 8105000\n", + "chr1 8020000 8025000 chr1 8310000\n", + "chri1 8020000 8025000 chr1 8240000\n", + "chr 8560000 8565000 chr1 8725000\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "papantonis1@gwdu101 mustache_results]$\n", + "12007 anchor_bed_try2/rbp1_anchors_final\n", + "papantonis1@gwdu101 mustache_results]$\n", + "13598 anchor_bed_try2/ctrl_anchors_final\n", + "\n", + "papantonis1@gwdu101 mustache_results]$\n", + "\n", + "> -a anchor_bed_try2/rbp1_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/C!\n", + "> -u > anchor_bed_try2/rbp1_anchors_wi\n", + "\n", + "papantonis1@gwdu101 mustache_results]$\n", + "1153 anchor_bed_try2/rbp1_anchors_with_C\n", + "\n", + "papantonis1@gwdu101 mustache_results]$\n", + "> -a anchor_bed_try2/ctrl_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/C.\n", + "> -u > anchor_bed_try2/ctrl_anchors_wi\n", + "\n", + "1767 anchor_bed_try2/ctrl_anchors_with_C\n", + "papantonis1@gwdu101 mustache_results]$\n", + "> -a anchor_bed_try2/rbp1_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/Cl\n", + "> -w 500@ -u > anchor_bed_try2/rbp1_an\n", + "papantonis1@gwdu101 mustache_results]$\n", + "> -a anchor_bed_try2/ctrl_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/C.\n", + "> -w 50@@ -u > anchor_bed_try2/ctrl_an\n", + "\n", + "1833 anchor_bed_try2/rbp1_anchors_near5k\n", + "\n", + "2689 anchor_bed_try2/ctrl_anchors_near5k\n", + "papantonis1@gwdu101 mustache_results]$\n", + "> -a anchor_bed_try2/rbp1_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/Cl\n", + "> -w 10000 -u > anchor_bed_try2/rbp1_a\n", + "papantonis1@gwdu101 mustache_results]$\n", + "> -a anchor_bed_try2/ctrl_anchors_fina\n", + "> -b ~/aman/microc_data/nadine_macro/C.\n", + "> -w 10000 -u > anchor_bed_try2/ctrl_a\n", + "\n", + "2046 anchor_bed_try2/rbp1_anchors_near1®\n", + "\n", + "3055 anchor_bed_try2/ctrl_anchors_near1@\n", + "\n", + "head rbp1_loops_5k.bedpe\n", + "BIN2_CHROMOSOME BIN2_START BIN2_END\n", + "5615000\n", + "5750000\n", + "5750000\n", + "7755000\n", + "8330000\n", + "8110000\n", + "8315000\n", + "8245000\n", + "8730000\n", + "tail -n +2 rbp1_loops_5k.bedpe | cut -f1-3 > anchor_bed_try2/rbp1_anchor1.bed\n", + "tail -n +2 rbp1_loops_5k.bedpe | cut -f4-6 > anchor_bed_try2/rbp1_anchor2.bed\n", + "cat anchor_bed_try2/rbp1_anchor1.bed anchor_bed_try2/rbp1_anchor2.bed | sort -k1,1 -k2,2n | uniq > anchor_bed_try2/rbp1_anchors_final_tab.bed\n", + "tail -n +2 ctrl_loops_5k.bedpe | cut -f1-3 > anchor_bed_try2/ctrl_anchor1.bed\n", + "tail -n +2 ctrl_loops_5k.bedpe | cut -f4-6 > anchor_bed_try2/ctrl_anchor2.bed\n", + "cat anchor_bed_try2/ctrl_anchor1.bed anchor_bed_try2/ctrl_anchor2.bed | sort -k1,1 -k2,2n | uniq > anchor_bed_try2/ctrl_anchors_final_tab.bed\n", + "we -l anchor_bed_try2/rbp1_anchors_final_tab.bed\n", + "\n", + "_tab.bed\n", + "\n", + "we -l anchor_bed_try2/ctrl_anchors_final_tab.bed\n", + "_tab.bed\n", + "\n", + "bedtools intersect \\\n", + "\n", + "l_tab.bed \\\n", + "PI_CTCF_seacr_top@.@1.peaks.stringent.bed \\\n", + "th_CTCF.bed\n", + "\n", + "we -l anchor_bed_try2/rbp1_anchors_with_CTCF.bed\n", + "TCF. bed\n", + "\n", + "bedtools intersect \\\n", + "\n", + "l_tab.bed \\\n", + "_CTCF_seacr_top@.01.peaks.stringent.bed \\\n", + "th_CTCF.bed\n", + "\n", + "papantonis1@gwdu101 mustache_results]$ we -l anchor_bed_try2/ctrl_anchors_with_CTCF.bed\n", + "\n", + "TCF. bed\n", + "\n", + "bedtools window \\\n", + "\n", + "l_tab.bed \\\n", + "PI_CTCF_seacr_top@.@1.peaks.stringent.bed \\\n", + "chors_near5kb_CTCF.bed\n", + "\n", + "bedtools window \\\n", + "\n", + "l_tab.bed \\\n", + "_CTCF_seacr_top@.01.peaks.stringent.bed \\\n", + "chors_near5kb_CTCF.bed\n", + "\n", + "papantonis1@gwdu101 mustache_results]$ wc -1 anchor_bed_try2/rbp1_anchors_near5kb_CTCF.bed\n", + "\n", + "b_CTCF.bed\n", + "\n", + "papantonis1@gwdu101 mustache_results]$ wc -1 anchor_bed_try2/ctrl_anchors_near5kb_CTCF.bed\n", + "\n", + "b_CTCF.bed\n", + "\n", + "bedtools window \\\n", + "\n", + "l_tab.bed \\\n", + "PI_CTCF_seacr_top@.@1.peaks.stringent.bed \\\n", + "nchors_near1@kb_CTCF.bed\n", + "\n", + "bedtools window \\\n", + "\n", + "l_tab.bed \\\n", + "_CTCF_seacr_top@.01.peaks.stringent.bed \\\n", + "nchors_near1@kb_CTCF.bed\n", + "\n", + "papantonis1@gwdu101 mustache_results]$ wc -1 anchor_bed_try2/rbp1_anchors_near1@kb_CTCF.bed\n", + "\n", + "kb_CTCF.bed\n", + "\n", + "papantonis1@gwdu101 mustache_results]$ wc -1 anchor_bed_try2/ctrl_anchors_neari@kb_CTCF.bed\n", + "\n", + "kb_CTCF.bed\n", + "\n", + "ML classification model\n", + "\n", + "ML regression model\n", + "\n", + "ATAC QC\n", + "\n", + "ATAC peak detection\n", + "\n", + "7 ,\n", + "GitHub\n", + "8\n", + "Nextflow for ML\n", + "\n", + "\n", + "Extracting Hi-C contact matrix from.hic file\n", + "\n", + "The process obtains the hic contact matrix for each chromosome from the.hic file. It will output the\n", + "frequency_matrix file.\n", + "\n", + "Modify the path to the input and output files in the GetBigMatrix_Cells_KRobserved.sh file: The.jar file is the path\n", + "where the juicer tools resides, and run:\n", + "\n", + "bash GetBigMatrix_Cells_KRobserved.sh O\n", + "\n", + "Generating sub-matrix from Hi-C contact matrix\n", + "\n", + "The process cuts the hic contact matrix of each chromosome into multiple submatrices. Modify the path to the\n", + "input and output files in the Getnpymatrix_chr_all_sample.sh file, where the input file is the output file from the\n", + "previous step, DPATH is the root directory of the frequence_matrix file, and run:\n", + "\n", + "bash Getnpymatrix_chr_all_sample.sh oO\n", + "\n", + "i al\n", + "\n", + "Leaf Hi-C K4me3 HiChIP K27me3 HiChIP\n", + "\n", + "eQTL-gene\n", + "links >20 kb |\n", + "\n", + "shuffled pairs\n", + "\n", + "\n", + "A Sequenced ignment and\n", + "Hi-C Reads Chimera Handling\n", + "RI Re\n", + "a —————I\n", + "\n", + "Duplicate\n", + "Merge Sort removal\n", + "\n", + "Map creation\n", + "\n", + "_\n", + "\n", + "\n", + "Insights from the study\n", + "\n", + "> Total identified loops according to the study (long-range\n", + "loops 2 20 kb): 1,177;\n", + "\n", + "> Paper only analyzed chromatin loops 2 20 kb in length in\n", + "the Hi-C dataset\n", + "\n", + "> Resolutions and parameters not mentioned in paper and\n", + "Sl\n", + "\n", + "> Less no. of chromatin loops identified due to limited\n", + "sequencing depth.HenceHiChIP to detect more loops\n", + "\n", + "> H3K4me3-HiChIP dataset: 24,141 loops;\n", + "> H3K27me3-HiChIP dataset: 18,106 loops\n", + "\n", + "> FitHiC2 on resolution 20kb : 89000 loops\n", + "\n", + "PRC1 PRC2\n", + "\n", + "\n", + "What this suggests:\n", + "\n", + ">\n", + "\n", + "High Mapping Percentage: The mapping quality is quite high (99.20%), which is good.\n", + "\n", + "However, the properly paired issue (0%) should be looked into further.\n", + "\n", + "Paired Read Alignment Issues: The @% for properly paired reads suggests that the\n", + "alignment tool or the pairing information may not be correct. This is a crucial issue for Hi-C\n", + "data since proper pairing indicates the correct relationship between paired-end reads. It's\n", + "worth verifying that the correct options are being used in the alignment step and whether the\n", + "\n", + "pairing information is retained properly.\n", + "\n", + "Inter-chromosomal Interactions: Given the significant number of reads with mates mapped\n", + "to different chromosomes, this aligns with your Hi-C analysis, which typically shows inter-\n", + "chromosomal interactions. However, excessive inter-chromosomal mapping could indicate a\n", + "\n", + "problem if the number is unusually high.\n", + "\n", + "A Sequenced Alignment and Duplicate Map creation\n", + "Hi-C Reads Chimera Handling Merge Sort removal\n", + "RI R2 RARA\n", + "i 1 — =~ —————— = .\n", + "SS .\n", + "; == Se = I\n", + "C ‘t 1 ———— a =\"\n", + "ess I | =\n", + ": p 1 }\n", + "+ —v ===. >} 1 7 , 1\n", + "\n", + "[2]:\n", + "\n", + "hicPlotDistVsCounts —-matrix /mnt/storage3/aman/data_mcool.h5 —-outFileName contact_decay.png\n", + "usage: hicPlotDistVsCounts --matrices MATRICES [MATRICES ...] --plotFile file\n", + "\n", + "name [--labels LABELS [LABELS ...]]\n", + "[--skipDiagonal] [--maxdepth INT bp] [--perchr]\n", + "\n", + "[--chromosomeExclude CHROMOSOMEEXCLUDE [CHROMOSOMEEXCLUDE ...]]\n", + "[--outFileData OUTFILEDATA]\n", + "[--plotsize PLOTSIZE PLOTSIZE] [--help] [--version]\n", + "\n", + "hicPlotDistVsCounts: error: the following arguments are required: —-matrices/—m, —-plotFile/-o\n", + "B 2\n", + "\n", + "icv ome v Tracks ¥ mple Info v Session v Share Bookmark Save Image Circular View v Help v\n", + "\n", + "IGV oxford_e...me.fasta tig00000002:1,752,510-1,825,110 Q 72 kb (Select Tracks ) (\"Crosshairs ) (Center Line ) (Track Labels) @ qumm@ +)\n", + "\n", + "C | D)\n", + "\n", + "1,760 kb 1,770 kb 1,780 kb 1,790 kb 1,800 kb 1,810 kb 1,820 kb\n", + "L 1 n L L 1 n\n", + "\n", + "11D 2 ee) ee es 2 ie\n", + "\n", + "IKAOHOFJ_01984yijE metFmetLrpmE cytR hsilU gipF gipX tpiA pfkAcpxR sodA_1 thaBrhaD IKAOHOFJ_02046fdhE_1 dtdcsqR yihTyihR yihQ yihP_1 ompL IKAOHOFJ_02079 glnA gl\n", + "\n", + "priA_2 menA_2 sbp_2 cpxA_1 rhaT_2rhaA_1 ysdC_2 fdoG_3 yinV yihP_2 GFM1\n", + "\n", + "Adapter Content [Zi\n", + "\n", + "‘ecamuteprctoe cout open tr lay wanna aap ce a oat\n", + "\n", + "FastQC: Adapter Content\n", + "\n", + "Status Checks\n", + "Sua ren Fc sc ston wn mite ere erm ey eg ys\n", + "FastQC: Status Checks\n", + "Software Versions\n", + "\n", + "ai 8 a nse\n", + "‘anes Py ar ot es le\n", + "\n", + "Siseqera\n", + "\n", + "Y cut_n_tag/nadine_cut_tag /nadine_cut_tag\n", + "> Aux_CPI_H3K27me3_results\n", + "> C_H3K27me3_results\n", + "> C_H3K27me3_Spi_results\n", + "> CPILH3K27me3_results\n", + "\n", + "\n", + "AIAG SH =m\n", + "\n", + "\n", + "@ Terminal Shell Edit View Window Help\n", + "\n", + "4) FS Q S Sun 29. Jun 12:54\n", + "\n", + "eee ~~ aman — a.nalakath@node08:~ — ssh -L 9006:localhost:9006 a.nalakath@10.152.154.1 — 208x61\n", + "\n", + "Last login: Sun Jun 29 11:01:36 on ttys@ee\n", + "\n", + "aman@Laptop-von-Aman ~ % tum_ngs\n", + "\n", + "DRA A AA AA A A RRA A RA AA RR RRA A RRR RRA A AR AR AR HAA RR A\n", + "* Welcome to PGEN cluster *\n", + "DRA AA A AR A AA A RRA A AA ARR A RA RR A RR RRA AR A RR AA A HRA RR A\n", + "\n", + "Please use this node only to submit your jobs.\n", + "Don't use it for calculations or CPU/RAM intensive tasks!!!\n", + "\n", + "DARA A AAA AA A A AA A RA AR A RR RRA A RRR RR ARR A HRA HAR A HAA RR A\n", + "(a.nalakath@10.152.154.1) Password:\n", + "\n", + "Kickstarted on 2018-12-07\n", + "\n", + "Last login: Wed Jun 25 08:01:17 2025 from 10.157.58.238\n", + "[a.nalakath@frontend ~]$ ssh node@s\n", + "\n", + "Password:\n", + "\n", + "Kickstarted on 2018-12-04\n", + "\n", + "Last login: Wed Jun 25 08:01:32 2025 from 10.152.154.1\n", + "[a.nalakath@node@8 ~]$ tmux ls\n", + "\n", + "@: 1 windows (created Sat Jun 21 08:01:35 2025)\n", + "[a.nalakath@nodees ~1$ ff\n", + "\n", + "\n", + "AN Tene enginevelry, 5 Py weeds\n", + "- eZ Anal biota Beat dp\n", + "ate - Tyce Gear bei Oo\n", + "46, Trtwesip * Feashig UP ES\n", + "\n", + "yor\n", + "\n", + "SONGS [ab 2 S welts wort\n", + "\n", + "Coup peggy\n", + "\n", + "Repars — PDO)\n", + "Brcacvnud,\n", + "Summer school\n", + "\n", + "Reding Dalukinnoyy gprs\n", + "L¥ Pap & Stiles\n", + "Chater — Pronses\n", + "Saf & Stee Duatle fo Ui?\n", + "Anping fe Hos. Haig HG\n", + "\n", + "Mar\n", + "\n", + "elp\n", + "\n", + "@ Vivaldi File Edit View Bookmarks Mail Tools Wine\n", + "\n", + "New merch store now open, including a limited edition metal keycap! monkeytype.store\n", + "\n", + "monkeytype\n", + "\n", + "70\n", + "97%\n", + "\n", + "time 15\n", + "english\n", + "\n", + "19% 15s\n", + "\n", + "77 88/1/8/8\n", + "\n", + "jectives a’ hivinter | Etherpad\n", + "\n", + "0@0B6\n", + "\n", + "[1]:\n", + "\n", + "import h5py\n", + "\n", + "# Open the HDF5 file\n", + "\n", + "with h5py.File('cool_pileup_combined', 'r') as f:\n", + "# Inspect the structure\n", + "print(\"Keys:\", list(f.keys()))\n", + "\n", + "# Check the 'data' dataset\n", + "\n", + "data = f['data'][:]\n", + "\n", + "print(f\"'data' dataset shape: {data.shape}\")\n", + "print(f\"'data' dataset contents:\\n{data}\")\n", + "\n", + "Keys: ['annotation', ‘attrs', ‘data']\n", + "‘data' dataset shape: (16488, 3)\n", + "‘data' dataset contents:\n", + "[[1.1873085 1.2874519 1.4797186]\n", + "[1. 7349982 2.228282 3.1729212]\n", + "[1.5040904 1.3009566 1.1008095]\n", + "[1.9000989 2.8981235 1.9658103]\n", + "[2.9235291 4.7604017 2.8729181]\n", + "[1.9822323 2.930699 1.9129672]]\n", + "\n", + "\n", + "0@°8@\n", + "W PICO 5.09\n", + "\n", + "Docus_tag\n", + "KBOCNLJJ_00001\n", + "KBOCNLIJJ_00002\n", + "KBOCNLIJJ_00003\n", + "KBOCNLJJ_00004\n", + "KBOCNLJJ_00005\n", + "KBOCNLIIJ_00006\n", + "KBOCNLJJ_00007\n", + "KBOCNLJJ_00008\n", + "KBOCNLJJ_00009\n", + "KBOCNLJJ_00010\n", + "CRISPR\n", + "KBOCNLJJ_00011\n", + "KBOCNLJJ_00012\n", + "KBOCNLIJJ_00013\n", + "KBOCNLIJJ_00014\n", + "KBOCNLIJJ_00015\n", + "KBOCNLIJJ_00016\n", + "KBOCNLIJJ_00017\n", + "KBOCNLJJ_00018\n", + "KBOCNLIJJ_00019\n", + "KBOCNLJJ_00020\n", + "KBOCNLIJJ_00021\n", + "KBOCNLIJ_00022\n", + "KBOCNLIJJ_00023\n", + "KBOCNLIJJ_00024\n", + "KBOCNLIJJ_00025\n", + "KBOCNLIJJ_00026\n", + "KBOCNLJJ_00027\n", + "KBOCNLJJ_00028\n", + "KBOCNLIJJ_00029\n", + "KBOCNLIJJ_00030\n", + "KBOCNLIJJ_00031\n", + "KBOCNLIJJ_00032\n", + "KBOCNLIJJ_00033\n", + "KBOCNLIJIJ_00034\n", + "KBOCNLIJJ_00035\n", + "KBOCNLIJIJ_00036\n", + "KBOCNLIJJ_00037\n", + "KBOCNLJJ_00038\n", + "KBOCNLIJ_00039\n", + "KBOCNLIJJ_00040\n", + "KBOCNLIJJ_00041\n", + "KBOCNLIJ_00042\n", + "KBOCNLJJ_00043\n", + "KBOCNLIJ_00044\n", + "KBOCNLJJ_00045\n", + "KBOCNLIIJ_00046\n", + "KBOCNLIJJ_00047\n", + "KBOCNLIJJ_00048\n", + "KBOCNLIJJ_00049\n", + "KBOCNLJJ_00050\n", + "KBOCNLIJJ_00051\n", + "KBOCNLIIJ_00052\n", + "KBOCNLIJJ_00053\n", + "KBOCNLIJJ_00054\n", + "KBOCNLJJ_00055\n", + "KBOCNLIIJ_00056\n", + "\n", + "Wie) Get Help\n", + "Wed Exit\n", + "\n", + "ftype\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "763\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "CDS\n", + "cDS\n", + "cDS\n", + "CDS\n", + "CDS\n", + "\n", + "length_bp\n", + "\n", + "1545 cysI_1\n", + "735 cysH_1\n", + "2667 ygcB_1\n", + "1509 casA_1\n", + "483 casB_1\n", + "1092 casC_1\n", + "675 casD_1\n", + "600 casE_1\n", + "918 ygbT_1\n", + "285 ygbF_1\n", + "1038\n", + "\n", + "909 cysD_1\n", + "1428 cysN\n", + "606 cysC\n", + "324 ygbE\n", + "312 ftsB\n", + "711 ispD\n", + "480 ispF\n", + "1050 truD\n", + "762 surE\n", + "627 pem\n", + "1140 nlpD_1\n", + "993 rpoS\n", + "1365 ygbN\n", + "777 otni\n", + "639 otnc\n", + "372 otnK_1\n", + "834 otnK_2\n", + "909 1tnD\n", + "768 glcR\n", + "657 pphB\n", + "2562 mutS\n", + "135\n", + "\n", + "354\n", + "\n", + "2079 fhlA\n", + "1011 hypE\n", + "1122 hypD\n", + "273 hypC\n", + "873 hypB\n", + "351 hypA\n", + "462 hycA\n", + "612 hyfA_1\n", + "1827 ndhB_1\n", + "924 hycD\n", + "1710 hycE\n", + "543 ndhI_1\n", + "768 hycG_1\n", + "411\n", + "\n", + "471 hycI\n", + "1425 bglH_1\n", + "1458 bglF_1\n", + "1014 ascG\n", + "528 hyfA_2\n", + "2253 hypF\n", + "1134 norw\n", + "1440 norv\n", + "\n", + "n\n", + "\n", + "8.1.2 COG@155\n", + "8.4.8 COG@175\n", + "tbo=\n", + "\n", + "ge\n", + "1.\n", + "1.\n", + "3. -- C0G1203\n", + "\n", + "3.1.-.-\n", + "- c0G1518\n", + "\n", + "2.7.7.4 COG0175\n", + "2.7.7.4 CO0G2895\n", + "2.7.1.2\n", + "\n", + "c0G2919\n", + "\n", + "7\n", + "COGQ496\n", + "\n", + "C0G0739\n", + "COG@568\n", + "C0G2610\n", + "\n", + "C0G1349\n", + "3.1.3.16\n", + "C0GE249\n", + "\n", + "CO0G3604\n", + "4.2.1.- C0G0@309\n", + "COGe409\n", + "C0G0298\n", + "C0G378\n", + "C0G@375\n", + "\n", + "1.-.-.- C0G1142\n", + "7.1.1.-\n", + "\n", + "COGe65e\n", + "\n", + "C0G3261\n", + "7.1.1.-\n", + "\n", + "C0G3260\n", + "3.4.23.51\n", + "3.2.1.86\n", + "\n", + "C0G1263\n", + "CO0G1609\n", + "1.-.-.- C0G1142\n", + "6.2.-.- C0G0068\n", + "\n", + "1.18.1.-\n", + "COGQ426\n", + "\n", + "We) WriteOut\n", + "We) Justify\n", + "\n", + "EC_number CoG\n", + "\n", + "aman — nano ./Downloads/assignment/Ecoli_hifi/Ecoli_hifi_genome.tsv — 208x63\n", + "\n", + "/Downloads/as\n", + "\n", + "product\n", + "\n", + "Sulfite reductase [NADPH] hemoprotein beta-component\n", + "Phosphoadenosine phosphosulfate reductase\n", + "CRISPR-associated endonuclease/helicase Cas3\n", + "CRISPR system Cascade subunit CasA\n", + "\n", + "CRISPR system Cascade subunit CasB\n", + "\n", + "CRISPR system Cascade subunit CasC\n", + "\n", + "CRISPR system Cascade subunit CasD\n", + "\n", + "CRISPR system Cascade subunit CasE\n", + "CRISPR-associated endonuclease Cas1\n", + "CRISPR-associated endoribonuclease Cas2\n", + "\n", + "hypothetical protein\n", + "Sulfate adenylyltransferase subunit 2\n", + "Sulfate adenylyltransferase subunit 1\n", + "C0G@529 Adenylyl-sulfate kinase\n", + "Inner membrane protein YgbE\n", + "Cell division protein FtsB\n", + "C0G1211 2-C-methyl-D-erythritol 4-phosphate cytidylyltransferase\n", + "C0G@245 2-C-methyl-D-erythritol 2,4-cyclodiphosphate synthase\n", + "COG@585 tRNA pseudouridine synthase D\n", + "5'/3'-nucleotidase SurE\n", + "C0G2518 Protein-L-isoaspartate O-methyltransferase\n", + "Murein hydrolase activator NlpD\n", + "RNA polymerase sigma factor RpoS\n", + "Inner membrane permease YgbN\n", + "C0G3622 2-oxo-tetronate isomerase\n", + "\n", + "3-oxo-tetronate 4-phosphate decarboxylase\n", + "C0G3395 3-oxo-tetronate kinase\n", + "C0G3395 3-oxo-tetronate kinase\n", + "\n", + "L-threonate dehydrogenase\n", + "HTH-type transcriptional repressor GlcR\n", + "C0G@639 Serine/threonine-protein phosphatase 2\n", + "DNA mismatch repair protein MutS\n", + "hypothetical protein\n", + "hypothetical protein\n", + "Formate hydrogenlyase transcriptional activator FhlA\n", + "Carbamoyl dehydratase HypE\n", + "Hydrogenase maturation factor HypD\n", + "Hydrogenase maturation factor HypC\n", + "Hydrogenase maturation factor HypB\n", + "Hydrogenase maturation factor HypA\n", + "Formate hydrogenlyase regulatory protein HycA\n", + "Hydrogenase-4 component A\n", + "NAD(P)H-quinone oxidoreductase subunit 2, chloroplastic\n", + "Formate hydrogenlyase subunit 4\n", + "Formate hydrogenlyase subunit 5\n", + "NAD(P)H-quinone oxidoreductase subunit I, chloroplastic\n", + "Formate hydrogenlyase subunit 7\n", + "hypothetical protein\n", + "COG@68@ Hydrogenase 3 maturation protease\n", + "C0G2723 Aryl-phospho-beta—D-glucosidase BglH\n", + "PTS system beta-glucoside-specific EIIBCA component\n", + "HTH-type transcriptional regulator AscG\n", + "Hydrogenase-4 component A\n", + "Carbamoyltransferase HypF\n", + "C0G1251 Nitric oxide reductase F1Rd-NAD(+) reductase\n", + "Anaerobic nitric oxide reductase flavorubredoxin\n", + "\n", + "Ws) Read File Way Prev Pg\n", + "Wi] Where is WAY Next Pg\n", + "\n", + "AKI\n", + "AU\n", + "\n", + "nment/Ecoli_hifi/Ecoli_hifi_genome.tsv\n", + "\n", + "Cut Text\n", + "UnCut Text\n", + "\n", + "me Cur Pos\n", + "Way To Spell\n", + "\n", + "OMB\n", + "\n", + "100 MB\n", + "\n", + "200 MB\n", + "\n", + "Chromosomes Show Normalization (Obs | Ctrl) Resolution (BP)\n", + "“aw “aw a a a — y,\n", + "2 Bp Observed Bala... None © Pivrb ttre teins\n", + "2.5MB 500KB 100KB 25KB 5KB 1KB 200BP\n", + "OMB 100 MB 200 MB 300 MB\n", + "\n", + "\n", + "Fragment\n", + "\n", + "= ————\n", + "\n", + "=. —,\n", + "> sequencing *——— a\n", + "\n", + "\n", + "@ Mainwindow\n", + "\n", + "Omeoerork oe oO =\n", + "\n", + "[Juicebox 2.17.00] Hi-C Map <9>: inter.hic\n", + "\n", + "View Bookmarks\n", + "Chromosomes\n", + "\n", + "Assembly Dev\n", + "\n", + "Normalization (Obs | Ctrl) Resolution (BP) Color Range\n", + "\n", + "F Q SBS MonNov4 20:49\n", + "\n", + "2.5MB 500KB 100KB 25KB 5KB\n", + "\n", + "156,000 KB 155,000 KB 154,000 KB 153,000 KB 152,000 KB 151,000 KB\n", + "\n", + "157,000 KB\n", + "\n", + "1:159,230,001-159,240,000\n", + "1:153,390,001-153,400,000\n", + "observed value (O) = 0.0\n", + "lexpected value (E) = 0.032\n", + "O/E =0\n", + "\n", + "LayerO <> | & |\n", + "\n", + "Show Annotation Panel\n", + "\n", + "\n", + "(mustache_aman) [papantonis1@gwdu101 aman]$ awk '$1 == $4 {print $1}' GEO2457_5kb_mustache_loops.bedpe | sort | unig -c && wc -1 GE02457_5kb_mustache_loops.bedpe\n", + "88@ chri\n", + "457 chr1e\n", + "536 chri1\n", + "542 chri2\n", + "297 chr13\n", + "306 chri4\n", + "278 chri5\n", + "173 chr16\n", + "253 chr17\n", + "244 chri8\n", + "\n", + "92 chri9\n", + "942 chr2\n", + "216 chr2e\n", + "\n", + "88 chr21\n", + "\n", + "65 chr22\n", + "804 chr3\n", + "686 chr4\n", + "663 chrd\n", + "731 chré\n", + "552 chr7\n", + "574 chr8&\n", + "402 chr9\n", + "205 chrx\n", + "\n", + "9987 GEO2457_5kb_mustache_loops.bedpe\n", + "\n", + "(mustache_aman) [papantonis1@gwdu1@1 aman]$ awk '$1 == $4 {print $1}' GE02459_5kb_mustache_loops.bedpe | sort | unig -c && wc -1 GE02459_5kb_mustache_loops.bedpe\n", + "673 chri\n", + "341 chr1e\n", + "394 chri1\n", + "433 chri2\n", + "233 chr13\n", + "254 chri4\n", + "207 chri5\n", + "108 chr16\n", + "147 chr1i7\n", + "234 chri8\n", + "\n", + "29 chri9\n", + "626 chr2\n", + "173 chr2e\n", + "\n", + "83 chr21\n", + "\n", + "30 chr22\n", + "60@ chr3\n", + "534 chr4\n", + "484 chrd5\n", + "536 chré\n", + "425 chr7\n", + "481 chr8\n", + "286 chr9\n", + "158 chrx\n", + "\n", + "7478 GEO2459_5kb_mustache_loops.bedpe\n", + "\n", + "Contact Matrices:\n", + "\n", + "Fig x: Visualization in Juicebox for two HiC datasets\n", + "\n", + "The 10*10 chromosomes full contact matrix was visualized in Juicebox GUI app by importing files\n", + "locally. The left panel shows the matrix from the cis-regulatory elements in Maize study and the one\n", + "on the right is from (7).The right panel is chromosome one at resolution 500 kb. The 10*10\n", + "chromosomes full contact matrix was visualized in Juicebox GUI app by importing files locally. The\n", + "10*10 chromosomes full contact matrix was visualized in Juicebox GUI app by importing files locally.\n", + "\n", + "Ice ot\n", + "‘earn ere ta rao pen 2 prema ne oe [eeremsne [seen]\n", + "\n", + "FastQC: Per Sequence GC Content\n", + "Pea Samp\n", + "\n", + "Per Base N Content [aim\n", + "\n", + "‘epocenapecttancastcan poten ren an asa\n", + "\n", + "FastQC: Per Base N Content\n", + "\n", + "‘Sequence Length Distribution [a\n", + "\n", + "Mimosa equa ci ng)\n", + "\n", + "‘Sequence Duplication Levels SE (ome)\n", + "‘eae ge yer\n", + "[eeewwres [cere]\n", + "\n", + "FastQC: Sequence Duplication Levels,\n", + "\n", + "Overrepresented sequences by sample SKIN\n", + "\n", + "‘Pett arr ctonnpeericsminceanh eaten.\n", + "\n", + "Top overrepresented sequences\n", + "\n", + "‘ie onmmteeseince sr ssarde The soe 2 trent ser cern aye noosa yr\n", + "\n", + "‘Adapter Content [ZI [ome]\n", + "\n", + "‘Peamusiepenep cathe sana yay te asa en aspen enon\n", + "[eeremsne [seen]\n", + "\n", + "FastQC: Adapter Content\n", + "\n", + "\n", + "© aman — nano ./Downloads/assignment/Ecoli_|\n", + "\n", + "fi/Ecoli_hifi_genome.gff — 208x63\n", + "\n", + "nment/Ecoli_hifi/Ecoli_hifi_genome.gff\n", + "\n", + "ile: ./Downloads/as\n", + "\n", + "Ww PICO 5.09\n", + "\n", + "i#gff-version 3\n", + "\n", + "##sequence-region tig@0000001 1\n", + "\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tigeeeee0e1\n", + "\n", + "Wie) Get Help\n", + "Wed Exit\n", + "\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "minced:@.2.0\n", + "\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "\n", + "465\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "CRI\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "\n", + "7533\n", + "\n", + "99\n", + "1718\n", + "2811\n", + "5892\n", + "7393\n", + "7888\n", + "8982\n", + "9643\n", + "10258\n", + "11177\n", + "11567\n", + "12412\n", + "13701\n", + "14611\n", + "16038\n", + "16693\n", + "17210\n", + "17540\n", + "18250\n", + "18726\n", + "19756\n", + "20511\n", + "21277\n", + "22479\n", + "23565\n", + "25018\n", + "25799\n", + "26529\n", + "26863\n", + "27693\n", + "28797\n", + "29615\n", + "30377\n", + "33014\n", + "33225\n", + "33615\n", + "35767\n", + "36774\n", + "37895\n", + "38158\n", + "39034\n", + "39596\n", + "40182\n", + "40790\n", + "42619\n", + "43560\n", + "45279\n", + "45821\n", + "46585\n", + "46988\n", + "47617\n", + "49050\n", + "50764\n", + "51926\n", + "52606\n", + "54986\n", + "\n", + "SPR\n", + "\n", + "We) WriteOut\n", + "We) Justify\n", + "\n", + "1643\n", + "\n", + "2452\n", + "\n", + "5477\n", + "\n", + "7400\n", + "\n", + "7875\n", + "\n", + "8979\n", + "\n", + "9656\n", + "\n", + "10242\n", + "11175\n", + "11461\n", + "12329\n", + "13449\n", + "14609\n", + "16038\n", + "16643\n", + "17016\n", + "17521\n", + "18250\n", + "18729\n", + "19775\n", + "20517\n", + "21137\n", + "22416\n", + "23471\n", + "24929\n", + "25794\n", + "26437\n", + "26900\n", + "27696\n", + "28601\n", + "29564\n", + "30271\n", + "32938\n", + "33148\n", + "33578\n", + "35693\n", + "36777\n", + "37895\n", + "38167\n", + "39030\n", + "39384\n", + "40057\n", + "40793\n", + "42616\n", + "43542\n", + "45269\n", + "45821\n", + "46588\n", + "46995\n", + "47458\n", + "49041\n", + "50507\n", + "51777\n", + "52453\n", + "54858\n", + "56119\n", + "\n", + "tet et eteetse\n", + "\n", + "tet et etetetsei\n", + "\n", + "i\n", + "\n", + "tet etetesti\n", + "\n", + "++H1\n", + "\n", + "F\n", + "\n", + "SPBV2VVDVVVOVVO\n", + "\n", + "PBYWDVDDWDD WDD VDD DD VDD VV VDD VDDD DVD VDDVDDVDDVDVDVDVDVDVDVDVDVDVVVVVVOVOQ:\n", + "\n", + "Wii Read File\n", + "Wil) Where is\n", + "\n", + "ID=KBOCNLJJ_00001; eC_number=1.8.1.2;Name=cysI_1;db_xref=COG:C0G0155; gene=cysI_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00002; eC_number=1.8.4.8;Name=cysH_1;db_xref=COG:C0G0175; gene=cysH_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00003; eC_number=3.1.-.-—;Name=ygcB_1;db_xref=COG:C0G1203; gene=ygcB_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00004;Name=casA_1;gene=casA_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4690$\n", + "ID=KBOCNLJJ_@0005 ; Name=casB_1;gene=casB_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P7663$\n", + "ID=KBOCNLJJ_00006;Name=casC_1;gene=casC_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4689$\n", + "ID=KBOCNLJJ_@0007 ; Name=casD_1;gene=casD_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4689$\n", + "ID=KBOCNLJJ_00008; eC_number=3.1. j;Name=casE_1;gene=casE_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequen$\n", + "ID=KBOCNLIJJ_00009; eC_number=3.1. j;Name=ygbT_1;db_xref=COG:C0G1518; gene=ygbT_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00010; eC_number=3.1.-.-—;Name=ygbF_1;gene=ygbF_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequen$\n", + "note=CRISPR with 13 repeat units;rpt_family=CRISPR;rpt_type=direct\n", + "\n", + "ID=KBOCNLJJ_00011;inference=ab initio prediction: Prodigal : 002006; locus_tag=KBOCNLJJ_00011;product=hypothetical protein\n", + "ID=KBOCNLJJ_00012; eC_number=2.7.7.4;Name=cysD_1;db_xref=COG:C0G0175; gene=cysD_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00013; eC_number=2.7.7.4;Name=cysN; db_xref=COG:C0G2895; gene=cysN;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00014; eC_number=2.7.1.25;Name=cysC; db_xref=COG:C0G@529; gene=cysC;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00015 ; Name=ygbE; gene=ygbE;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P46141;1lo$\n", + "ID=KBOCNLJJ_00016;Name=ftsB; db_xref=COG:C0G2919; gene=ftsB;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00017; eC_number=2.7.7.60;Name=ispD; db_xref=COG:C0G1211; gene=ispD;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00018; eC_number=4.6.1.12;Name=ispF; db_xref=COG:C0G@245; gene=ispF;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00019; eC_number=5.4.99.27;Name=truD; db_xref=COG:C0G0585; gene=truD; inference=ab initio prediction:Prodigal:002006,si$\n", + "ID=KBOCNLJJ_00020; eC_number=3.1.3.5;Name=surE; db_xref=COG:C0G0496; gene=surE;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00021; eC_number=2.1.1.77;Name=pcm; db_xref=COG:C0G2518; gene=pcm; inference=ab initio prediction:Prodigal: 002006, simil$\n", + "ID=KBOCNLJJ_00022;Name=n1pD_1; db_xref=COG:C0G@739; gene=nlpD_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00023;Name=rpoS; db_xref=COG:C0G0568; gene=rpoS;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00024;Name=ygbN; db_xref=COG:C0G2610;gene=ygbN; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00025; eC_number=5.3.1.35;Name=otnI; db_xref=COG:C0G3622; gene=otnI;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00026; eC_number=4.1.1.104;Name=otnC;gene=otnC;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_0@0027; eC_number=2.7.1.217;Name=otnK_1;db_xref=COG:C0G3395; gene=otnK_1;inference=ab initio prediction:Prodigal:00200$\n", + "ID=KBOCNLJJ_00028; eC_number=2.7.1.217;Name=otnK_2;db_xref=COG:C0G3395; gene=otnK_2;inference=ab initio prediction:Prodigal:00200$\n", + "ID=KBOCNLJJ_00029; eC_number=1.1.1.411;Name=1tnD; gene=1tnD;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00030;Name=g1lcR;db_xref=COG:C0G1349; gene=glcR;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00031; eC_number=3.1.3.16;Name=pphB; db_xref=COG:C0G@639; gene=pphB; inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00032;Name=mutS;db_xref=COG:C0G0249; gene=mutS;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00033;inference=ab initio prediction: Prodigal: 002006; locus_tag=KBOCNLJJ_00033;product=hypothetical protein\n", + "ID=KBOCNLJJ_00034;inference=ab initio prediction: Prodigal : 002006; locus_tag=KBOCNLJJ_00034;product=hypothetical protein\n", + "ID=KBOCNLJJ_00035 ; Name=fh1A; db_xref=COG:C0G3604;gene=fhlA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLIJJ_00036; eC_number=4.2.1.—;Name=hypE; db_xref=COG:C0G@309; gene=hypE;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00037 ; Name=hypD; db_xref=COG:C0G0409; gene=hypD; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00038; Name=hypC; db_xref=COG:C0G0298; gene=hypC;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLIJJ_00039 ; Name=hypB; db_xref=COG:C0G0378; gene=hypB; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0040 ; Name=hypA; db_xref=COG:C0G0375; gene=hypA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0041;Name=hycA;gene=hycA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P@AEV4; 1lo$\n", + "ID=KBOCNLJJ_00042; eC_number=1.-. j;Name=hyfA_1; db_xref=COG:C0G1142; gene=hyfA_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00043; eC_number=7.1.1.—;Name=ndhB_1;gene=ndhB_1;inference=ab initio prediction:Prodigal:002006,protein motif :HAMAP:$\n", + "ID=KBOCNLJJ_00044;Name=hycD; db_xref=COG:C0G0650;gene=hycD;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0045 ; Name=hycE; db_xref=COG:C0G3261; gene=hycE;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00046; eC_number=7.1.1.—;Name=ndhI_1;gene=ndhI_1;inference=ab initio prediction:Prodigal:002006,protein motif :HAMAP:$\n", + "ID=KBOCNLJJ_00047 ; Name=hycG_1; db_xref=COG:C0G3260; gene=hycG_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00048;inference=ab initio prediction: Prodigal: 002006; locus_tag=KBOCNLJJ_00048;product=hypothetical protein\n", + "ID=KBOCNLJJ_0@0049; eC_number=3.4.23.51;Name=hycI ;db_xref=COG:C0G0680;gene=hycI;inference=ab initio prediction:Prodigal:002006,si$\n", + "ID=KBOCNLJJ_@0050; eC_number=3.2.1.86;Name=bg1H_1;db_xref=COG:C0G2723; gene=bg1H_1;inference=ab initio prediction:Prodigal:002006$\n", + "ID=KBOCNLJJ_00051; Name=bg1F_1;db_xref=COG:C0G1263; gene=bg1F_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00052;Name=ascG; db_xref=COG:C0G1609; gene=ascG; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00053; eC_number=1.-.-.-—;Name=hyfA_2;db_xref=COG:C0G1142;gene=hyfA_2;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00054; eC_number=6.2.-—.—;Name=hypF; db_xref=COG:C0G@068; gene=hypF;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00055; eC_number=1.18.1.-—;Name=norw; db_xref=COG:C0G1251; gene=norW; inference=ab initio prediction:Prodigal: 002006, sim$\n", + "\n", + "bad Prev Pg Wag Cut Text wie Cur Pos\n", + "WA) Next Pg wig) UnCut Text Way To Spell\n", + "\n", + "\n", + "About Library _ Statistics\n", + "\n", + "Sequencing\n", + "\n", + "Sequenced Reads: 547812856\n", + "\n", + "Duplication and Complexity (% Sequenced Reads)\n", + "\n", + "Analysis of Unique Reads (% Sequenced Reads / % Unique Reads)\n", + "\n", + "Intra-fragment Reads: 34,307,613\n", + "\n", + "Below MAPQ Threshold: 355,354,506 (64.87% / 73.27%)\n", + "\n", + "Hi-C Contacts: 95,311,375 (17.40% / 19.65%)\n", + "3' Bias (Long Range): 97% - 3%\n", + "\n", + "Pair Type % (L-I-O-R): 25% - 25% - 25% - 25%\n", + "\n", + "Analysis of Hi-C Contacts (% Sequenced Reads / % Unique Reads)\n", + "\n", + "Inter-chromosomal: 22,194,956 (4.05% / 4.58%)\n", + "Intra-chromosomal: 73,116,419 (13.35% / 15.08%)\n", + "Long Range (>20Kb): 35,425,178 (6.47% / 7.30%)\n", + "\n", + "RUN wget https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2 && \\\n", + "tar -xvf htslib-1.18.tar.bz2 && \\\n", + "cd htslib-1.18 && \\\n", + "./configure --enable-libcurl && \\\n", + "make -j$(nproc) && \\\n", + "make install && \\\n", + "cd .. && rm -rf htslib-1.18*\n", + "\n", + "# User addition\n", + "\n", + "RUN useradd -m -u 1001 aman && echo 'aman:123' | chpasswd\n", + "RUN usermod —aG sudo aman\n", + "\n", + "RUN usermod -aG rstudio aman\n", + "\n", + "# persistent volumes. Use flag -v\n", + "RUN mkdir -p /home/rstudio/data\n", + "\n", + "RUN chown -R aman:aman /home/rstudio\n", + "VOLUME [\"/home/rstudio/data\"]\n", + "\n", + "[1]\n", + "\n", + "(4)\n", + "\n", + "tv)\n", + "\n", + "(4)\n", + "\n", + "print(hic.getGenomeID())\n", + "print(hic.getResolutions())\n", + "\n", + "hg19\n", + "[2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 1000]\n", + "\n", + "now print out the chromosomes in this file.\n", + "\n", + "for chrom in hic.getChromosomes():\n", + "print(chrom.name, chrom. length)\n", + "\n", + "All 3098789\n", + "249250621\n", + "243199373\n", + "198022430\n", + "191154276\n", + "180915260\n", + "171115067\n", + "159138663\n", + "146364022\n", + "141213431\n", + "10 135534747\n", + "11 135006516\n", + "12 133851895\n", + "13 115169878\n", + "14 107349540\n", + "15 102531392\n", + "16 90354753\n", + "17 81195210\n", + "18 78077248\n", + "19 59128983\n", + "20 63025520\n", + "21 48129895\n", + "22 51304566\n", + "X 155270560\n", + "Y 59373566\n", + "MT 16569\n", + "\n", + "COIYAHAWNE\n", + "\n", + "@ Zed File Edit Selection View Go Window Help\n", + "\n", + "ma BmeOorrtktoewwwn F-\n", + "FastQc\n", + "all_sections multiqc_data.json\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "GLDS-251_rna-seq_13JUN2017H\n", + "report_general_stats_data\n", + "GLDS-251_rna-seq_13JUN2017HiSeq_|\n", + "percent_gc\n", + "avg_sequence_length\n", + "median_sequence_length\n", + "total_sequences\n", + "percent_duplicates\n", + "percent_fails\n", + "GLDS-251_rna-seq_13JUN2017HiSeq_|\n", + "percent_gc\n", + "avg_sequence_length\n", + "median_sequence_length\n", + "total_sequences\n", + "percent_duplicates\n", + "percent_fails\n", + "GLDS-251_rna-seq_13JUN2017HiSeq_|\n", + "percent_gc\n", + "avg_sequence_length\n", + "median_sequence_length\n", + "total_sequences\n", + "\n", + "percent_duplicates\n", + "Filter... z\n", + "\n", + "st & v\n", + "\n", + "Click to restart and update Zed\n", + "\n", + "multiqc_data.json\n", + "\n", + "PELCSNL_LaLLsS 2 7.UFUFTUTUIVUIVIVIZ\n", + "\n", + "Bo\n", + "\n", + "Sign in\n", + "\n", + "+\n", + "\n", + "oO\n", + "\n", + "Q*® I\n", + "\n", + "\"GLDS-251_rna-seq_13JUN2017HiSeq_Run_Sample_235_239_UMISS_Hoeksema_GTTTCG_L0@3_R1_001_1M\": {\n", + "\n", + "\"percent_gc\": 46.0,\n", + "\"avg_sequence_length\": 125.0,\n", + "\"median_sequence_length\": 125,\n", + "\"total_sequences\": 1000000.0,\n", + "\"percent_duplicates\": 23.347216247708587,\n", + "\"percent_fails\": 9.090909090909092\n", + "\n", + "Bo\n", + "\n", + "\"GLDS-251_rna-seq_13JUN2017HiSeq_Run_Sample_120_UMISS_Hoeksema_TGACCA_L001_R1_001_1M\":\n", + "\n", + "\"percent_gc\": 49.0,\n", + "\"avg_sequence_length\": 125.0,\n", + "\"median_sequence_length\": 125,\n", + "\"total_sequences\": 1000000.0,\n", + "\"percent_duplicates\": 52.07411329479328,\n", + "\"percent_fails\": 18.181818181818183\n", + "\n", + "Bo\n", + "\n", + "\"GLDS-251_rna-seq_13JUN2017HiSeq_Run_Sample_175_UMISS_Hoeksema_AGTTCC_L00Q2_R1_001_1M\":\n", + "\n", + "\"percent_gc\": 47.0,\n", + "\"avg_sequence_length\": 125.0,\n", + "\"median_sequence_length\": 125,\n", + "\"total_sequences\": 1000000.0,\n", + "\"percent_duplicates\": 30.77778969527732,\n", + "\"percent_fails\": 9.090909090909092\n", + "\n", + "Bo\n", + "\n", + "\"GLDS-251_rna-seq_13JUN2017HiSeq_Run_Sample_179_UMISS_Hoeksema_CCGTCC_LO0Q3_R1_001_1M\":\n", + "\n", + "\"percent_gc\": 45.0,\n", + "\"avg_sequence_length\": 125.0,\n", + "\"median_sequence_length\": 125,\n", + "\n", + "Nt ata enniianene \". ANNNANAAN A\n", + "\n", + "Updated to Zed 0.163.2\n", + "View the release notes\n", + "\n", + "algal JSON\n", + "\n", + "v\n", + "\n", + "aman@Laptop-von-Aman juicer_hpro % docker build -t juicer_hicpro .\n", + "\n", + "[+] Building 2.3s (16/18)\n", + "\n", + "=> [internal] load build definition from Dockerfile\n", + "\n", + "=> transferring dockerfile: 2.07kB\n", + "\n", + "[internal] load metadata for docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04\n", + "\n", + "[auth] nvidia/cuda:pull token for registry-1.docker.io\n", + "\n", + "[internal] load .dockerignore\n", + "\n", + "=> transferring context: 2B\n", + "\n", + "CANCELED [ 1/13] FROM docker.io/nvidia/cuda:11.7.1-devel—ubuntu22.04@sha256 : 18aade8cf@2eede9d4db5d8a8a73d4505bb2322e91cd54e4c601e5ae100ed691\n", + "=> resolve docker.io/nvidia/cuda:11.7.1-devel-ubuntu22.04@sha256: 18aade8c f02eede9d4db5d8a8a73d4505bb2322e91cd54e4c601e5ae100ed691\n", + "[internal] load build context\n", + "\n", + "=> transferring context: 2B\n", + "\n", + "CACHED [ 3/13] RUN locale-gen en_US.UTF-8\n", + "\n", + "CACHED [ 4/13] RUN wget https://repo.continuum.io/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh -O /tmp/miniconda.sh && bash /tmp/miniconda.sh -b -p /usr/local/anaconda &&\n", + "ERROR [ 5/13] COPY environment.yml /\n", + "\n", + "CACHED [ 6/13] RUN conda env create -f /environment.yml && conda clean -a\n", + "\n", + "CACHED [ 7/13] RUN cd /opt && wget https://github.com/nservant/HiC-Pro/archive/master.zip -O hicpro_latest.zip && unzip hicpro_latest.zip && cd HiC-Pro-master &&\n", + "\n", + "> CACHED [ 8/13] WORKDIR /opt\n", + "\n", + "ERROR [ 9/13] COPY install-dependencies.sh /opt/install-dependencies.sh\n", + "\n", + "> ERROR [10/13] COPY download-and-run-demo.sh /aidenlab/\n", + "\n", + "> ERROR [11/13] COPY download-demo.txt /aidenlab/\n", + "\n", + "v\n", + "\n", + "COPY install-dependencies.sh /opt/install-dependencies.sh:\n", + "\n", + "COPY download-and-run-demo.sh /aidenlab/:\n", + "\n", + "COPY download-demo.txt /aidenlab/:\n", + "\n", + "COPY install-dependencies.sh /opt/install-dependencies.sh\n", + "\n", + "COPY download-and-run-demo.sh /aidenlab/\n", + "\n", + "COPY download-demo.txt /aidenlab/\n", + "\n", + "RUN chmod +x /opt/install-dependencies.sh && /opt/install-dependencies.sh && \\\n", + "chmod +x /aidenlab/download-and-run-demo.sh && \\\n", + "\n", + "ERROR: failed to solve: failed to compute cache key: failed to calculate checksum of ref mh9tt@9a7urz4xt51386tebzw: :xdsz6f9f1g9z1t18j4ipud5@bf: \"/download-demo.txt\": not found\n", + "\n", + "View build details: docker-desktop://dashboard/build/desktop-—linux/desktop—linux/4aiwsé6vrixqnjrre@férxiuzt4\n", + "aman@Laptop-von-Aman juicer_hpro % I\n", + "\n", + "docker:desktop-—linux\n", + "\n", + "CACHED [ 2/13] RUN apt-get update && apt-get install -y build-essential wget unzip bzip2 gcc gt+ openjdk-11-jdk git curl make ca-certificates vim\n", + "\n", + "rm /tmp/minicon\n", + "\n", + "make configure pref\n", + "\n", + "Q.\n", + "-@s\n", + "-1s\n", + "-@s\n", + "-@s\n", + "-@s\n", + "«1s\n", + "-@s\n", + "-1s\n", + "-@s\n", + "-@s\n", + "-@s\n", + "-@s\n", + "-Os\n", + "-@s\n", + "-@s\n", + "-@s\n", + "-Os\n", + "-Os\n", + "-Os\n", + "\n", + "PBVVVWWWVVVVVVVGTGVONO\n", + "\n", + "Qs\n", + "\n", + "Last login: Wed Sep 18 15:46:07 on ttys@0ee\n", + "aman@Laptop-von-Aman ~ % ssh amnala@base.hpc.taltech.ee\n", + "amnala@base.hpc.taltech.ee's password:\n", + "\n", + "Last login: Wed Sep 18 16:49:35 2024 from 193.40.250.119\n", + "\n", + "Welcome to base.hpc.taltech.ee.\n", + "It has been freshly upgraded to Rocky 8!\n", + "\n", + "This is HPC Centre's main batch cluster.\n", + "If you run into any trouble, let us know in Teams 'HPC Support Chat' or write to us: hpcsupport@taltech.ee\n", + "\n", + "User guides: https://hpc.pages.taltech.ee/user-guides\n", + "\n", + "NEW MODULES:\n", + "\n", + "module load rocky8/all\n", + "\n", + "module load rocky8-spack/master\n", + "\n", + "module load openmpi/4.1.1-gcc-10.3.0-r8\n", + "\n", + "URGENT ==\n", + "\n", + ". The module system has changed so your job submission scripts need to be changed\n", + "\n", + "-— amp*, green* and gray* modules have been replaced by rocky8* modules.\n", + "\n", + "-— most of the module names have changed, use module avail to see the available ones\n", + "\n", + "- Infiniband is not available currently, for MPI jobs use the openmpi/4.1.1-gcc-10.3.@-r8-tcp module\n", + "\n", + "2. We are missing some software currently, it will become available in the coming weeks\n", + "\n", + "3. The user-guide will be updated in the coming weeks and the example scripts and modules do not yet reflect the current module structure/naming\n", + "4. user-guides have been moved to https://docs.hpc.taltech.ee\n", + "\n", + "If you run into any trouble, let us know in Teams 'HPC Support Chat' or e-mail us: hpcsupport@taltech.ee\n", + "\n", + "[amnala@base ~]$ ls\n", + "\n", + "fruitsalad.txt fruitsalad_cleaned.txt history_aman.txt\n", + "\n", + "[amnala@base ~]$ cat history_aman.txt\n", + "14 cd smbgroup/bioinf-students/\n", + "\n", + "15 s -ltr\n", + "16 clear\n", + "17 s -ltr\n", + "\n", + "18 echo $HOME\n", + "\n", + "19 cp fruitsalad.txt $HOME\n", + "\n", + "20 cd $HOME\n", + "\n", + "21 s\n", + "\n", + "22 cat fruitsalad.txt\n", + "\n", + "23 uniq fruitsalad.txt\n", + "\n", + "24 cat fruitsalad.txt | sort | uniq -u\n", + "25 s\n", + "26 cat fruitsalad.txt\n", + "\n", + "27 cat fruitsalad.txt | sort | uniq -u > fruitsalad_cleaned.txt\n", + "\n", + "28 s\n", + "29 cat fruitsalad_cleaned.txt\n", + "3@ we -h\n", + "\n", + "31 we --help\n", + "\n", + "32 we -l1 fruitsalad_cleaned.txt\n", + "\n", + "33 cat fruitsalad_cleaned.txt\n", + "\n", + "34 history | less\n", + "\n", + "35 history | tail\n", + "\n", + "36 history\n", + "\n", + "37 history | tail -n +14 > history_aman.txt\n", + "[amnala@base ~]$\n", + "\n", + "\n", + "Genome vv Tracks ¥ Sample Info v Session v Share Bookmark Save Image Circular View v Help v\n", + "\n", + "IGV oxford_e...me.fasta tig00000002:1,989,819-1,993,234 Q 3,416 bp (Select Tracks ) (Crosshairs )(_Center Line )(TrackLabels) @ +)\n", + "1,990 kb j 1,991 kb j 1,992 kb j 1,993 kb\n", + "AQ 0 EA A MY TAY A AY a\n", + "|= SS SS en |\n", + "tnaB tnaA mnmE_1\n", + "\n", + "INSTITUTE\n", + "\n", + "Heng igv.org UCSan Diego fe BROAD\n", + "\n", + "\n", + "@FastQC Report\n", + "\n", + "Summary\n", + "\n", + "Qeasic Statistics\n", + "Ore base sequence quality\n", + "\n", + "Ober sequence quality scores\n", + "\n", + "Ober base sequence content\n", + "Qeer sequence GC content\n", + "Oeer base N content\n", + "\n", + "Q sequence Length Distribution\n", + "Qseauence Duplication Levels\n", + "Qoverrepresented sequences\n", + "Qadapter Content\n", + "\n", + "Qrxmmer Content\n", + "\n", + "Qbasic Statistics\n", + "\n", + "a\n", + "\n", + "Filename\n", + "\n", + "File type\n", + "\n", + "Encoding\n", + "\n", + "Total Sequences\n", + "\n", + "Sequences flagged as poor quality\n", + "Sequence length\n", + "\n", + "%GC\n", + "\n", + "wood_sample_3_forward_paired. fq.gz\n", + "Conventional base calls\n", + "\n", + "Sanger / Illumina 1.9\n", + "\n", + "185642\n", + "\n", + ")\n", + "\n", + "30-150\n", + "\n", + "36\n", + "\n", + "@per base sequence quality\n", + "\n", + "Quality scores across all bases (Sanger / Illumina 1.9 encoding)\n", + "\n", + "40\n", + "\n", + "16\n", + "\n", + "14\n", + "12\n", + "10\n", + "\n", + "oN B&O\n", + "\n", + "12345 67 8 9 1519\n", + "\n", + "30-34 45-49 60-64 75-79 90-94 105-109 120-124 135-139 150\n", + "\n", + "@FastQC Report\n", + "\n", + "Summary\n", + "\n", + "Qbasic Statistics\n", + "\n", + "Ober base sequence quality\n", + "\n", + "Ober sequence quality scores\n", + "Ober base sequence content\n", + "OQer sequence GC content\n", + "Ober base N content\n", + "Osequence Length Distribution\n", + "Osequence Duplication Levels\n", + "Q overrepresented sequences\n", + "Qoaaapter Content\n", + "\n", + "Tue 8 Oct 2024\n", + "\n", + "5_merged_2_paired.fastq\n", + "\n", + "Oper base sequence content\n", + "\n", + "100\n", + "\n", + "90\n", + "\n", + "80\n", + "\n", + "70\n", + "\n", + "60\n", + "\n", + "50\n", + "\n", + "40\n", + "\n", + "30\n", + "\n", + "20\n", + "\n", + "10\n", + "\n", + "123456789\n", + "\n", + "Sequence content across all bases\n", + "\n", + "11 $13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63 65\n", + "Position in read (bp)\n", + "\n", + "%T\n", + "%C\n", + "\n", + "%G\n", + "\n", + "eoo <— > OQ VD G monkeytype.com Ws Search SEARXNG-NALAKATH eave @ @ ~™@®\n", + "ay & a Ab New merch store now open, including a limited edition metal keycap! monkeytype.store x\n", + "\n", + "monkeytype\n", + "\n", + "73\n", + "96%\n", + "\n", + "cautich 76 182/3/1/0 84% 30s\n", + "\n", + "GCO@Qe2® ag Avnud9g HSBTOC BD\n", + "\n", + "english\n", + "&\n", + "S Workspaces v (mi) Monkeytype | A minimalisti + Vv\n", + "0&8 S CI CQ Reset Om 100% 11:12\n", + "\n", + "Labs/Group Leaders that interest\n", + "you (up to 5)\n", + "\n", + "Labs/Group Leaders that interest\n", + "you (up to 5)\n", + "\n", + "Labs/Group Leaders that interest\n", + "you (up to 5)\n", + "\n", + "Labs/Group Leaders that interest\n", + "you (up to 5)\n", + "\n", + "Christa Buecker\n", + "\n", + "Daniel Gerlich\n", + "\n", + "Marco Hein\n", + "\n", + "Yan Ma\n", + "\n", + "@FastQC Report\n", + "\n", + "Summary\n", + "\n", + "Qeasic Statistics\n", + "Ore base sequence quality\n", + "\n", + "Ober sequence quality scores\n", + "\n", + "Ober base sequence content\n", + "Qeer sequence GC content\n", + "Oeer base N content\n", + "\n", + "Q sequence Length Distribution\n", + "Qseauence Duplication Levels\n", + "Qoverrepresented sequences\n", + "Qadapter Content\n", + "\n", + "Okmmer Content\n", + "\n", + "Qbasic Statistics\n", + "\n", + "a\n", + "\n", + "Filename\n", + "\n", + "File type\n", + "\n", + "Encoding\n", + "\n", + "Total Sequences\n", + "\n", + "Sequences flagged as poor quality\n", + "Sequence length\n", + "\n", + "%GC\n", + "\n", + "wood_sample_5_forward_paired. fq.gz\n", + "Conventional base calls\n", + "\n", + "Sanger / Illumina 1.9\n", + "\n", + "179506\n", + "\n", + ")\n", + "\n", + "30-150\n", + "\n", + "37\n", + "\n", + "@per base sequence quality\n", + "\n", + "Quality scores across all bases (Sanger / Illumina 1.9 encoding)\n", + "\n", + "40\n", + "\n", + "16\n", + "\n", + "14\n", + "12\n", + "10\n", + "\n", + "oN B&O\n", + "\n", + "12345 67 8 9 1519\n", + "\n", + "30-34 45-49 60-64 75-79 90-94 105-109 120-124 135-139 150\n", + "\n", + "In [36]:\n", + "\n", + "%%sbash\n", + "\n", + "head /mnt/storage3/aman/wdbasejuicer_new/hiccups_output/postprocessed_pixels_10000.bedpe\n", + "\n", + "#chr1— x1 x2 chr2\n", + "expectedDonut expectedH\n", + "centroid2 radius\n", + "\n", + "# juicer_tools version 2.20.00\n", + "10 6090000 6100000 10\n", + "6.738838 8.369542\n", + "\n", + "6098333 6208333 7454\n", + "\n", + "10 139920000 13993000\n", + "55,255 62.0 8.725843\n", + "455184E-15 9.31793E-40\n", + "\n", + "10 76000000 76010000\n", + "55,255 57.0 9.344456\n", + "203114E-17 2.29482E-25\n", + "\n", + "10 149390000 14940000\n", + "55,255 56.0 5.521386\n", + "702141E-16 2.387457E-16\n", + "\n", + "10 136480000 13649000\n", + "55,255 56.0 5 8624353\n", + "23398E-20 1.2297154E-24\n", + "\n", + "10 148200000 14821000\n", + "55,255 55.0 78222165\n", + "\n", + "19 8.71397@5E-12 2\n", + "\n", + "10 145390000 14540000\n", + "55,255 52.0 9.858375\n", + "\n", + "17 1.6487045E-21 2\n", + "\n", + "10 143300000 14331000\n", + "55,255 48.0 7.270913\n", + "923472E-15 8.827955E-12\n", + "\n", + "yl y2 name score strand1 strand2 color observed expectedBL\n", + "expectedV fdrBL fdrDonut fdrH fdrv numCollapsed centroid1\n", + "6200000 6210000 . . : : @,255,255 69.0 7.9115663\n", + "13.515236 1.45373255E-30 5.202941E-36 3.1267008E-30 1.2960435E-19 3\n", + "0 10 139980000 139990000 : : : . 0,2\n", + "7.795326 15.521655 4.7749968 5.0803407E-25 6.842732E-30 1.4\n", + "3 139925000 139985000 10000\n", + "\n", + "10 76080000 76090000 : : : . 0,2\n", + "8.861963 11.599155 7.0608373 2.5698042E-21 1.734446E-21 4.3\n", + "6 76006666 76076666 14337\n", + "0 10 149450000 149460000 : : : . 0,2\n", + "7.006336 10.389031 11.967166 2.12049@5E-29 6.3991415E-25 1.6\n", + "4 149390000 149450000 7071\n", + "0 10 136880000 136890000 : : : . 0,2\n", + "4.0235314 9.664011 6.9882493 2.1204905E-29 8.194439E-34 2.8\n", + "7 136483571 136879285 16659\n", + "0 10 148260000 148270000 : : : . 0,2\n", + "9.238162 9.26983 14.654494 6.932115E-24 3.9216012E-20 1.4314703E-\n", + "148205000 148260000 5000\n", + "0 10 145440000 145450000 : : : . 0,2\n", + "6.957423 8.590018 6.5711 5.5672264E-14 6.6677316E-22 1.1138844E-\n", + "145395000 145450000 5000\n", + "0 10 143360000 143370000 : : : . 0,2\n", + "55802155 8.395383 12.302593 1.2983397E-18 1.498726E-22 4.4\n", + "2 143310000 143365000 5000\n", + "\n" + ] + } + ], + "source": [ + "screenshots = get_screenshots(\"/Users/aman/Pictures\")\n", + "texts = extract_text(screenshots)\n", + "embeddings = create_and_index(texts)\n", + "results = query_embedding(embeddings, \"hic\")\n", + "for r in results:\n", + " print(r)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a613a361", + "metadata": {}, + "outputs": [], + "source": [ + "# # create embeddings\n", + "# embeddings = Embeddings({\n", + "# \"path\": \"sentence-transformers/all-MiniLM-L6-v2\",\n", + "# \"content\": True,\n", + "# \"graph\": True,\n", + "# \"hybrid\": True, \n", + "# \"scoring\": True\n", + "# })\n", + "\n", + "# # do indexing\n", + "# embeddings.index(txt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f94de70", + "metadata": {}, + "outputs": [], + "source": [ + "# embeddings search\n", + "print(\"%-20s %s\" % (\"Query\", \"Best Match\"))\n", + "print(\"-\" * 50)\n", + "\n", + "for query in [\"genome\"]:\n", + " results = embeddings.search(query, 100)\n", + " for r in results:\n", + " print(r[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10c81e27", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"OPENROUTER_API_KEY\"] = \"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\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9519cf2", + "metadata": {}, + "outputs": [], + "source": [ + "from txtai import LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58bce2ae", + "metadata": {}, + "outputs": [], + "source": [ + "OPENROUTER_API_KEY = os.getenv(\"OPENROUTER_API_KEY\")\n", + "OPENROUTER_BASE_URL = os.getenv(\"OPENROUTER_API_BASE\", \"https://openrouter.ai/api/v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e20bf7e", + "metadata": {}, + "outputs": [], + "source": [ + "messages = \"What is Hi-C and how does it work?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41f0f066", + "metadata": {}, + "outputs": [], + "source": [ + "import litellm\n", + "\n", + "response = litellm.completion(\n", + " model=\"openrouter/minimax/minimax-m2.5:free\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": \"How do population size fluctuations affect effective population size??\"}\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "879c7011", + "metadata": {}, + "outputs": [], + "source": [ + "# Just the answer\n", + "print(\"Answer:\", response.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2f7af13", + "metadata": {}, + "outputs": [], + "source": [ + "# The reasoning/thinking\n", + "print(\"Reasoning:\", response.choices[0].message.reasoning_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e29bc4c", + "metadata": {}, + "outputs": [], + "source": [ + "# Token usage\n", + "print(\"Tokens used:\", response.usage.total_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c5ca3c7", + "metadata": {}, + "outputs": [], + "source": [ + "# do embedding search\n", + "question = \"How do population size fluctuations affect effective population size?\"\n", + "results = embeddings.search(question, 3)\n", + "context = \"\\n\\n\".join([r[\"text\"] for r in results]) # pass to llm\n", + "\n", + "# verify\n", + "print(\"Retrieved from docs\")\n", + "for r in results:\n", + " print(f\"[Score: {r['score']:.3f}] {r['text'][:150]}...\")\n", + " print()\n", + "\n", + "# send with context\n", + "response = litellm.completion(\n", + " model=\"openrouter/minimax/minimax-m2.5:free\",\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"Answer ONLY using the provided context. Cite which parts you're drawing from. If the context doesn't cover something, say 'not in my documents'.\"\n", + " },\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": f\"Context from my documents:\\n{context}\\n\\nQuestion: {question}\"\n", + " }\n", + " ]\n", + ")\n", + "print(\"\\nllm ans\")\n", + "print(response.choices[0].message.content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/03_flow.ipynb b/notebooks/03_flow.ipynb new file mode 100644 index 0000000..e98e655 --- /dev/null +++ b/notebooks/03_flow.ipynb @@ -0,0 +1,2827 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "143717cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/bin/python\n" + ] + } + ], + "source": [ + "import sys\n", + "print(sys.executable)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "508336f4", + "metadata": {}, + "outputs": [], + "source": [ + "from kg_ocr import get_screenshots, extract_text, create_and_index, query_embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "11055f85", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e22406e942764e928a8bf58776e96e45", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading weights: 0%| | 0/103 [00:00 Oy and D > 0\n", + ", as intermediate-frequency alleles become more common. The slide emphasizes the challenge of\n", + "interpreting Tajima's D in bottleneck models due to these variable outcomes and underscores the\n", + "\n", + "need for careful consideration of demographic history in genetic analyses.\n", + "\n", + "@ Vivaldi\n", + "\n", + "at)\n", + "\n", + "v Speed Dial\n", + "\n", + "File Edit View Bookmarks\n", + "\n", + "S\n", + "\n", + "&% Workspaces v\n", + "\n", + "—- > a U8\n", + "\n", + "v Imported From... YY Imported From...\n", + "\n", + "© sic all pats\n", + "\n", + "Sources\n", + "\n", + "+ Add source\n", + "\n", + "Select all sources\n", + "\n", + "]\n", + "\n", + "Formulae.pdf\n", + "\n", + "Lecture O.pdf\n", + "\n", + "Lecture 1.pdf\n", + "\n", + "Lecture 1_1.pdf\n", + "\n", + "Lecture 2.pdf\n", + "\n", + "Lecture 3.pdf\n", + "\n", + "Lecture 4.pdf\n", + "\n", + "Lecture 5.pdf\n", + "\n", + "Lecture 6.pdf\n", + "\n", + "0 @ 0 ®\n", + "\n", + "Mail\n", + "\n", + "@ Cool Guides\n", + "\n", + "google.com\n", + "\n", + "Online Bewerbung QGIS API Docume...\n", + "\n", + "Oo\n", + "\n", + "Tools Window Help\n", + "\n", + "Startpage Search Results Kurs: Evolutionsgenetik de § NotebookLM\n", + "\n", + "om {fl v\n", + "\n", + "qgis - Trying to pe... New Script - Earth...\n", + "\n", + "Chat @ Refresh\n", + "\n", + "explain more about the following as per the sources\n", + "- 1. The change in allele frequency due to selection\n", + "2. The probability of fixation for an allele with\n", + "selection advantage and the time to fixation 3. The\n", + "equilibrium frequencies for both alleles in\n", + "overdominance\n", + "\n", + "Here's more information about allele frequency\n", + "change due to selection, the probability and time to\n", + "fixation of an allele with a selective advantage, and\n", + "equilibrium frequencies in overdominance, based on\n", + "the provided sources:\n", + "1. The change in allele frequency due to selection\n", + "* General principle Natural selection causes\n", + "changes in allele frequencies from one generation\n", + "to the next 1... . Fitter individuals produce more\n", + "viable offspring, influencing future generations 3.\n", + "- Fisher's theorem The fitness of the population is\n", + "\n", + "maximized 4\n", + "\n", + "14 sources\n", + "\n", + "Start typing...\n", + "\n", + "°©@\n", + "\n", + "Describe Wright-Fisher model assumptions abou >\n", + "\n", + "Pastebin.com - #1...\n", + "\n", + "So\n", + "te)\n", + "\n", + "$vertr? 6 8B &@ SB FQ\n", + "\n", + "® EvolG all pdfs - Notebook BAKSMMaTi ela dale most rel is\n", + "\n", + "Q: Search Startpage v @\n", + "TargetP 2.0-DTU... https://www.mood... OnePlus12R revie... Whois “Indian\"in.. vA\n", + "«~ Share 3 Settings CO\n", + "Studio > Note a\n", + "\n", + "Genomic Signatures of Natural Selection\n", + "\n", + "(Saved responses are view only)\n", + "\n", + "a reduction in variation [9].\n", + "2. Detecting Selection: Neutrality Tests\n", + "\n", + "- Neutrality tests compare observed data to theoretical\n", + "expectations under a neutral model (where alleles are not under\n", + "selection) to detect if selection has occurred [10].\n", + "\n", + "* Comparing multiple loci across the genome: Because selection\n", + "acts on few genes, loci under selection should be outliers when\n", + "compared to other parts of the genome [10, 11].\n", + "\n", + "* Tajima's D: This test compares two estimators of the population\n", + "mutation rate, 6 (theta). Tajima's D will be:\n", + "\n", + "* Negative when there is an excess of rare alleles, suggesting a\n", + "recent selective sweep or population expansion [5, 12].\n", + "\n", + "+ Positive when there is an excess of intermediate-frequency\n", + "alleles, suggesting balancing selection, population structure,\n", + "or a recent bottleneck [8, 9, 12].\n", + "\n", + "* Zero when a population is in equilibrium, and evolving\n", + "neutrally [12].\n", + "\n", + "* McDonald-Kreitman (MK) test: Compares the ratio of non-\n", + "synonymous (amino acid changing) to synonymous (silent)\n", + "substitutions within a species and between species. A higher ratio\n", + "between species compared to within species suggests positive\n", + "selection, and a lower ratio suggests purifying selection [13, 14].\n", + "\n", + "* Site Frequency Spectrum (SFS): The distribution of allele\n", + "frequencies in a sample. Selection leaves specific sianatures in the\n", + "\n", + "G) Convert to source\n", + "\n", + "NotebookLM can be inaccurate; please double check its responses.\n", + "\n", + "+\n", + "Q\n", + "\n", + "QC) Co reset —O—— 100 %\n", + "\n", + "c\n", + "\n", + "Mon Feb 10 16:00\n", + "\n", + "aw\n", + "\n", + "v\n", + "\n", + "O © HD\n", + "\n", + "QD\n", + "\n", + "ee\n", + "\n", + "Qu qa\n", + "\n", + "OH O®e< GO 8W OW A\n", + "\n", + "&\n", + "\n", + "16:00\n", + "\n", + "”\n", + "\n", + "@ Vivaldi\n", + "\n", + "at)\n", + "\n", + "v Speed Dial\n", + "\n", + "File Edit View Bookmarks\n", + "\n", + "S\n", + "\n", + "&% Workspaces v\n", + "\n", + "—- > a U8\n", + "\n", + "v Imported From... YY Imported From...\n", + "\n", + "© sic all pats\n", + "\n", + "Sources\n", + "\n", + "+ Add source\n", + "\n", + "Select all sources\n", + "\n", + "]\n", + "\n", + "Formulae.pdf\n", + "\n", + "Lecture O.pdf\n", + "\n", + "Lecture 1.pdf\n", + "\n", + "Lecture 1_1.pdf\n", + "\n", + "Lecture 2.pdf\n", + "\n", + "Lecture 3.pdf\n", + "\n", + "Lecture 4.pdf\n", + "\n", + "Lecture 5.pdf\n", + "\n", + "Lecture 6.pdf\n", + "\n", + "0 @ 0 ®\n", + "\n", + "Mail\n", + "\n", + "@ Cool Guides\n", + "\n", + "google.com\n", + "\n", + "Online Bewerbung QGIS API Docume...\n", + "\n", + "Oo\n", + "\n", + "Tools Window Help\n", + "\n", + "Startpage Search Results Kurs: Evolutionsgenetik de § NotebookLM\n", + "\n", + "om {fl v\n", + "\n", + "qgis - Trying to pe... New Script - Earth...\n", + "\n", + "Chat @ Refresh\n", + "\n", + "explain more about the following as per the sources\n", + "- 1. The change in allele frequency due to selection\n", + "2. The probability of fixation for an allele with\n", + "selection advantage and the time to fixation 3. The\n", + "equilibrium frequencies for both alleles in\n", + "overdominance\n", + "\n", + "Here's more information about allele frequency\n", + "change due to selection, the probability and time to\n", + "fixation of an allele with a selective advantage, and\n", + "equilibrium frequencies in overdominance, based on\n", + "the provided sources:\n", + "1. The change in allele frequency due to selection\n", + "* General principle Natural selection causes\n", + "changes in allele frequencies from one generation\n", + ".. . Fitter individuals produce more\n", + "viable offspring, influencing future generations 3.\n", + "\n", + "to the next 1\n", + "\n", + "- Fisher's theorem The fitness of the population is\n", + "\n", + "maximized 4\n", + "\n", + "14 sources\n", + "\n", + "Start typing...\n", + "\n", + "°©@\n", + "\n", + "Describe Wright-Fisher model assumptions abou >\n", + "\n", + "Pastebin.com - #1...\n", + "\n", + "So\n", + "te)\n", + "\n", + "$vertr? 6 8B &@ SB FQ\n", + "\n", + "® EvolG all pdfs - Notebook BAKSMMaTi ela dale most rel is\n", + "\n", + "Q: Search Startpage v\n", + "\n", + "TargetP 2.0-DTU... https://www.mood... OnePlus 12R revie... Who is “Indian” in ...\n", + "\n", + "«~ Share 3 Settings\n", + "\n", + "Studio > Note a\n", + "\n", + "Genomic Signatures of Natural Selection\n", + "\n", + "(Saved responses are view only)\n", + "\n", + "a reduction in variation [9].\n", + "2. Detecting Selection: Neutrality Tests\n", + "\n", + "- Neutrality tests compare observed data to theoretical\n", + "expectations under a neutral model (where alleles are not under\n", + "selection) to detect if selection has occurred [10].\n", + "\n", + "* Comparing multiple loci across the genome: Because selection\n", + "acts on few genes, loci under selection should be outliers when\n", + "compared to other parts of the genome [10, 11].\n", + "\n", + "* Tajima's D: This test compares two estimators of the population\n", + "mutation rate, 6 (theta). Tajima's D will be:\n", + "\n", + "* Negative when there is an excess of rare alleles, suggesting a\n", + "recent selective sweep or population expansion [5, 12].\n", + "\n", + "+ Positive when there is an excess of intermediate-frequency\n", + "alleles, suggesting balancing selection, population structure,\n", + "or a recent bottleneck [8, 9, 12].\n", + "\n", + "* Zero when a population is in equilibrium, and evolving\n", + "neutrally [12].\n", + "\n", + "* McDonald-Kreitman (MK) test: Compares the ratio of non-\n", + "synonymous (amino acid changing) to synonymous (silent)\n", + "substitutions within a species and between species. A higher ratio\n", + "between species compared to within species suggests positive\n", + "selection, and a lower ratio suggests purifying selection [13, 14].\n", + "\n", + "* Site Frequency Spectrum (SFS): The distribution of allele\n", + "frequencies in a sample. Selection leaves specific sianatures in the\n", + "\n", + "G) Convert to source\n", + "\n", + "NotebookLM can be inaccurate; please double check its responses.\n", + "\n", + "+\n", + "Q\n", + "\n", + "QC) Co reset —O—— 100 %\n", + "\n", + "c\n", + "\n", + "vA\n", + "\n", + "Mon Feb 10 16:00\n", + "\n", + "aw\n", + "\n", + "v\n", + "\n", + "QD\n", + "\n", + "ee\n", + "\n", + "Qu qa\n", + "\n", + "OH O®e< GO 8W OW A\n", + "\n", + "&\n", + "\n", + "16:00\n", + "\n", + "O © HD\n", + "\n", + "”\n", + "\n", + "ioh\n", + "\n", + "R\n", + "\n", + "i\n", + "\n", + "response to selection\n", + "selection intensity\n", + "\n", + "/ genetic variance\n", + "\n", + "' heritability\n", + "\n", + "Intro\n", + "\n", + "¢ Plant Morphogenesis\n", + "¢ Arabidopsis\n", + "\n", + "* Ovule development\n", + "* Kink & Bend\n", + "\n", + "Figure 1: Kink and Bend in Arabidopsis Thaliana\n", + "\n", + "Bottleneck models\n", + "\n", + "(A) (B)\n", + "\n", + "time\n", + "\n", + "population size\n", + "\n", + "Figure 5.2: Two cases in a bottleneck mode. (A) Only one ancestral line survives the\n", + "bottleneck. (B) Two or more lines survive which leads to different patterns in observed\n", + "data.\n", + "\n", + "8, 8, < Oy 8, > Ow\n", + "Tajima‘s D D<0 D>0\n", + "\n", + "It is more difficult for bottleneck modell!!\n", + "\n", + "Why is important to have an accurate demography?\n", + "\n", + "‘ol\n", + "\n", + "TY ey\n", + "\n", + "® o\n", + "position along genome\n", + "\n", + "\n", + "‘The difference between orthologs and paralogs lies in their evolutionary origin and functional\n", + "\n", + "rgence:\n", + "\n", + "1. Orthologs\n", + "\n", + "* Def\n", + "\n", + "1n: Genes that originate from a common ancestor due to a speci\n", + "\n", + "+ Function: Often retain similar functions across different species.\n", + "\n", + "+ Example: The hemoglobin gene in humans and mice is orthologous because both species\n", + "inherited it from a common ancestor.\n", + "\n", + "ints:\n", + "\n", + "Key\n", + "\n", + "V Arise from speciation events\n", + "V Found in different species\n", + "\n", + "V Generally have similar functions\n", + "\n", + "2. Paralogs\n", + "\n", + "* Def\n", + "\n", + "1n: Genes that arise from a gene duplication event within the same species.\n", + "+ Function: May evolve new or specialized functions.\n", + "\n", + "+ Example: Hemoglobin and myoglobin in humans—both originated from a gene duplication event\n", + "but evolved to serve different functions.\n", + "\n", + "Key Points:\n", + "V Arise from gene duplication events\n", + "V Found within the same species (or later diverging species)\n", + "\n", + "V Can have different functions\n", + "\n", + "Summary Table\n", + "Feature Orthologs Paralogs\n", + "Origin Speciation Gene duplication\n", + "Found in Different species ‘Same species (or later divergence)\n", + "Function Often conserved Can diverge significantly\n", + "\n", + "Example Human vs. mouse hemoglobin Human hemoglobin vs. myoglobin\n", + "\n", + "Self-fertilization TM\n", + "\n", + "Parents AA x aa Hetero- Homo-\n", + "J zygosity zygosity\n", + "\n", + "Aa «Aa\n", + "\n", + "— !~ may\n", + "\n", + "F, generation 50\n", + "\n", + "F, generation\n", + "\n", + "F, generation\n", + "\n", + "F, generation\n", + "\n", + "¢,corerion A ss\n", + "\n", + "Prof. Chns-Carolin Schon (TUM) | Plunt Brooding\n", + "\n", + "F, versus DH\n", + "\n", + "\n", + "Outcrossing — Panmixia — Hardy-Weinberg-Law TM\n", + "\n", + "In the absence of\n", + "\n", + "- selection\n", + "\n", + "- migration\n", + "\n", + "~ mutation\n", + "\n", + "we have under panmixia\n", + "\n", + "no change in gene frequencies\n", + "\n", + "EE recone\n", + "\n", + "=P, =p, -...\n", + "p=P+05H id mene\n", + "\n", + "equilibrium genotype\n", + "AA: Aa: aa=p?:2pq:q?\n", + "\n", + "after one generation!\n", + "\n", + "\n", + "Figure 1: Kink and Bend in Arabidopsis Thaliana\n", + "\n", + "\n", + "lf T’ is not significantly smaller than the fluctuation scale, the harmonic mean calculation risks\n", + "smoothing out critical periods of small population size, underestimating the true effect of genetic\n", + "drift on N.. For accurate modeling of genetic processes, T < min|[.N;] ensures that the\n", + "calculation aligns with the biological timescales of population size changes and their genetic\n", + "\n", + "consequences.\n", + "\n", + "Project 4: Phylogenetic Analysis\n", + "\n", + "Phylogenetic analysis is a crucial aspect of evolutionary biology and bioinformatics that\n", + "involves studying the evolutionary relationships among organisms. This project idea offers\n", + "opportunities for both undergraduate (UG) and postgraduate (PG) students to engage in\n", + "phylogenetic analysis, starting with constructing basic phylogenetic trees and progressing\n", + "to more complex methods.\n", + "\n", + "Bioinformatics Project Ideas — Undergraduate Level: Construct a Simple\n", + "Phylogenetic Tree\n", + "\n", + "At the undergraduate level, students can begin by constructing a basic phylogenetic tree\n", + "based on a gene or protein sequence. This project provides a foundational understanding of\n", + "phylogenetics and evolutionary relationships.\n", + "\n", + "Steps for UG Students:\n", + "\n", + "1. Gene or Protein Selection: Choose a gene or protein of interest that is well-\n", + "documented and has sequences available for multiple organisms.\n", + "\n", + "2. Sequence Alignment: Align the sequences of the chosen gene or protein using\n", + "software like ClustalW or MAFFT to identify conserved regions.\n", + "\n", + "3. Phylogenetic Tree Construction: Utilize software such as MEGA or PhyML to construct\n", + "a phylogenetic tree based on the aligned sequences. Apply methods like neighbor-\n", + "joining or maximum parsimony.\n", + "\n", + "4. Tree Visualization: Visualize the phylogenetic tree, highlighting the evolutionary\n", + "relationships among the organisms.\n", + "\n", + "5. Interpretation: Gain insights into the evolutionary history and relatedness of the\n", + "organisms based on the tree’s topology. Consider factors like branching patterns and\n", + "branch lengths.\n", + "\n", + "Postgraduate Level: Complex Phylogenetic Analyses and Co-evolutionary Patterns\n", + "\n", + "Bioinformatics Project Ideas — For postgraduate students, the project can advance to more\n", + "complex phylogenetic analyses, incorporating maximum likelihood methods and exploring\n", + "co-evolutionary patterns among genes or organisms.\n", + "\n", + "Additional Steps for PG Students:\n", + "\n", + "1. Maximum Likelihood Analysis: Learn and apply maximum likelihood methods for\n", + "phylogenetic tree reconstruction, which offer more accurate models of sequence\n", + "evolution. Software packages like RAXML or PhyML can be used.\n", + "\n", + "2. Molecular Clock Analysis: Investigate the concept of molecular clocks to estimate\n", + "divergence times between species. This involves incorporating evolutionary rates into\n", + "phylogenetic analyses.\n", + "\n", + "3. Co-evolutionary Analysis: Explore co-evolutionary patterns between genes, proteins,\n", + "or organisms using tools like Coevol or CAPS. Understand how changes in one\n", + "component correlate with changes in another.\n", + "\n", + "4. Advanced Tree Visualization: Use advanced tree visualization tools to create\n", + "informative and publication-quality figures. Highlight key evolutionary events or\n", + "relationships.\n", + "\n", + "5. Biological Interpretation: Analyze the implications of the phylogenetic findings. How\n", + "do the results contribute to our understanding of evolutionary processes, adaptations, or\n", + "co-evolutionary dynamics?\n", + "\n", + "6. Publication and Presentation: Encourage PG students to disseminate their findings\n", + "through research publications or presentations at scientific conferences, contributing to\n", + "the field of evolutionary biology and phylogenetics.\n", + "\n", + "In summary, phylogenetic analysis projects offer a captivating journey into the study of\n", + "evolutionary relationships among organisms. These projects provide valuable insights into\n", + "the evolutionary history of genes, proteins, and species, and they equip students with\n", + "essential skills in bioinformatics and computational biology. Additionally, complex\n", + "phylogenetic analyses enable postgraduate students to explore cutting-edge methods and\n", + "contribute to our understanding of co-evolutionary dynamics in biology.\n", + "\n", + "Project 5: Drug Discovery and Virtual Screening\n", + "\n", + "Drug discovery is a multidisciplinary field that combines biology, chemistry, and\n", + "computational methods to identify and design potential drug candidates. This project idea\n", + "provides opportunities for both undergraduate (UG) and postgraduate (PG) students to\n", + "explore the exciting world of drug discovery, starting with basic virtual screening\n", + "experiments and progressing to advanced structure-based drug design.\n", + "\n", + "Undergraduate Level: Basic Virtual Screening\n", + "\n", + "At the undergraduate level, students can start by learning about drug databases and\n", + "conducting basic virtual screening experiments to identify potential drug candidates. This\n", + "project offers an introduction to the concepts and tools used in drug discovery.\n", + "\n", + "Steps for UG Students:\n", + "\n", + "1. Drug Database Exploration: Familiarize yourself with drug databases like PubChem or\n", + "DrugBank. Select a target protein of interest, preferably one with known drug-binding\n", + "sites.\n", + "\n", + "2. Ligand Preparation: Retrieve ligand molecules (small compounds) from the database\n", + "that may potentially bind to your target protein. Prepare the ligands by removing any\n", + "irrelevant atoms or functional groups.\n", + "\n", + "3. Protein-Ligand Docking: Utilize software tools like AutoDock or PyRx to perform\n", + "\n", + "2-IV\n", + "\n", + "Figure 1: Kink and Bend in Arabidopsis Thaliana\n", + "\n", + "> Science. 2009 Oct 9;326(5950):289-93. doi: 10.1126/science.1181369.\n", + "\n", + "Comprehensive mapping of long-range interactions\n", + "reveals folding principles of the human genome\n", + "\n", + "Erez Lieberman-Aiden ’, Nynke L van Berkum, Louise Williams, Maxim Imakaev, Tobias Ragoczy,\n", + "Agnes Telling, Ido Amit, Bryan R Lajoie, Peter J Sabo, Michael O Dorschner, Richard Sandstrom,\n", + "Bradley Bernstein, M A Bender, Mark Groudine, Andreas Gnirke, John Stamatoyannopoulos,\n", + "Leonid A Mirny, Eric S Lander, Job Dekker\n", + "\n", + "Affiliations + expand\n", + "PMID: 19815776 PMCID: PMC2858594 DOI: 10.1126/science.1181369\n", + "\n", + "Abstract\n", + "\n", + "We describe Hi-C, a method that probes the three-dimensional architecture of whole genomes by\n", + "coupling proximity-based ligation with massively parallel sequencing. We constructed spatial\n", + "proximity maps of the human genome with Hi-C at a resolution of 1 megabase. These maps confirm\n", + "the presence of chromosome territories and the spatial proximity of small, gene-rich chromosomes.\n", + "We identified an additional level of genome organization that is characterized by the spatial\n", + "segregation of open and closed chromatin to form two genome-wide compartments. At the\n", + "megabase scale, the chromatin conformation is consistent with a fractal globule, a knot-free,\n", + "polymer conformation that enables maximally dense packing while preserving the ability to easily\n", + "fold and unfold any genomic locus. The fractal globule is distinct from the more commonly used\n", + "globular equilibrium model. Our results demonstrate the power of Hi-C to map the dynamic\n", + "conformations of whole genomes.\n", + "\n", + "Figure 4 Genetic separation between\n", + "population pairs. (a) Relative cross\n", + "coalescence rates in and out of Africa.\n", + "African-non-African pairs are shown in red,\n", + "and pairs within Africa are shown in purple.\n", + "(b) Relative cross coalescence rates between\n", + "populations outside Africa. European—East\n", + "Asian pairs are shown in blue, Asian-MXL\n", + "pairs are shown in green, and other\n", + "non-African pairs are shown in other\n", + "\n", + "colors, as indicated. The pairs that include\n", + "MXL are masked to include only the putative\n", + "Native American components. In a and b,\n", + "the most recent population separations\n", + "\n", + "are inferred from eight haplotypes, that is,\n", + "four haplotypes from each population, and\n", + "corresponding pairs are indicated by a\n", + "\n", + "cross. (c) Comparison of the African—non-\n", + "African split with simulations of clean splits.\n", + "We simulated three scenarios, at split times\n", + "50,000, 100,000 and 150,000 years ago.\n", + "The comparison demonstrates that the history\n", + "of relative cross coalescence rate between\n", + "African and non-African ancestors\n", + "\n", + "is incompatible with a clean split model\n", + "\n", + "and suggests it progressively decreased from\n", + "\n", + "Relative cross coalescence rate\n", + "\n", + "Relative cross coalescence rate ©\n", + "\n", + "0.8\n", + "\n", + "0.6\n", + "\n", + "O4\n", + "\n", + "0.2\n", + "\n", + "— MXL-YRI\n", + "— CEU-YRI\n", + "— CHB-YRI\n", + "— CEU-MKK\n", + "— CEU-LWK\n", + "~ YRI-MKKT\n", + "= LWK-MKKt\n", + "= YRI-LWkt\n", + "\n", + "10°\n", + "\n", + "Time (years ago)\n", + "\n", + "100\n", + "\n", + "Time (x1 o years ago)\n", + "\n", + "150\n", + "\n", + "a\n", + "fs 1.0\n", + "2\n", + "2 08 — CHB-CEU\n", + "8 ~ MXL-CEU\n", + "8 0.6 — CHB-MXL\n", + "8 — GIH-MXL\n", + "8 04 = CHB-GIH\n", + "3 — GIH-CEut\n", + "2 02 - CHB-UPT!\n", + "= CEU-TSI\n", + "2 o CEU-TSI\n", + "10°\n", + "200\n", + "® 100\n", + "~ CEU-YRI Fy\n", + "~ 50,000 years ago, %b 50\n", + "simulation XK\n", + "— 100,000 years ago, 3 20\n", + "simulation E\n", + "= 150,000 years ago, 10\n", + "\n", + "200\n", + "\n", + "simulation\n", + "\n", + "250\n", + "\n", + "beyond 150,000 years ago to approximately 50,000 years ago. (d) Schematic of population separations. Timings of splits, population separations,\n", + "gene flow and bottleneck are shown along a logarithmic axis of time.\n", + "\n", + "\n", + "© aman — nano ./Downloads/assignment/Ecoli_|\n", + "\n", + "fi/Ecoli_hifi_genome.gff — 208x63\n", + "\n", + "nment/Ecoli_hifi/Ecoli_hifi_genome.gff\n", + "\n", + "ile: ./Downloads/as\n", + "\n", + "Ww PICO 5.09\n", + "\n", + "i#gff-version 3\n", + "\n", + "##sequence-region tig@0000001 1\n", + "\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tige0eee0e1\n", + "tigeeeee0e1\n", + "\n", + "Wie) Get Help\n", + "Wed Exit\n", + "\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "minced:@.2.0\n", + "\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "Prodigal: 002006\n", + "\n", + "465\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "CRI\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "cDS\n", + "\n", + "7533\n", + "\n", + "99\n", + "1718\n", + "2811\n", + "5892\n", + "7393\n", + "7888\n", + "8982\n", + "9643\n", + "10258\n", + "11177\n", + "11567\n", + "12412\n", + "13701\n", + "14611\n", + "16038\n", + "16693\n", + "17210\n", + "17540\n", + "18250\n", + "18726\n", + "19756\n", + "20511\n", + "21277\n", + "22479\n", + "23565\n", + "25018\n", + "25799\n", + "26529\n", + "26863\n", + "27693\n", + "28797\n", + "29615\n", + "30377\n", + "33014\n", + "33225\n", + "33615\n", + "35767\n", + "36774\n", + "37895\n", + "38158\n", + "39034\n", + "39596\n", + "40182\n", + "40790\n", + "42619\n", + "43560\n", + "45279\n", + "45821\n", + "46585\n", + "46988\n", + "47617\n", + "49050\n", + "50764\n", + "51926\n", + "52606\n", + "54986\n", + "\n", + "SPR\n", + "\n", + "We) WriteOut\n", + "We) Justify\n", + "\n", + "1643\n", + "\n", + "2452\n", + "\n", + "5477\n", + "\n", + "7400\n", + "\n", + "7875\n", + "\n", + "8979\n", + "\n", + "9656\n", + "\n", + "10242\n", + "11175\n", + "11461\n", + "12329\n", + "13449\n", + "14609\n", + "16038\n", + "16643\n", + "17016\n", + "17521\n", + "18250\n", + "18729\n", + "19775\n", + "20517\n", + "21137\n", + "22416\n", + "23471\n", + "24929\n", + "25794\n", + "26437\n", + "26900\n", + "27696\n", + "28601\n", + "29564\n", + "30271\n", + "32938\n", + "33148\n", + "33578\n", + "35693\n", + "36777\n", + "37895\n", + "38167\n", + "39030\n", + "39384\n", + "40057\n", + "40793\n", + "42616\n", + "43542\n", + "45269\n", + "45821\n", + "46588\n", + "46995\n", + "47458\n", + "49041\n", + "50507\n", + "51777\n", + "52453\n", + "54858\n", + "56119\n", + "\n", + "tet et eteetse\n", + "\n", + "tet et etetetsei\n", + "\n", + "i\n", + "\n", + "tet etetesti\n", + "\n", + "++H1\n", + "\n", + "F\n", + "\n", + "SPBV2VVDVVVOVVO\n", + "\n", + "PBYWDVDDWDD WDD VDD DD VDD VV VDD VDDD DVD VDDVDDVDDVDVDVDVDVDVDVDVDVDVVVVVVOVOQ:\n", + "\n", + "Wii Read File\n", + "Wil) Where is\n", + "\n", + "ID=KBOCNLJJ_00001; eC_number=1.8.1.2;Name=cysI_1;db_xref=COG:C0G0155; gene=cysI_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00002; eC_number=1.8.4.8;Name=cysH_1;db_xref=COG:C0G0175; gene=cysH_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00003; eC_number=3.1.-.-—;Name=ygcB_1;db_xref=COG:C0G1203; gene=ygcB_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00004;Name=casA_1;gene=casA_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4690$\n", + "ID=KBOCNLJJ_@0005 ; Name=casB_1;gene=casB_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P7663$\n", + "ID=KBOCNLJJ_00006;Name=casC_1;gene=casC_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4689$\n", + "ID=KBOCNLJJ_@0007 ; Name=casD_1;gene=casD_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q4689$\n", + "ID=KBOCNLJJ_00008; eC_number=3.1. j;Name=casE_1;gene=casE_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequen$\n", + "ID=KBOCNLIJJ_00009; eC_number=3.1. j;Name=ygbT_1;db_xref=COG:C0G1518; gene=ygbT_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00010; eC_number=3.1.-.-—;Name=ygbF_1;gene=ygbF_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequen$\n", + "note=CRISPR with 13 repeat units;rpt_family=CRISPR;rpt_type=direct\n", + "\n", + "ID=KBOCNLJJ_00011;inference=ab initio prediction: Prodigal : 002006; locus_tag=KBOCNLJJ_00011;product=hypothetical protein\n", + "ID=KBOCNLJJ_00012; eC_number=2.7.7.4;Name=cysD_1;db_xref=COG:C0G0175; gene=cysD_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00013; eC_number=2.7.7.4;Name=cysN; db_xref=COG:C0G2895; gene=cysN;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00014; eC_number=2.7.1.25;Name=cysC; db_xref=COG:C0G@529; gene=cysC;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00015 ; Name=ygbE; gene=ygbE;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P46141;1lo$\n", + "ID=KBOCNLJJ_00016;Name=ftsB; db_xref=COG:C0G2919; gene=ftsB;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00017; eC_number=2.7.7.60;Name=ispD; db_xref=COG:C0G1211; gene=ispD;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00018; eC_number=4.6.1.12;Name=ispF; db_xref=COG:C0G@245; gene=ispF;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00019; eC_number=5.4.99.27;Name=truD; db_xref=COG:C0G0585; gene=truD; inference=ab initio prediction:Prodigal:002006,si$\n", + "ID=KBOCNLJJ_00020; eC_number=3.1.3.5;Name=surE; db_xref=COG:C0G0496; gene=surE;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00021; eC_number=2.1.1.77;Name=pcm; db_xref=COG:C0G2518; gene=pcm; inference=ab initio prediction:Prodigal: 002006, simil$\n", + "ID=KBOCNLJJ_00022;Name=n1pD_1; db_xref=COG:C0G@739; gene=nlpD_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00023;Name=rpoS; db_xref=COG:C0G0568; gene=rpoS;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00024;Name=ygbN; db_xref=COG:C0G2610;gene=ygbN; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00025; eC_number=5.3.1.35;Name=otnI; db_xref=COG:C0G3622; gene=otnI;inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00026; eC_number=4.1.1.104;Name=otnC;gene=otnC;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_0@0027; eC_number=2.7.1.217;Name=otnK_1;db_xref=COG:C0G3395; gene=otnK_1;inference=ab initio prediction:Prodigal:00200$\n", + "ID=KBOCNLJJ_00028; eC_number=2.7.1.217;Name=otnK_2;db_xref=COG:C0G3395; gene=otnK_2;inference=ab initio prediction:Prodigal:00200$\n", + "ID=KBOCNLJJ_00029; eC_number=1.1.1.411;Name=1tnD; gene=1tnD;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00030;Name=g1lcR;db_xref=COG:C0G1349; gene=glcR;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00031; eC_number=3.1.3.16;Name=pphB; db_xref=COG:C0G@639; gene=pphB; inference=ab initio prediction:Prodigal: 002006, sim$\n", + "ID=KBOCNLJJ_00032;Name=mutS;db_xref=COG:C0G0249; gene=mutS;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00033;inference=ab initio prediction: Prodigal: 002006; locus_tag=KBOCNLJJ_00033;product=hypothetical protein\n", + "ID=KBOCNLJJ_00034;inference=ab initio prediction: Prodigal : 002006; locus_tag=KBOCNLJJ_00034;product=hypothetical protein\n", + "ID=KBOCNLJJ_00035 ; Name=fh1A; db_xref=COG:C0G3604;gene=fhlA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLIJJ_00036; eC_number=4.2.1.—;Name=hypE; db_xref=COG:C0G@309; gene=hypE;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00037 ; Name=hypD; db_xref=COG:C0G0409; gene=hypD; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00038; Name=hypC; db_xref=COG:C0G0298; gene=hypC;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLIJJ_00039 ; Name=hypB; db_xref=COG:C0G0378; gene=hypB; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0040 ; Name=hypA; db_xref=COG:C0G0375; gene=hypA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0041;Name=hycA;gene=hycA;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P@AEV4; 1lo$\n", + "ID=KBOCNLJJ_00042; eC_number=1.-. j;Name=hyfA_1; db_xref=COG:C0G1142; gene=hyfA_1;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00043; eC_number=7.1.1.—;Name=ndhB_1;gene=ndhB_1;inference=ab initio prediction:Prodigal:002006,protein motif :HAMAP:$\n", + "ID=KBOCNLJJ_00044;Name=hycD; db_xref=COG:C0G0650;gene=hycD;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_@0045 ; Name=hycE; db_xref=COG:C0G3261; gene=hycE;inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00046; eC_number=7.1.1.—;Name=ndhI_1;gene=ndhI_1;inference=ab initio prediction:Prodigal:002006,protein motif :HAMAP:$\n", + "ID=KBOCNLJJ_00047 ; Name=hycG_1; db_xref=COG:C0G3260; gene=hycG_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00048;inference=ab initio prediction: Prodigal: 002006; locus_tag=KBOCNLJJ_00048;product=hypothetical protein\n", + "ID=KBOCNLJJ_0@0049; eC_number=3.4.23.51;Name=hycI ;db_xref=COG:C0G0680;gene=hycI;inference=ab initio prediction:Prodigal:002006,si$\n", + "ID=KBOCNLJJ_@0050; eC_number=3.2.1.86;Name=bg1H_1;db_xref=COG:C0G2723; gene=bg1H_1;inference=ab initio prediction:Prodigal:002006$\n", + "ID=KBOCNLJJ_00051; Name=bg1F_1;db_xref=COG:C0G1263; gene=bg1F_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequ$\n", + "ID=KBOCNLJJ_00052;Name=ascG; db_xref=COG:C0G1609; gene=ascG; inference=ab initio prediction:Prodigal:002006,similar to AA sequence$\n", + "ID=KBOCNLJJ_00053; eC_number=1.-.-.-—;Name=hyfA_2;db_xref=COG:C0G1142;gene=hyfA_2;inference=ab initio prediction:Prodigal:002006,$\n", + "ID=KBOCNLJJ_00054; eC_number=6.2.-—.—;Name=hypF; db_xref=COG:C0G@068; gene=hypF;inference=ab initio prediction:Prodigal: 002006, simi$\n", + "ID=KBOCNLJJ_00055; eC_number=1.18.1.-—;Name=norw; db_xref=COG:C0G1251; gene=norW; inference=ab initio prediction:Prodigal: 002006, sim$\n", + "\n", + "bad Prev Pg Wag Cut Text wie Cur Pos\n", + "WA) Next Pg wig) UnCut Text Way To Spell\n", + "\n", + "\n", + "tion divergence as a function of divergence time\n", + "\n", + "bt (generators\n", + "\n", + "Ot (gene ations\n", + "\n", + "\n", + "© Pupiisn\n", + "\n", + "10-\n", + "\n", + "group\n", + "° 1G\n", + "\n", + "BOUBUBA %LZ 'ZOd\n", + "\n", + "\n", + "Figure 1 MSMC locally infers branch lengths a Recombination\n", + "\n", + "and coalescence times from observed\n", + "\n", + "mutations. (a) Schematic of the model. Total branch length T Time\n", + "Local genealogies change along the sequences (past)\n", + "by recombination events that rejoin branches of First coalescence t\n", + "\n", + "the tree, according to the SMC’ model®®. (hidden state) %\n", + "The pattern of mutations depends on the %\n", + "\n", + "genealogy, with few mutations on branches % a SS\n", + "with recent coalescences and more mutations\n", + "in deeper branches. The hidden states of the\n", + "model are the time to the first coalescence and\n", + "the identity of the two sequences participating\n", + "in the first coalescence. (b) MSMC can locally\n", + "infer its hidden states, shown by the posterior\n", + "probability with color. In black, we plot the\n", + "first coalescence time as generated by the\n", + "simulation. This local inference works well\n", + "\n", + "for two, four and eight haplotypes. As more 300\n", + "haplotypes are used, the typical time to the Position (kb)\n", + "first coalescence event decreases, whereas the 4 haplotypes\n", + "typical segment length increases.\n", + "\n", + "cs\n", + "\n", + "Log\n", + "\n", + "First coalescence fy...\n", + "\n", + "of the sample size (M), = 2/(M(M — 1)), in\n", + "units of 2No generations (Fig. 1b and Online\n", + "Methods), where No is the long-term average\n", + "effective population size. Here we demonstrate\n", + "\n", + "0 200 400 600 800 1,000 1,200 1,400\n", + "application of our model on up to 8 haplotypes, Position (kb)\n", + "which allows us to study changes in popula- 8 haplotypes\n", + "tion size occurring as recently as 70 genera- 0.15\n", + "tions ago. As a special case of MSMC for two\n", + "haplotypes, we provide a new implementation\n", + "of PSMC that we call PSMC’ because it uses\n", + "the SMC’ model, which accounts for recombi-\n", + "nation events between segments with the same\n", + "time to coalescence®. PSMC’ accurately esti- 500 1,000 1,500 2,000 2,500\n", + "mates the recombination rate (Supplementary Position (kb)\n", + "Fig. 1), which is not the case for PSMC’.\n", + "\n", + "First coalescence tj,.\n", + "\n", + "S\n", + "o\n", + "\n", + "0.05\n", + "\n", + "First coalescence tj...\n", + "\n", + "Ayiqeqosd 10N0}s0q\n", + "\n", + "What excites you about doing science?\n", + "\n", + "What excites you about doing science?\n", + "do you have? Please describe a past ex\n", + "your drive for scientific inquiry. {max 3C\n", + "\n", + "COURSEWORK 4 6\n", + "DL: check the Moodle athe\n", + "\n", + "7 >\n", + "\n", + "Towards complete and error-free genome assemblies of\n", + "\n", + "all vertebrate species\n", + "1.Pick one of main themes . t Pp\n", + "\n", + "MORE VIDEOS\n", + "[BREE TA\n", + "\n", + "TECH\n", + "Pm i) 35:57/37:42 © @ & Voulube ++\n", + "\n", + "\n", + "Reference genome $$ eee eC CCC OOOO OCT OO a\n", + "\n", + "= Sa « al\n", + "’ ’\n", + "nn | Pullup (or grastso dactang\n", + "\n", + ">\n", + "-\n", + "\n", + "*\n", + "\n", + "—\n", + "- : =\n", + ". -\n", + "*\n", + "ee\n", + "\n", + "Effects of Population Size on Genetic Diversity Metrics\n", + "\n", + "“To understand how population growth, dactne, and stability influence genetic diversity, we consider\n", + "ther effects on tres kay mats:\n", + "\n", + "+x (Theta n): The average numberof pairwise cifferences between sequences.\n", + "+ 8W (Theta Watterson}: A measure based on the number af segregating sites.\n", + "‘+ Tojima's D: A statistical test that compares 8 and 6W to detect deviations from neutral\n", + "\n", + "evolution,\n", + "\n", + "“These simplified scenarios illustrate how population size changes impact genetic variation.\n", + "\n", + "Scenario 1: Population Growth\n", + "\n", + "Description:\n", + "‘When @ population expands rapidly, many rare alleles appear due to the racent increase in inevicuals.\n", + "\n", + "‘Assumptions (Hypothetical Values):\n", + "\n", + "+ on\n", + "\n", + "(Pairwise citferences are low since most sequences are very similar due tothe\n", + "expansion)\n", + "\n", + "+ 8W-=4 (More segregating sites appear due to expansion)\n", + "+ Tajima’sb Calculation:\n", + "\n", + "O, = Ow 2-4\n", + "” Sandard deviation 1\n", + "\n", + "Since 6r < 8W, Tajma’s Dis negative.\n", + "\n", + "‘conclusion:\n", + "Population growth results in 6x < OW and Tajima’s D <0, indicating an excoss of rare variants\n", + "\n", + "Scenario 2: Population Deciit\n", + "\n", + "Description:\n", + "\n", + "e (Bottleneck)\n", + "\n", + "‘A population experiences a drastic reduction in size, leacing to the loss of rare alleles and an\n", + "‘overrpresentation of comman ones.\n", + "\n", + "‘Assumptions (Hypothetical Values):\n", + "+ on\n", + "\n", + "(Pairwise citferonces are higher because the remaining sequences are more divergent)\n", + "+ eW=4 (Fewer segregating sites due tothe bottleneck)\n", + "+ Tajima’sb Calculation:\n", + "\n", + "0, = Ow. on4\n", + "~ Wandard deviation ~ 1\n", + "\n", + "Since 6x > BW, Tajma’s Dis postive\n", + "\n", + "D =2\n", + "\n", + "‘conclusion:\n", + "\n", + "Population dectne results in @x > BW and Taima's D > 0, suggesting a loss of rae aloes.\n", + "\n", + "Scenario 3: Constant Population Size\n", + "\n", + "Description:\n", + "[A population remains stable ver time, with alle frequencies evolving neutral\n", + "\n", + "‘Assumptions (Hypothetical Values):\n", + "+ on\n", + "\n", + "(Pairwise citferances match the expected diversity level)\n", + "+ @W=5 (Segregating sites align with a stable population)\n", + "+ Tajima’sb Calculation:\n", + "\n", + "0, — Ow\n", + "Randard deviation 1\n", + "\n", + "Since 6x = BW, Tajima’s Dis zor,\n", + "\n", + "D\n", + "\n", + "‘conclusion:\n", + "\n", + "[A stable population results in 8x = 8W and Taima’s D = 0, indicating neutral evelution.\n", + "\n", + "Summary\n", + "\n", + "‘Changes in population size affect genetic variation in distinct ways:\n", + "+ Population Growth > More rave alleles > Negative Tajima’sD.\n", + "+ Population Decline > Fewer rare alleles > Positive Tajima's .\n", + "\n", + "+ Stable Population > Balanced allele frequencies > Tajima’s D= 0.\n", + "\n", + "‘These tends help researchers infer historical der raphic changes in populations from genetic data,\n", + "\n", + "Fragment\n", + "\n", + "= ————\n", + "\n", + "=. —,\n", + "> sequencing *——— a\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "This slide focuses on the effect of slow fluctuations in population size on the effective\n", + "population size (V.) and emphasizes the conditions under which the harmonic mean formula for\n", + "N- applies. Slow fluctuations occur when the time period of interest (Z’) is much shorter than the\n", + "minimum population size (min[N;]) across the fluctuation cycle. In such cases, the population\n", + "\n", + "-1\n", + "size appears relatively stable, and the harmonic mean formula (N. = (4 wh x) ) may not\n", + "\n", + "accurately represent the effective population size over longer periods. The diagram illustrates that\n", + "during slow fluctuations, the coalescent events occur more gradually, and population size changes\n", + "are less abrupt compared to rapid fluctuations. The key message is that for the harmonic mean\n", + "calculation to be meaningful, the time scale of observation (Z') must be significantly smaller than\n", + "the scale of population size changes, ensuring accurate modeling of genetic drift and coalescence\n", + "\n", + "processes over generations.\n", + "\n", + "= F, compares the average expected heterozygosity of\n", + "individual subpopulations (S) to the total expected\n", + "heterozygosity if the subpopulations are combined (T).\n", + "\n", + "py, = n= Hs) -\\-(%)\n", + "H, H,\n", + "\n", + "@ Safari File Edit\n", + "\n", + "M- <\n", + "\n", + "Paraphraser\n", + "\n", + "x\n", + "\n", + "Grammar\n", + "Checker\n", + "\n", + "ws\n", + "Al Detector\n", + "Q@\n", + "\n", + "Plagiarism\n", + "Checker\n", + "\n", + "@\n", + "Al\n", + "Humanizer\n", + "\n", + "ie)\n", + "Al Chat\n", + "\n", + "cs\n", + "\n", + "Al lmage\n", + "Generator\n", + "\n", + "Summarizer\n", + "\n", + "MA\n", + "\n", + "Translate\n", + "\n", + "99\n", + "\n", + "Citation\n", + "Generator\n", + "\n", + "eG\n", + "\n", + "QuillBot\n", + "Flow\n", + "\n", + "a\n", + "a\n", + "QuillBot for\n", + "macOS\n", + "\n", + "View History Bookmarks Window’ Help\n", + "\n", + "118\n", + "\n", + "© ® ec\n", + "\n", + "nH Al Detector - QuillBot Al oe G\n", + "\n", + "quillbot.com\n", + "\n", + "@&@ GO BO ¥ 3 ©\n", + "\n", + "Gea ©\n", + "\n", + "(4) Perfect your writing in all your favorite apps with QuillBot for macOS\n", + "\n", + "Al Detector\n", + "\n", + "English French Spanish German Ally\n", + "\n", + "factors. Also some of the differential genes were associated with compartment switches too, W\n", + "especially upregulated ones, but these were not statistically significant. It was seen that\n", + "upregulated genes had more significant structural links as compared to the downregulated\n", + "genes. Although the smaller number of downregulated genes may reduce statistical power,\n", + "\n", + "the consistent lack of enrichment across architectural levels suggests that their regulation is\n", + "\n", + "less connected to architecture reorganization.\n", + "\n", + "Taken together, the transcriptional changes in the PRC2 mutant are linked to regions\n", + "undergoing architectural reorganisation in the form of loops, weak insulation, and\n", + "compartment switches. It was also noted that not all architectural changes connected to\n", + "transcriptional changes, and not all DEGs aligned with structural reorganization, implying\n", + "presence of additional regulatory layers. Chromatin architecture provides a necessary\n", + "framework for gene regulation, but it may not be sufficient on its own.\n", + "\n", + "with many being linked to upregulated genes. These results indicate that the effect of PRC2\n", + "\n", + "loss on transcription is not restricted to newly formed contacts but extends across different\n", + "\n", + "categories of loop stability. Moreover, genes were often contacted by multiple loops, in some\n", + "cases over ten, pointing to a high degree of regulatory connectivity. The reason for this\n", + "multiplicity or redundancy was not explored in terms of log fold change. Some genes had\n", + "\n", + "oD\n", + "\n", + "2,909 Words @ Analysis complete\n", + "\n", + "Want your text to sound more authentic?\n", + "\n", + "Model Version: v5.7.1\n", + "\n", + "2%\n", + "\n", + "of text is likely Al ©\n", + "© QuillBot\n", + "\n", + "Al\n", + "\n", + "Al-generated @\n", + "Al-generated & Al-refined @\n", + "Human-written & Al-refined @\n", + "\n", + "Human-written @\n", + "\n", + "¥Y Understanding your results\n", + "\n", + "Human\n", + "\n", + "< Share\n", + "\n", + "@ Tue 14. Oct 22:33\n", + "\n", + "a\n", + "\n", + "&} Apps and Extensio...\n", + "\n", + "& Download =\n", + "\n", + "Feedback\n", + "\n", + "D\n", + "\n", + "History\n", + "\n", + "oO 22%\n", + "0%\n", + "0%\n", + "\n", + "98%\n", + "\n", + "Refine with Paraphra\n", + "\n", + "v\n", + "\n", + "ae (eG\n", + "\n", + "\n", + "Figure 2 Testing MSMC on simulated data. a b ~ 10,000 years ago. Sees\n", + "\n", + "(a) To test the resolution of MSMC applied to = Simulation — 40,000 years ago, 8 haplotypes\n", + "two, four and eight haplotypes, we simulated — 2 haplotypes +++» 100,000 years ago, simulation\n", + "a series of exponential population growths and — 4 haplotypes — 100,000 years ago, 4 haplotypes\n", + "\n", + "— 100,000 years ago, 8 haplotypes\n", + "\n", + "— 8 haplotypes\n", + "\n", + "declines, each changing the population size by\n", + "a factor of ten. MSMC recovers the resulting\n", + "zigzag pattern (on a double-logarithmic plot)\n", + "in different times, depending on the number\n", + "of haplotypes. With two haplotypes, MSMC\n", + "infers the population history from 40,000 to\n", + "\n", + "3 million years ago, whereas, with four and\n", + "eight haplotypes, it infers the population\n", + "history from 8,000 to 30,000 years ago ra 7 ;\n", + "and from 2,000 to 50,000 years ago, 10 10 10 10 10 10\n", + "respectively. (b) Model estimates from two Time (years ago) Time (years ago)\n", + "\n", + "simulated population splits 10,000 and 100,000 years ago. The dotted lines plot the expected relative cross coalescence rate between the two\n", + "populations before and after the splits. Maximum-likelihood estimates are shown in red (four haplotypes) and purple (eight haplotypes). As expected,\n", + "four haplotypes yield good estimates for the older split, whereas eight haplotypes give better estimates for the more recent split.\n", + "\n", + "ond\n", + "o\n", + "\n", + "°\n", + "©\n", + "\n", + "10°\n", + "\n", + "°\n", + "b\n", + "\n", + "10*\n", + "\n", + "Effective population size\n", + "°\n", + "ny\n", + "\n", + "Relative cross coalescence rate\n", + "o\n", + "o\n", + "\n", + "°\n", + "\n", + "\n", + "eco (ff = > OQ VD BG monkeytype.com Ws. Search SEARXNG-NALAKATH eo @°\n", + "\n", + "New merch store now open, including a limited edition metal keycap! monkeytype.store x\n", + "\n", + "70\n", + "94%\n", + "\n", + "cautich 80 176/7/86/0 82% 30s\n", + "\n", + "english\n", + "\n", + "GO @QZx,gQvnvuds» HM BOC BD.\n", + "\n", + "&\n", + "\n", + "Workspaces v <_|/txtai: @ All-in-one a Examples - txtai () txtai/examples/13_Similar () txtai/examples/38_Introdu Mc Introducing RAG with txta *K Image caption generation (m) Monkeytype | A minimalis' > +\n", + "\n", + "0@e S CI CQ Reset —O——$—$——— 100% = 21:20\n", + "\n", + "g\n", + "&\n", + "\n", + "‘oUala Cel (urgor IS Freguiatea DY a\n", + "\n", + "ABA complex network of interacting second\n", + "Se messengers, pH, membrane potential,\n", + "protein phosphorylation, ion channel\n", + "NOE 10 activity — and more!!\n", + "\n", + "\n", + "Variable population size\n", + "\n", + "Beyond the Standard Neutral Model\n", + "\n", + "Slow fluctuations\n", + "in population size : = =\n", + "\n", + "4 Need:\n", + "A, 7 T << min[N, |\n", + "\n", + "\n", + "@ = Safari File\n", + "\n", + "Edit View\n", + "\n", + "History\n", + "\n", + "¥% © & @ &#\n", + "\n", + ">\n", + "cod\n", + "\n", + "Q\n", + "\n", + "S Mon3.Nov 14:17\n", + "\n", + "-\n", + "eco -\n", + "\n", + "rp | A pipeline for...\n", + "\n", + "HUMAN CELL ATLAS,\n", + "DATA EXPLORER\n", + "\n", + "<\n", + "\n", + "Q tINIT tutorial...\n", + "\n", + "Bookmarks Window Help\n", + "0O9eW¢s8\n", + "Ce) The integrate... © Swagger UI\n", + "\n", + "explore.data.humancellatlas.org\n", + "\n", + "ce) Choose Expor...\n", + "\n", + "Explore > Export Selected Data > Download Selecte..\n", + "\n", + "Download Selected Data Using “curl”\n", + "\n", + "io Census data...\n", + "\n", + "io The integrate...\n", + "\n", + "Gea ©\n", + "\n", + "(=) HLCA/docs/fa...\n", + "\n", + "e Files from projects with access \"required\" will be excluded from this export.\n", + "\n", + "Download via curt\n", + "Species\n", + "\n", + "Mus musculus\n", + "\n", + "Homo sapiens\n", + "\n", + "File Type\n", + "Name\n", + "bai\n", + "\n", + "bam\n", + "\n", + "cmd.exe\n", + "\n", + "quest curl Command\n", + "\n", + "File Count\n", + "\n", + "22.0k\n", + "\n", + "22.0k\n", + "\n", + "22\n", + "\n", + "File Size\n", + "\n", + "39.15 GB\n", + "\n", + "3.98 TB\n", + "\n", + "24.66 GB\n", + "\n", + "The generated curl command is compatible with the Bash shell on Mac and Linux systems,\n", + "and the Command shell on Windows systems, and will remain valid for seven days.\n", + "\n", + "Current Query\n", + "\n", + "Access\n", + "true\n", + "\n", + "Genus Species\n", + "Homo sapiens\n", + "\n", + "Paired End\n", + "true\n", + "\n", + "Nucleic Acid Source\n", + "single cell\n", + "\n", + "File Source\n", + "DCP/2 Analysis\n", + "\n", + "File Format\n", + "loom\n", + "\n", + "Selected Data Summary\n", + "\n", + "Estimated Cells\n", + "570.8k\n", + "\n", + "File Size\n", + "24.66 GB\n", + "\n", + "Files\n", + "22\n", + "\n", + "Projects\n", + "19\n", + "\n", + "Species\n", + "Homo sapiens\n", + "\n", + "Donors\n", + "45\n", + "\n", + "Disease Status (Donor)\n", + "4 disease statuses\n", + "\n", + "Specimens\n", + "775\n", + "\n", + "Disease Status (Specimen)\n", + "3 disease statuses\n", + "\n", + "Anatomical Entity\n", + "12 anatomical entities\n", + "\n", + "Organ Part\n", + "14 organ parts\n", + "\n", + "Library Construction Method\n", + "2 library construction methods\n", + "\n", + "Paired End\n", + "true\n", + "\n", + "Downloaded and exported data is\n", + "\n", + "@ ChatGPT - Dr...\n", + "\n", + "Pastebin.com...\n", + "\n", + "a\n", + "© @ +\n", + "\n", + "Ce) Download Sel\n", + "\n", + "Help & Documentation + e@\n", + "\n", + "(\n", + "\n", + "v Please select\n", + "Chalmers tekniska hoegskola AB\n", + "Goteborgs Universitet\n", + "Handelshégskolan i Stockholm (HHS)\n", + "Hégskolan i Halmstad\n", + "Karlstads universitet\n", + "Karolinska Institutet\n", + "Kungliga Tekniska H6gskolan (KTH)\n", + "Linképings universitet (LiU)\n", + "Linnéuniversitetet\n", + "Lulea tekniska universitet\n", + "Lunds universitet\n", + "Stockholms universitet\n", + "Sveriges lantbruksuniversitet (SLU)\n", + "Umea universitet\n", + "Uppsala universitet\n", + "\n", + "Genetic context of bacterial aqpN genes\n", + "\n", + "44 AQPNsinKEGG (45% in arsenic resistance operons — 55 % in NO operon)\n", + "57 AQPNsin NCBI (68% in arsenic resistance operons — 32 % in NO operon)\n", + "As(V)\n", + "\n", + "As(II!)\n", + "\n", + "Progeny genotypes\n", + "\n", + "p2\n", + "PH\n", + "\n", + "=\n", + "—\n", + "eo\n", + "\n", + "(1/4)H2 (1/2)H2 (1/4)H2\n", + "HQ HQ\n", + "\n", + "2(P+(1/2)H)\n", + "(P rads (Q +(1/2)H) (Q oo\n", + "\n", + "\n", + "Arbuscule development\n", + "\n", + "a cee. ees\n", + "SbtM1 Gene\n", + "ceeennnnnnennnsnnenseneesennenennsenneenenennseneenensesnenasenennsecunmeaneneanees expression\n", + "BCPI\n", + "\n", + "PM Cell wall\n", + "\n", + "C | stage! Stage Il Stage Ill Stage lV Stage V\n", + "PPA Cell entry Birdsfoot Mature arbuscule Collapsed arbuscule\n", + "t t t t TL\n", + "CYCLOPS RAM1, RAM2 OsPT13\n", + "DIS\n", + "RED\n", + "\n", + "3 VAMPs @ PT4 tT ] SbtM1 P BCPI\n", + "\n", + "Scientific interests\n", + "\n", + "Research Interests:\n", + "\n", + "Description: At this stage, which research areas and scientific questions are you most interested in exploring during your PhD? Please describe the techniques and\n", + "methods you are currently considering. (min. 100 words - max. 400 words)\n", + "\n", + "CURRENT research area (Primary) Computational Biology, Genomes and Evolution\n", + "\n", + "Scientific Question:\n", + "Click here to enter your comments (What excites you about doing science?)\n", + "Applicant's answer:\n", + "\n", + "Epigenetic basis of complex Spontaneous epimutations Epigenetic clocks Machine learning of 3D\n", + "\n", + "traits chromatin contacts\n", + "\n", + "Genomic and epigenomic basis\n", + "\n", + "of high-alpine adaptation\n", + "\n", + "Usefulness of crosses\n", + "\n", + "Selection of Parents\n", + "\n", + "U, = Cj +R;\n", + "\n", + "m midparent value, perfect predictor of cj with additive gene action\n", + "\n", + "4 and absence of epistasis\n", + "\n", + "0.7 ¢ (0.8\n", + "\n", + "Vinyl\n", + "Rij = iho,\n", + "\n", + "i prediction difficult\n", + "\n", + "\n", + "Method\n", + "\n", + "Heterozygosity\n", + "\n", + "Nucleotide diversity (tt)\n", + "\n", + "Site Frequency Spectrum (SFS)\n", + "Linkage Disequilibrium (LD)\n", + "Tajima’s D\n", + "\n", + "Runs of Homozygosity (ROH)\n", + "\n", + "Effective Population Size (Ne)\n", + "\n", + "Signature of Bottleneck\n", + "\n", + "Decreased heterozygosity\n", + "\n", + "Reduced genetic diversity\n", + "\n", + "Skew toward intermediate alleles\n", + "\n", + "Increased LD, slower decay\n", + "\n", + "Positive values due to allele frequency shift\n", + "Longer ROH in bottlenecked populations\n", + "\n", + "Sudden decrease in Ne\n", + "\n", + "3. What sort of growth pattern in the epidermis would explain\n", + "the kink formation?\n", + "\n", + "°\n", + "3.1. Is there any cellular evidence for PD growth signal in epidermis?\n", + "\n", + "\n", + "Genome vv Tracks ¥ Sample Info v Session v Share Bookmark Save Image Circular View v Help v\n", + "\n", + "IGV oxford_e...me.fasta tig00000002:1,989,819-1,993,234 Q 3,416 bp (Select Tracks ) (Crosshairs )(_Center Line )(TrackLabels) @ +)\n", + "1,990 kb j 1,991 kb j 1,992 kb j 1,993 kb\n", + "AQ 0 EA A MY TAY A AY a\n", + "|= SS SS en |\n", + "tnaB tnaA mnmE_1\n", + "\n", + "INSTITUTE\n", + "\n", + "Heng igv.org UCSan Diego fe BROAD\n", + "\n", + "\n", + "Genome vv Tracks ¥ Sample Info v Session v Share Bookmark Save Image Circular View v Help v\n", + "\n", + "GV oxford_e...me.fasta _ tig00000002:2,754-6,178 Q 3,425 bp (Select Tracks )( Crosshairs )( Center Line ){ Track Labels ) (—) auu==® +)\n", + "3 kb j 4 kb j 5 kb j 6 kb\n", + "LSA A A 8 a\n", + "po ee ss sss | %\n", + "dadA_1 IKAOHOFJ_00007 fadR_1\n", + "pac Pi\n", + "dadA_2 fadR_2\n", + "\n", + "igv.org UCSanDiego EEBROAD\n", + "\n", + "INSTITUTE\n", + "\n", + "i al\n", + "\n", + "Leaf Hi-C K4me3 HiChIP K27me3 HiChIP\n", + "\n", + "eQTL-gene\n", + "links >20 kb |\n", + "\n", + "shuffled pairs\n", + "\n", + "\n", + "Leaf Hi-C\n", + "\n", + "N=OS (fihered) 347 (unique) 347 (total), PALL = 0,909\n", + "\n", + "eQTL-gene —\n", + "links >20 kb .\n", + "\n", + "shuffled pairs\n", + "\n", + "\n", + "Figure 3 Inference of population size from whole- —— YRI (Nigeria) —— CHB (China)\n", + "\n", + ". : . — MKK (Kenya) — JPT (Japan)\n", + "genome sequences. (a) Population size estimates —— LWK (Kenya) — GIH (N. India\n", + "indivi — CEU (N.Europe) —— MXL (Mexico — CEU (N. Europe)\n", + "\n", + "from four haplotypes (two phased individuals) a — fsiaayy pe) — we re Rletive American) b ~ TSI (aly)\n", + "from each of nine populations. The dashed line — CHB (China)\n", + "was generated from a reduced data set of only the © g 10° — JPT (Japan)\n", + "Native American components of the MXL genomes. 3 ae — GIH (N. India)\n", + "\n", + ". < 5 10 — YRI (Nigeria)\n", + "Estimates from two haplotypes for CEU and YRI g FS — LWK (Kenya)\n", + "are shown for comparison as dotted lines. 2 2 108\n", + "N, Northern. (b) Population size estimates from a a\n", + "eight haplotypes (four phased individuals) from the g 2 108\n", + "same populations as in a but excluding MXL and 3 E 10!\n", + "\n", + "Ww\n", + "\n", + "MKK. In contrast to estimates with four haplotypes,\n", + "estimates are more recent. For comparison, we\n", + "show the result from four haplotypes for CEU,\n", + "\n", + "10° 104 10°\n", + "CHB and YRI as dotted lines. Time (years ago) Time (years ago)\n", + "\n", + "\n", + "Leaf Hi-C K4me3 HiChIP K27me3 HiChIP\n", + "\n", + "face mar rapa mat\n", + "\n", + "eQTL-gene\n", + "links >20 kb\n", + "\n", + "shuffled pairs :\n", + "\n", + "\n", + "(mustache_aman) [papantonis1@gwdu1@1 aman]$ awk '$1 == $9 {print $1}' GE02457_dots_5kb.bedpe | sort | unig -c && wc -1 GE02457_dots_5kb.bedpe\n", + "842 chri\n", + "413 chr1e\n", + "465 chri1\n", + "442 chr12\n", + "244 chri3\n", + "254 chri4\n", + "234 chris\n", + "174 chri16\n", + "248 chr17\n", + "196 chri8\n", + "122 chri9\n", + "817 chr2\n", + "196 chr2e\n", + "\n", + "81 chr21\n", + "78 chr22\n", + "731 chr3\n", + "\n", + "594 chr4\n", + "609 chr5\n", + "631 chr6é\n", + "478 chr7\n", + "\n", + "505 chr8&\n", + "349 chr9\n", + "\n", + "184 chrx\n", + "\n", + "8888 GE02457_dots_5kb.bedpe\n", + "\n", + "ge Plant Epigenome\n", + "Browser\n", + "\n", + "sect\n", + "\n", + "anc seg 3\n", + "\n", + "meg :\n", + "Bea meri\n", + "\n", + "rae ‘\n", + "\n", + "irsenabnitninpmeyiityrr afl mnie ahi\n", + "\n", + "\n", + "Chromosomes\n", + "\n", + "o\n", + "\n", + "8\n", + "\n", + "ae\n", + "\n", + "Show\n", + "\n", + "Observed\n", + "\n", + "Normalization (Obs | Ctrl)\n", + "\n", + "None ¢\n", + "\n", + "Bala...\n", + "\n", + "Resolution (BP)\n", + "\n", + "I rrdtdot ttt td\n", + "2.5MB 500KB 100KB 25KB 5KB 1KB 200BP\n", + "\n", + "OMB\n", + "\n", + "\n", + "Genetic context of bacterial aqpN genes TUT\n", + "\n", + "44 AQPNsinKEGG (45% in arsenic resistance operons — 55 % in NO operon)\n", + "\n", + "57 AQPNs in NCBI (68% in arsenic resistance operons — 32 % in NO operon)\n", + "\n", + "As(V)\n", + "\n", + "transporter\n", + "\n", + "As(ltl)\n", + "\n", + "= > > | >> >>>\n", + "f GipF Aqpz |\n", + "\n", + "Crop\n", + "Physiology\n", + "56\n", + "\n", + "\n", + "clonalAbundance\n", + "\n", + "We can also examine the relative distribution of clones by abundance. Here clonalAbundance() will produce a line graph with a total\n", + "number of clones by the number of instances within the sample or run. Like above, we can also group.by this by vectors within the\n", + "contig object using the group.by variable in the function.\n", + "\n", + "clonalAbundance(combined. TCR,\n", + "cloneCall = \"gene\",\n", + "scale = FALSE)\n", + "\n", + "5000\n", + "4000\n", + "Samples\n", + "— P17B\n", + "3 PI7L\n", + "5 3000\n", + "ro) — P18B\n", + "ao) — P18L\n", + "3 — P19B\n", + "2000\n", + "5 — PI19L\n", + "Zz\n", + "— P20B\n", + "— P20L\n", + "1000\n", + "oo §\n", + "1 10 100 1000\n", + "Abundance\n", + "\n", + "clonalAbundance() output can also be converted into a density plot, which may allow for better comparisons between different\n", + "repertoire sizes, by setting scale = TRUE.\n", + "\n", + "clonalAbundance(combined.TCR, cloneCall = \"gene\", scale = TRUE)\n", + "\n", + "Gibberellin biosynthesis is well understood TUT\n", + "\n", + "core\n", + "SS\n", + ") The “green revolution”\n", + "semidwarf1 rice variety is\n", + "af mutated in a GA20ox that is\n", + "expressed in shoots but not\n", + "\n", + "GAg\n", + "\n", + "a\n", + "\n", + "1\n", + "i 5, ¢ ot — ent-kaurenoic acid in reproductive tissues, ; Q\n", + "i | t Ga2q GA. leading to increased grain |\n", + "i GA GAs . yields.\n", + "a; —_—_\n", + "5A200 .\n", + "\\ GA _ Sasaki ef al. & Matsuoka, 2002, Nature\n", + "A > GA, —+ GA, —+> GAy, Spielmeyer et al. & Chandler, 2002, PNAS\n", + "”\n", + "\n", + "4 Ce Gazal\n", + "\n", + "CA» ——> GA,\n", + "\n", + "~\n", + "\n", + "GA30x\n", + "\n", + "GA\n", + "\n", + "GAs ——® GA\n", + "\n", + "Brigitte Poppenberger (TUM) Hernandez-Garcia et al & Blazquez, 2021, Sem. Cell Dev. Biol 13\n", + "\n", + "\n", + "Genome vv Tracks ¥ Sample Info v Session v Share Bookmark Save Image Circular View v Help v\n", + "\n", + "IGV oxford_e...me.fasta _ tig00000002:1,604,261-1,606,695 § Q. 2,435 bp (Select Tracks ) (\"Crosshairs )(_Center Line )(TrackLabels) @ iE +)\n", + "\n", + "C D)\n", + "\n", + "604,300 bp 1,604,500 bp 1,604,700 bp 1,604,900 bp 1,605,100 bp 1,605,300 bp 1,605,500 bp 1,605,700 bp 1,605,900 bp 1,606,100 bp 1,606,300 bp 1,606,500 bp 1,606,\n", + "L i 1 L L L 1 L L L 1 L L L 1 L L L 1 L L L 1 L L L 1 L L L 1 L L L 1 L L L 1 L L i 1 L L L 1 L\n", + "\n", + "%\n", + "ee Pe | ZZ\n", + "\n", + "pdeC_1 IKAOHOFJ_01847 ssb\n", + "\n", + "pdeC_2\n", + "\n", + "Investigating the Impact of Hexaploidization on Gene Expression in Oat: in this project, we compare gene expression in hexaploid oat\n", + "species with their tetraploid ancestors. The aim is to explore how the addition of a new genome through hybridization has affected gene\n", + "regulation.\n", + "\n", + "Results - 1. Confocal Images\n", + "\n", + "~ Adaxial oi1\n", + "\n", + "Nucelus ss __-» Abaxial oi2\n", + "\n", + "Adaxial ii1. <——\n", + "\n", + "Abaxial ii2 __—» Chalaza\n", + "\n", + "—+ Funiculus\n", + "\n", + "chromosome1 x1 x2 chromosome2 yl y2 color observed\n", + "expected_bottom_left expected_donut expected_horizontal expected_vertical\n", + "fdr_bottom_left fdr_donut fdr_horizontal fdr_vertical\n", + "number_collapsed centroid1 centroid2 radius\n", + "\n", + "eoo <— > OQ VD G monkeytype.com Ws Search SEARXNG-NALAKATH eave @ @ ~™@®\n", + "ay & a Ab New merch store now open, including a limited edition metal keycap! monkeytype.store x\n", + "\n", + "monkeytype\n", + "\n", + "73\n", + "96%\n", + "\n", + "cautich 76 182/3/1/0 84% 30s\n", + "\n", + "GCO@Qe2® ag Avnud9g HSBTOC BD\n", + "\n", + "english\n", + "&\n", + "S Workspaces v (mi) Monkeytype | A minimalisti + Vv\n", + "0&8 S CI CQ Reset Om 100% 11:12\n", + "\n", + "3.3 SUVRS affects the gene region more than the TE region\n", + "\n", + "\n", + "~*~\n", + "\n", + "@ Safari File Edit View History Bookmarks Develop Window Help @ ne) ¥@6© €& @) F Q ® Fri21.Nov 14:15\n", + "eecax m&-< ‘on © {1| @ 2g pax-db.org Ga co Oo +\n", + "{0.0} Yes. The paper states that t... iG] geckopy/geckopy/experimen... FE] geckopy — geckopy 0.0.1 do... a= PaxDb - Help 2 https://pax-db.org/download... a PaxDb - Download iN} FragPipe workflows | FragPipe\n", + "\n", + "paxdb®° PaxDb: Protein Abundance Database\n", + "\n", + "dary\n", + "Pio,\n", + "\n", + "x protein(s) id/name\n", + "\n", + "PaxDb Downloads\n", + "\n", + "Accessory files\n", + "\n", + "e All datasets can be found here paxdb-abundance-files.zip (~31MB).\n", + "\n", + "¢ Protein sequences fasta file can be found here paxdb-protein-sequences.zip (~498MB).\n", + "¢ Mapped peptides files can be found here paxdb-mapped_peptides.zip (~193MB).\n", + "\n", + "¢ Orthologs list can be found here paxdb-orthologs.zip (~25MB).\n", + "\n", + "e UniProt mappings can be found here paxdb-uniprot-links.zip (~11MB).\n", + "\n", + "e Files from previous PaxDB versions can be found here: /downloads/\n", + "\n", + "Per-species abundance files\n", + "\n", + "ES -\n", + "\n", + "Species Datasets J?\n", + "Homo sapiens 375\n", + "Mus musculus 175\n", + "\n", + "DOWNLOAD\n", + "\n", + "COMPUTE+;\n", + "\n", + "REQUEST+*;\n", + "\n", + "Download\n", + "\n", + "9606.zip\n", + "\n", + "10090.zip\n", + "\n", + "WHAT'S NEW4;\n", + "\n", + "HELP\n", + "\n", + "\n", + "First 5 rows and columns of raw genotype data:\n", + "\n", + "Cl\n", + "\n", + "dddde|\n", + "trop\n", + "\n", + "dddae\n", + "\n", + "dddd|\n", + "rrr\n", + "\n", + "ddd\n", + "\n", + "dddd|\n", + "rrr\n", + "\n", + "a)\n", + "\n", + "dadded\n", + "rrr\n", + "\n", + "eSeeooe\n", + "\n", + "dade|\n", + "\n", + "eeoo\n", + "\n", + "® -1]]]\n", + "\n", + "\n", + "AN Tene enginevelry, 5 Py weeds\n", + "- eZ Anal biota Beat dp\n", + "ate - Tyce Gear bei Oo\n", + "46, Trtwesip * Feashig UP ES\n", + "\n", + "yor\n", + "\n", + "SONGS [ab 2 S welts wort\n", + "\n", + "Coup peggy\n", + "\n", + "Repars — PDO)\n", + "Brcacvnud,\n", + "Summer school\n", + "\n", + "Reding Dalukinnoyy gprs\n", + "L¥ Pap & Stiles\n", + "Chater — Pronses\n", + "Saf & Stee Duatle fo Ui?\n", + "Anping fe Hos. Haig HG\n", + "\n", + "a\n", + "\n", + "I\n", + "I\n", + ".* a\n", + "\n", + "LIEN TIE uc\n", + "\n", + "olathe id- \"4 ut ]\n", + "Figure 2 | Haplotype pattern in a region defined by SNPs that are at high\n", + "frequency in Tibetans and at low frequency in Han Chinese. Each column is\n", + "a polymorphic genomic location (95 in total), each row is a phased haplotype\n", + "(80 Han and 80 Tibetan haplotypes), and the coloured column on the left\n", + "denotes the population identity of the individuals. Haplotypes of the Denisovan\n", + "individual are shown in the top two rows (green). The black cells represent the\n", + "presence of the derived allele and the grey space represents the presence of\n", + "the ancestral allele (see Methods). The first and last columns correspond to the\n", + "first and last positions in Supplementary Table 3, respectively. The red and\n", + "blue arrows indicate the 32 sites in Supplementary Table 3. The blue arrows\n", + "represent a five-SNP haplotype block defined by the first five SNPs in the\n", + "32.7-kb region. Asterisks indicate sites at which Tibetans share a derived allele\n", + "with the Denisovan individual.\n", + "\n", + "\n", + "Building regulatory landscapes\n", + "reveals that an enhancer can recruit\n", + "cohesin to create contact domains,\n", + "engage CTCF sites and activate\n", + "distant genes\n", + "\n", + "Rinzema NJ, Sofiados k, [...], de Laat W\n", + "\n", + "Nature Structural & Molecular Biology (2022)\n", + "\n", + "[| DOWNLOAD | 2022\n", + "\n", + "Robust detection of translocations in\n", + "lymphoma FFPE samples using\n", + "targeted locus capture-based\n", + "sequencing\n", + "\n", + "Allahyar A, Pieterse M, [...], de Laat W\n", + "\n", + "NATURE COMMUNICATIONS: 12:3361\n", + "\n", + "[| DOWNLOAD | 2021\n", + "\n", + "Ready-to-use public infrastructure\n", + "for global SARS-CoV-2 monitoring\n", + "Krijger PHL, Hoek TA, [...], de Laat W, Tanenbaum M\n", + "\n", + "Nature Biotechnology 39: 1178-1184\n", + "\n", + "[| DOWNLOAD | 2021\n", + "\n", + "Novel orthogonal methods to\n", + "uncover the complexity and diversity\n", + "of nuclear architecture\n", + "\n", + "Tjalsma SJD, de Laat W\n", + "\n", + "Current Opinion in Genetics & Development: 67:10-17\n", + "\n", + "[| DOWNLOAD | 2021\n", + "\n", + "Interplay between CTCF boundaries\n", + "and a super enhancer controls\n", + "cohesin extrusion trajectories and\n", + "gene expression\n", + "\n", + "Vos ESM, Valdes-Quezada C, Huang Y [...], de Laat\n", + "Ww\n", + "\n", + "Mol. Cell 81(15):3082-3095\n", + "\n", + "[| DOWNLOAD | 2021\n", + "\n", + "How chromosome topologies get\n", + "their shape: views from proximity\n", + "ligation and microscopy methods\n", + "Huang Y, Neijts R, de Laat W\n", + "\n", + "FEBS Letters: 594 3439-3449\n", + "\n", + "[| DOWNLOAD | 2020\n", + "\n", + "Instituto Universitario de Lisboa (ISCTE IUL)\n", + "UNIVERSIDADE CATOLICA PORTUGUESA\n", + "Universidade de Coimbra\n", + "\n", + "Universidade de Evora\n", + "\n", + "Universidade de Lisboa\n", + "\n", + "Universidade do Porto\n", + "\n", + "Universidade Nova de Lisboa\n", + "\n", + "Ice ot\n", + "‘earn ere ta rao pen 2 prema ne oe [eeremsne [seen]\n", + "\n", + "FastQC: Per Sequence GC Content\n", + "Pea Samp\n", + "\n", + "Per Base N Content [aim\n", + "\n", + "‘epocenapecttancastcan poten ren an asa\n", + "\n", + "FastQC: Per Base N Content\n", + "\n", + "‘Sequence Length Distribution [a\n", + "\n", + "Mimosa equa ci ng)\n", + "\n", + "‘Sequence Duplication Levels SE (ome)\n", + "‘eae ge yer\n", + "[eeewwres [cere]\n", + "\n", + "FastQC: Sequence Duplication Levels,\n", + "\n", + "Overrepresented sequences by sample SKIN\n", + "\n", + "‘Pett arr ctonnpeericsminceanh eaten.\n", + "\n", + "Top overrepresented sequences\n", + "\n", + "‘ie onmmteeseince sr ssarde The soe 2 trent ser cern aye noosa yr\n", + "\n", + "‘Adapter Content [ZI [ome]\n", + "\n", + "‘Peamusiepenep cathe sana yay te asa en aspen enon\n", + "[eeremsne [seen]\n", + "\n", + "FastQC: Adapter Content\n", + "\n", + "\n", + "% TADS\n", + "\n", + "Sequencing technologies have been a driving force in genomics science\n", + "since the 70's.\n", + "\n", + "After reading the article De novo genome assembly: what every biologist\n", + "should know (Published: March 2012)\n", + "\n", + "Link: https://www.nature.com/articles/nmeth.1935,\n", + "\n", + "1. Pick one issue or problem that is mentioned in it. Describe it shortly with\n", + "your own words and try to produce a possible solution for it based on what\n", + "you have learnt in this course so far (it doesn't matter if your solution is\n", + "\n", + "really doable)\n", + "\n", + "2. Share your problem description and solution in TWO places:\n", + "In the discussion forum “Impact of sequencing technology\" and submit the\n", + "same text also in the task “Impact of sequencing technology”. Please, read in\n", + "\n", + "the discussion forum your peers’ answers,\n", + "\n", + "TAL\n", + "TECH\n", + "\n", + "NEXT STEPS |\n", + "\n", + "CHECK THE DEADLINES IN MOODLE\n", + "\n", + "* Read \"De novo genome assembly: what every biologist should know\"\n", + "\n", + "* Do and submit Coursework 3 (based on the lectures so far + reading).\n", + "\n", + "Once you have done all these, you may move on to the \"Week 4, Session 1\"\n", + "\n", + ". 7 sad\n", + "\n", + "Ethylene induces expression of ACS genes during ripening\n", + "\n", + "ACS ACO\n", + "\n", + "SAM LEACSS ACC — > C2H, — Perception\n", + "DS\n", + "LEACS1A —\n", + "4 LEACS4 =e)\n", + "LEACS2\n", + "\n", + "Developmentally\n", + "regulated\n", + "\n", + "Brigitte Poppenberger (TUM) Plant Cell teaching tool\n", + "\n", + "OMB\n", + "\n", + "100 MB\n", + "\n", + "200 MB\n", + "\n", + "Chromosomes Show Normalization (Obs | Ctrl) Resolution (BP)\n", + "“aw “aw a a a — y,\n", + "2 Bp Observed Bala... None © Pivrb ttre teins\n", + "2.5MB 500KB 100KB 25KB 5KB 1KB 200BP\n", + "OMB 100 MB 200 MB 300 MB\n", + "\n", + "\n", + "@ Zoom Workplace\n", + "\n", + "ox\n", + "\n", + "Ww\n", + "\n", + "4\n", + "\n", + "a\n", + "\n", + "Clipboard\n", + "\n", + "11\n", + "\n", + "Slide 10 0f 14 4\n", + "\n", + "Meeting View Edit\n", + "\n", + "[==] Layout ¥\n", + "\n", + "‘© Reset\n", + "New\n", + "\n", + "Slide v Section\n", + "\n", + "Slides\n", + "\n", + "English (India)\n", + "\n", + "Cy, Accessibility: Investigate\n", + "\n", + "W4\n", + "\n", + "x\n", + "\n", + "Pre\n", + "\n", + "& 00 Ce PD Find i) A\n", + "D | 5 b S yy\n", + "ALLS o5|¥ 82 Replace v\n", + "D Swen Arrange Create PDF Create PDF and Add-ins\n", + "° soe 2 I$ Select v and Share link Share via Outlook\n", + "Font Paragraph Drawing Editing Adobe Acrobat Add-ins\n", + "\n", + "Growth : Tissue expansion\n", + "\n", + "Across stages (from 2-III to 2-V) , interval [0.75-1.00] along PD axis sees the highest tissue expansion\n", + "\n", + "U\n", + "\n", + "What sort of tissue expansion is it (isotropic or anisotropic) ?\n", + "\n", + "MAXMIN Histogram MAXMID Histogram\n", + "(0.75. 1.00} 00} oe\n", + "= 2 7\n", + "} (0.25, 0.30) 4\n", + "- Po om st 0.00, 0.25)\n", + "a =\n", + "\n", + "Average of MAXMID\n", + "\n", + "> No evidence of purely isotropic cell growth in any stage at any interval\n", + "\n", + "> Cell growth on average is anisotropic for all intervals at each stage\n", + "\n", + "“ oo\n", + "= Notes QB comments oo\n", + "\n", + "oO\n", + "\n", + "Window Help Bevwvue@8Wrt+ ort.t@goe wy Se Wed Feb 12 22:24\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(*results, sep='\\n')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6934273 --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup, find_packages + +setup( + name="kg_ocr", + packages=find_packages(), + python_requires=">=3.10", + install_requires=[ + "pytesseract", + "Pillow", + "txtai", + "sentence-transformers", + ], +) diff --git a/tests/test_graph.py b/tests/test_graph.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_indexer.py b/tests/test_indexer.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_ocr.py b/tests/test_ocr.py new file mode 100644 index 0000000..e69de29