#!/usr/bin/env bash
# H1: Representation learning — GM12878 VGAE + IMR90 zero-shot transfer
#
# Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]
#
# Pipeline (per chromosome):
#   1. Build graphs for GM12878 and IMR90 (skipped when the .pt file exists)
#   2. Compute A/B compartments for both cell lines (skipped when the .csv exists)
#   3. Train the VGAE on GM12878 (skipped when model.pt exists)
#   4. Encode IMR90 with the trained model
#   5. Visualise embeddings   } both skipped when SKIP_FIGURES is non-empty
#   6. Compare embeddings     }
#
# Options (override the corresponding environment variable):
#   --chrom NAME   chromosome to process      (env CHROM, default chr1)
#   --res N        resolution passed as --res (env RES,   default 25000)
#
# Other environment variables and their defaults:
#   EPOCHS=300  PATIENCE=50  HIDDEN=256  LATENT=64  SEED=42
#   SKIP_FIGURES   any non-empty value skips steps 5 and 6
set -euo pipefail

# Print a usage summary to stderr and exit with the given status (default 2).
usage() {
  printf 'Usage: bash %s [--chrom chr1] [--res 25000]\n' "${BASH_SOURCE[0]}" >&2
  exit "${1:-2}"
}

# Print an error message to stderr and exit non-zero.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# ── Command-line options ──────────────────────────────────────────────────────
# The flags advertised in the usage line take precedence over CHROM / RES from
# the environment; anything not given falls through to the defaults below.
while (( $# > 0 )); do
  case "$1" in
    --chrom)
      (( $# >= 2 )) || die "--chrom requires a value"
      CHROM="$2"
      shift 2
      ;;
    --chrom=*)
      CHROM="${1#*=}"
      shift
      ;;
    --res)
      (( $# >= 2 )) || die "--res requires a value"
      RES="$2"
      shift 2
      ;;
    --res=*)
      RES="${1#*=}"
      shift
      ;;
    -h | --help)
      usage 0
      ;;
    *)
      printf 'Unknown argument: %s\n' "$1" >&2
      usage
      ;;
  esac
done

# Repository root: two directories above the directory holding this script.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
export PYTHONPATH="$REPO:${PYTHONPATH:-}"

# Auto-activate conda env if its packages are not on the current Python
if ! python -c "import torch_geometric" 2>/dev/null; then
  echo "Activating conda env chromatin_gnn..."
  command -v conda >/dev/null \
    || die "torch_geometric is not importable and conda is not on PATH"
  # Activation hooks may reference unset variables, which would abort the
  # script under `set -u`; relax nounset only for the activation itself.
  set +u
  eval "$(conda shell.bash hook)"
  conda activate chromatin_gnn
  set -u
fi

CHROM="${CHROM:-chr1}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-50}"
HIDDEN="${HIDDEN:-256}"
LATENT="${LATENT:-64}"
SEED="${SEED:-42}"

DATA="$REPO/data"
RESULTS_BASE="$REPO/results/h1_representation"
RESULTS="$RESULTS_BASE/${CHROM}"                # per-chromosome outputs
COMPARTMENTS_DIR="$RESULTS_BASE/compartments"   # shared (filenames include $CHROM)
EXP="$REPO/experiments/h1_representation"

mkdir -p "$RESULTS/figures" "$COMPARTMENTS_DIR"

# ── Step 1: Build graphs ──────────────────────────────────────────────────────
for CELL in gm12878 imr90; do
  # Raw inputs are named with the upper-case cell line (e.g. GM12878.mcool).
  CELL_UPPER=$(echo "$CELL" | tr '[:lower:]' '[:upper:]')
  OUT="$DATA/processed/$CELL/${CHROM}.pt"
  if [[ -f "$OUT" ]]; then
    echo "[$CELL] Graph already exists, skipping"
  else
    # Make sure the destination directory exists in case build_graph does not
    # create it itself (not visible from this script).
    mkdir -p "$DATA/processed/$CELL"
    python -m chromatin_gnn.build_graph \
      --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwigs \
        "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
        "$DATA/raw/${CELL_UPPER}_H3K27me3.bw" \
        "$DATA/raw/${CELL_UPPER}_H3K4me3.bw" \
      --out "$OUT"
  fi
done

# ── Step 2: Compute A/B compartments ─────────────────────────────────────────
for CELL in gm12878 imr90; do
  CELL_UPPER=$(echo "$CELL" | tr '[:lower:]' '[:upper:]')
  OUT="$COMPARTMENTS_DIR/${CELL}_${CHROM}.csv"
  if [[ -f "$OUT" ]]; then
    echo "[$CELL] Compartments already exist, skipping"
  else
    python "$EXP/compute_compartments.py" \
      --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwig_orient "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
      --out "$OUT"
  fi
done

# ── Step 3: Train VGAE on GM12878 ────────────────────────────────────────────
if [[ -f "$RESULTS/model.pt" ]]; then
  echo "[GM12878] Model already exists, skipping training"
else
  python "$EXP/train.py" \
    --graph "$DATA/processed/gm12878/${CHROM}.pt" \
    --encoder deep_gcn \
    --hidden "$HIDDEN" --latent "$LATENT" \
    --epochs "$EPOCHS" --patience "$PATIENCE" \
    --lr 3e-4 --dropout 0.3 --beta 0.5 --kl_anneal 100 \
    --seed "$SEED" \
    --outdir "$RESULTS"
fi

# ── Step 4: Zero-shot encode IMR90 ───────────────────────────────────────────
python "$EXP/encode.py" \
  --model "$RESULTS/model.pt" \
  --graph "$DATA/processed/imr90/${CHROM}.pt" \
  --out "$RESULTS/imr90_emb.npy"

# ── Step 5: Visualise (skip with SKIP_FIGURES=1 — useful in genome-wide loops) ─
# NOTE(review): gm12878_emb.npy is not written by any command in this script;
# presumably train.py saves it into --outdir — confirm. When training was
# skipped because model.pt already existed, the file must be left over from
# that earlier run.
if [[ -z "${SKIP_FIGURES:-}" ]]; then
  python "$EXP/visualize.py" \
    --emb "$RESULTS/gm12878_emb.npy" "$RESULTS/imr90_emb.npy" \
    --labels GM12878 IMR90 \
    --compartments \
      "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
      "$COMPARTMENTS_DIR/imr90_${CHROM}.csv" \
    --prefix "$RESULTS/figures/umap" \
    --seed "$SEED"

  # ── Step 6: Compare embeddings ────────────────────────────────────────────
  python "$EXP/compare.py" \
    --emb1 "$RESULTS/gm12878_emb.npy" \
    --emb2 "$RESULTS/imr90_emb.npy" \
    --label1 GM12878 --label2 IMR90 \
    --prefix "$RESULTS/figures/${CHROM}"
else
  echo "[SKIP_FIGURES=1] Skipping visualization and comparison steps."
fi

echo ""
echo "=== H1 complete ($CHROM). Results: $RESULTS ==="
# Which step writes metrics.json is not visible here; do not fail an otherwise
# finished run just because the summary file is absent.
if [[ -f "$RESULTS/metrics.json" ]]; then
  cat "$RESULTS/metrics.json"
else
  printf 'warning: %s not found; no metrics to display\n' \
    "$RESULTS/metrics.json" >&2
fi