chromatin-vgae-hic/experiments/h1_representation/run.sh

#!/usr/bin/env bash
# H1: Representation learning — GM12878 VGAE + IMR90 zero-shot transfer
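# Usage: [CHROM=chr1] [RES=25000] bash experiments/h1_representation/run.sh
# Settings are read from environment variables (CHROM, RES, EPOCHS, PATIENCE,
# HIDDEN, LATENT, SEED, SKIP_FIGURES); unset ones fall back to built-in defaults.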
# Strict mode: stop on a failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root = two directory levels above the directory of this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
REPO="$(cd "$script_dir/../.." && pwd)"

# Put the repository root first on Python's module search path while keeping
# whatever PYTHONPATH the caller supplied (it may be unset).
PYTHONPATH="$REPO:${PYTHONPATH:-}"
export PYTHONPATH

# Auto-activate the chromatin_gnn conda env when torch_geometric cannot be
# imported by the `python` currently on PATH. The import's stderr is silenced
# on purpose: a failed import is the signal being tested, not an error to show.
if ! python -c "import torch_geometric" 2>/dev/null; then
    # Fail with a clear message instead of a bare "command not found".
    if ! command -v conda >/dev/null 2>&1; then
        echo "Error: torch_geometric is not importable and conda was not found on PATH." >&2
        exit 1
    fi
    echo "Activating conda env chromatin_gnn..."
    # conda's shell hook and per-package activate.d scripts may reference unset
    # variables, which would abort this script under `set -u`. Relax nounset for
    # the activation only; errexit stays on, so a failed activation still stops
    # the run.
    set +u
    eval "$(conda shell.bash hook)"
    conda activate chromatin_gnn
    set -u
fi

# ── Run configuration ────────────────────────────────────────────────────────
# Each setting is taken from the environment variable of the same name and
# falls back to the default below when that variable is unset or empty.
CHROM="${CHROM:-chr1}"        # passed as --chrom; also names per-chromosome files
RES="${RES:-25000}"           # passed as --res to graph/compartment steps
EPOCHS="${EPOCHS:-300}"       # passed as --epochs to train.py
PATIENCE="${PATIENCE:-50}"    # passed as --patience to train.py
HIDDEN="${HIDDEN:-256}"       # passed as --hidden to train.py
LATENT="${LATENT:-64}"        # passed as --latent to train.py
SEED="${SEED:-42}"            # passed as --seed to train.py and visualize.py

# Apply the command-line overrides advertised in the usage line. Without this,
# `run.sh --chrom chr2` would silently run with the default chromosome.
#   --chrom NAME | --chrom=NAME   sets CHROM
#   --res N      | --res=N        sets RES
# Flags take precedence over environment values. Unrecognised arguments are
# reported on stderr and otherwise ignored, so invocations that passed extra
# arguments before this parser existed keep working.
# Exits with status 2 when --chrom or --res is given without a value.
parse_cli_overrides() {
    while [ "$#" -gt 0 ]; do
        case "$1" in
            --chrom=*) CHROM="${1#*=}" ;;
            --res=*)   RES="${1#*=}" ;;
            --chrom | --res)
                if [ "$#" -lt 2 ]; then
                    echo "Error: $1 requires a value" >&2
                    exit 2
                fi
                if [ "$1" = "--chrom" ]; then
                    CHROM="$2"
                else
                    RES="$2"
                fi
                shift   # consume the value; the flag itself is shifted below
                ;;
            *)
                echo "Warning: ignoring unrecognised argument: $1" >&2
                ;;
        esac
        shift
    done
}
parse_cli_overrides "$@"

# ── Directory layout (everything is rooted at the repository checkout) ────────
EXP="$REPO/experiments/h1_representation"       # scripts for this experiment
DATA="$REPO/data"                               # inputs and processed graphs
RESULTS_BASE="$REPO/results/h1_representation"  # root of all H1 outputs
COMPARTMENTS_DIR="$RESULTS_BASE/compartments"   # shared across chromosomes; file names carry $CHROM
RESULTS="$RESULTS_BASE/${CHROM}"                # outputs for this chromosome only

# Create the output directories up front; -p makes this a no-op on re-runs.
mkdir -p \
    "$RESULTS/figures" \
    "$COMPARTMENTS_DIR"

# ── Step 1: Build graphs ──────────────────────────────────────────────────────
# For each cell line, produce $DATA/processed/<cell>/<chrom>.pt unless that file
# is already present. Raw inputs are looked up by the upper-cased cell name.
# NOTE(review): the cached path does not include $RES, so an existing graph is
# reused even when RES differs from the run that built it — confirm intended.
for CELL in gm12878 imr90; do
    CELL_UPPER=$(tr '[:lower:]' '[:upper:]' <<<"$CELL")
    OUT="$DATA/processed/$CELL/${CHROM}.pt"

    if [ -f "$OUT" ]; then
        echo "[$CELL] Graph already exists, skipping"
        continue
    fi

    # One bigWig per track, passed to --bigwigs in this fixed order.
    bigwigs=()
    for track in CTCF H3K27me3 H3K4me3; do
        bigwigs+=("$DATA/raw/${CELL_UPPER}_${track}.bw")
    done

    python -m chromatin_gnn.build_graph \
        --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
        --chrom "$CHROM" --res "$RES" \
        --bigwigs "${bigwigs[@]}" \
        --out "$OUT"
done

# ── Step 2: Compute A/B compartments ─────────────────────────────────────────
# One CSV per cell line, written to the shared compartments directory as
# <cell>_<chrom>.csv. A CSV that already exists is left untouched.
for CELL in gm12878 imr90; do
    CELL_UPPER=$(tr '[:lower:]' '[:upper:]' <<<"$CELL")
    OUT="$COMPARTMENTS_DIR/${CELL}_${CHROM}.csv"

    if [ ! -f "$OUT" ]; then
        compartment_args=(
            --mcool "$DATA/raw/${CELL_UPPER}.mcool"
            --chrom "$CHROM"
            --res "$RES"
            --bigwig_orient "$DATA/raw/${CELL_UPPER}_CTCF.bw"
            --out "$OUT"
        )
        python "$EXP/compute_compartments.py" "${compartment_args[@]}"
    else
        echo "[$CELL] Compartments already exist, skipping"
    fi
done

# ── Step 3: Train VGAE on GM12878 ────────────────────────────────────────────
# Training runs only when $RESULTS/model.pt is absent. train.py is pointed at
# $RESULTS via --outdir; presumably that is where it writes model.pt — verify
# against train.py.
model_ckpt="$RESULTS/model.pt"
if [ ! -f "$model_ckpt" ]; then
    train_args=(
        --graph "$DATA/processed/gm12878/${CHROM}.pt"
        --encoder deep_gcn
        --hidden "$HIDDEN"
        --latent "$LATENT"
        --epochs "$EPOCHS"
        --patience "$PATIENCE"
        # Fixed optimisation settings (not configurable from the environment).
        --lr 3e-4
        --dropout 0.3
        --beta 0.5
        --kl_anneal 100
        --seed "$SEED"
        --outdir "$RESULTS"
    )
    python "$EXP/train.py" "${train_args[@]}"
else
    echo "[GM12878] Model already exists, skipping training"
fi

# ── Step 4: Zero-shot encode IMR90 ───────────────────────────────────────────
# Runs encode.py with the saved model and the IMR90 graph, writing to
# $RESULTS/imr90_emb.npy. Unlike steps 1–3 there is no skip-if-exists guard, so
# this step executes on every invocation.
encode_args=(
    --model "$RESULTS/model.pt"
    --graph "$DATA/processed/imr90/${CHROM}.pt"
    --out "$RESULTS/imr90_emb.npy"
)
python "$EXP/encode.py" "${encode_args[@]}"

# ── Step 5: Visualise (skip with SKIP_FIGURES=1 — useful in genome-wide loops) ─
# Decide whether the figure steps (5 and 6) should run.
# Returns 0 (run) when SKIP_FIGURES is unset, empty, or an explicit "off" value
# (0, false, no); returns 1 (skip) for any other value. Treating only the empty
# string as "off" would make SKIP_FIGURES=0 skip the figures, which is the
# opposite of what that setting reads as.
figures_enabled() {
    case "${SKIP_FIGURES:-}" in
        '' | 0 | false | no) return 0 ;;
        *) return 1 ;;
    esac
}

if figures_enabled; then
    # NOTE(review): gm12878_emb.npy is not written by any command visible in
    # this script — presumably train.py produces it in $RESULTS; verify.
    python "$EXP/visualize.py" \
        --emb    "$RESULTS/gm12878_emb.npy" "$RESULTS/imr90_emb.npy" \
        --labels GM12878 IMR90 \
        --compartments \
            "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
            "$COMPARTMENTS_DIR/imr90_${CHROM}.csv" \
        --prefix "$RESULTS/figures/umap" \
        --seed "$SEED"

    # ── Step 6: Compare embeddings ────────────────────────────────────────────
    # Skipped together with step 5 when figures are disabled.
    python "$EXP/compare.py" \
        --emb1 "$RESULTS/gm12878_emb.npy" \
        --emb2 "$RESULTS/imr90_emb.npy" \
        --label1 GM12878 --label2 IMR90 \
        --prefix "$RESULTS/figures/${CHROM}"
else
    echo "[SKIP_FIGURES=1] Skipping visualization and comparison steps."
fi

# ── Summary ──────────────────────────────────────────────────────────────────
echo ""
echo "=== H1 complete ($CHROM). Results: $RESULTS ==="
# Showing the metrics is informational only. Under `set -e` an unguarded `cat`
# of a missing file would fail the whole run at its very last line, so warn on
# stderr instead when the file is absent.
metrics_file="$RESULTS/metrics.json"
if [ -f "$metrics_file" ]; then
    cat "$metrics_file"
else
    echo "Warning: $metrics_file not found; no metrics to display." >&2
fi