Files
chromatin-vgae-hic/experiments/h1_representation/run.sh

109 lines
4.3 KiB
Bash

#!/usr/bin/env bash
# H1: Representation learning — GM12878 VGAE + IMR90 zero-shot transfer
#
# Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]
#
# Settings are read from the environment (CHROM, RES, EPOCHS, PATIENCE, HIDDEN,
# LATENT, SEED), each with a built-in default. --chrom and --res override the
# corresponding environment variable for this run.
set -euo pipefail

# ── Command-line flags ───────────────────────────────────────────────────────
# Flags simply assign CHROM/RES before the defaults below are applied, so a
# flag wins over the environment and the environment wins over the default.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --chrom | --res)
      # Check explicitly: under `set -u` a bare "$2" would abort with a
      # cryptic "unbound variable" message instead of a usage error.
      if [[ $# -lt 2 ]]; then
        echo "error: $1 requires a value" >&2
        exit 2
      fi
      if [[ "$1" == "--chrom" ]]; then
        CHROM="$2"
      else
        RES="$2"
      fi
      shift 2
      ;;
    --chrom=*)
      CHROM="${1#*=}"
      shift
      ;;
    --res=*)
      RES="${1#*=}"
      shift
      ;;
    -h | --help)
      echo "Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]"
      exit 0
      ;;
    *)
      echo "error: unknown argument: $1" >&2
      echo "Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]" >&2
      exit 2
      ;;
  esac
done

# Repository root: two directories above the directory holding this script.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
export PYTHONPATH="$REPO:${PYTHONPATH:-}"

# Auto-activate conda env if its packages are not on the current Python
if ! python -c "import torch_geometric" 2>/dev/null; then
  echo "Activating conda env chromatin_gnn..."
  if ! command -v conda >/dev/null 2>&1; then
    echo "error: torch_geometric is not importable and conda was not found" >&2
    exit 1
  fi
  # conda's shell hook and per-package activate.d scripts can read unset
  # variables, which aborts the script under `set -u`; relax nounset for the
  # activation only and restore it immediately afterwards.
  set +u
  eval "$(conda shell.bash hook)"
  conda activate chromatin_gnn
  set -u
fi

# ── Run configuration (flag > environment > default) ─────────────────────────
CHROM="${CHROM:-chr1}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-50}"
HIDDEN="${HIDDEN:-256}"
LATENT="${LATENT:-64}"
SEED="${SEED:-42}"

# ── Paths ────────────────────────────────────────────────────────────────────
DATA="$REPO/data"
RESULTS_BASE="$REPO/results/h1_representation"
RESULTS="$RESULTS_BASE/${CHROM}"              # per-chromosome outputs
COMPARTMENTS_DIR="$RESULTS_BASE/compartments" # shared (filenames include $CHROM)
EXP="$REPO/experiments/h1_representation"
mkdir -p "$RESULTS/figures" "$COMPARTMENTS_DIR"
# ── Step 1: Build graphs ──────────────────────────────────────────────────────
# For each cell line, run chromatin_gnn.build_graph on its .mcool file and
# three bigWig tracks for $CHROM at $RES, unless the output file already exists.
for CELL in gm12878 imr90; do
  # Raw input files are named with the upper-case cell-line name.
  CELL_UPPER=$(echo "$CELL" | tr '[:lower:]' '[:upper:]')
  OUT="$DATA/processed/$CELL/${CHROM}.pt"
  if [ -f "$OUT" ]; then
    echo "[$CELL] Graph already exists, skipping"
  else
    # No other command in this script creates data/processed/<cell>; create it
    # here so the build cannot fail merely because the directory is missing.
    mkdir -p "$(dirname "$OUT")"
    python -m chromatin_gnn.build_graph \
      --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwigs \
      "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
      "$DATA/raw/${CELL_UPPER}_H3K27me3.bw" \
      "$DATA/raw/${CELL_UPPER}_H3K4me3.bw" \
      --out "$OUT"
  fi
done
# ── Step 2: Compute A/B compartments ─────────────────────────────────────────
# Writes one CSV per cell line into the shared compartments directory. A cell
# line whose CSV is already present is reported and left untouched.
for CELL in gm12878 imr90; do
  CELL_UPPER=$(tr '[:lower:]' '[:upper:]' <<<"$CELL")
  OUT="$COMPARTMENTS_DIR/${CELL}_${CHROM}.csv"

  if [[ -f "$OUT" ]]; then
    echo "[$CELL] Compartments already exist, skipping"
    continue
  fi

  # Same .mcool as step 1; the CTCF track is handed over as --bigwig_orient.
  compartment_args=(
    --mcool "$DATA/raw/${CELL_UPPER}.mcool"
    --chrom "$CHROM"
    --res "$RES"
    --bigwig_orient "$DATA/raw/${CELL_UPPER}_CTCF.bw"
    --out "$OUT"
  )
  python "$EXP/compute_compartments.py" "${compartment_args[@]}"
done
# ── Step 3: Train VGAE on GM12878 ────────────────────────────────────────────
# Training only runs when $RESULTS/model.pt is not on disk yet.
if [[ ! -f "$RESULTS/model.pt" ]]; then
  train_args=(
    --graph "$DATA/processed/gm12878/${CHROM}.pt"
    --encoder deep_gcn
    # Sizes and schedule come from the run configuration.
    --hidden "$HIDDEN"
    --latent "$LATENT"
    --epochs "$EPOCHS"
    --patience "$PATIENCE"
    # Fixed optimisation settings for this experiment.
    --lr 3e-4
    --dropout 0.3
    --beta 0.5
    --kl_anneal 100
    --seed "$SEED"
    --outdir "$RESULTS"
  )
  python "$EXP/train.py" "${train_args[@]}"
else
  echo "[GM12878] Model already exists, skipping training"
fi
# ── Step 4: Zero-shot encode IMR90 ───────────────────────────────────────────
# Runs on every invocation (no skip check): the GM12878 model from
# $RESULTS/model.pt is applied to the IMR90 graph and the embedding is saved.
encode_args=(
  --model "$RESULTS/model.pt"
  --graph "$DATA/processed/imr90/${CHROM}.pt"
  --out "$RESULTS/imr90_emb.npy"
)
python "$EXP/encode.py" "${encode_args[@]}"
# ── Step 5: Visualise (skip with SKIP_FIGURES=1 — useful in genome-wide loops) ─
# SKIP_FIGURES unset, empty, 0, false or no keeps steps 5 and 6 enabled; any
# other value skips both. (A plain emptiness test would also skip on
# SKIP_FIGURES=0, which reads as "do not skip".)
# NOTE(review): gm12878_emb.npy is not written by any command named in this
# script; presumably train.py saves it into --outdir — confirm.
case "${SKIP_FIGURES:-}" in
  "" | 0 | false | no)
    python "$EXP/visualize.py" \
      --emb "$RESULTS/gm12878_emb.npy" "$RESULTS/imr90_emb.npy" \
      --labels GM12878 IMR90 \
      --compartments \
      "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
      "$COMPARTMENTS_DIR/imr90_${CHROM}.csv" \
      --prefix "$RESULTS/figures/umap" \
      --seed "$SEED"
    # ── Step 6: Compare embeddings ────────────────────────────────────────────
    python "$EXP/compare.py" \
      --emb1 "$RESULTS/gm12878_emb.npy" \
      --emb2 "$RESULTS/imr90_emb.npy" \
      --label1 GM12878 --label2 IMR90 \
      --prefix "$RESULTS/figures/${CHROM}"
    ;;
  *)
    echo "[SKIP_FIGURES=1] Skipping visualization and comparison steps."
    ;;
esac
# ── Summary ──────────────────────────────────────────────────────────────────
echo ""
echo "=== H1 complete ($CHROM). Results: $RESULTS ==="
# The script runs under `set -e`, so a bare `cat` on a missing file would make
# the whole run exit non-zero after every step has already finished. Print the
# metrics when they exist and warn on stderr otherwise.
if [[ -f "$RESULTS/metrics.json" ]]; then
  cat "$RESULTS/metrics.json"
else
  echo "warning: $RESULTS/metrics.json not found; no metrics to display" >&2
fi