109 lines
4.3 KiB
Bash
109 lines
4.3 KiB
Bash
#!/usr/bin/env bash
# H1: Representation learning — GM12878 VGAE + IMR90 zero-shot transfer
#
# Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]
#
# Settings are read from the environment; --chrom / --res given on the command
# line take precedence over the CHROM / RES environment variables.
#   CHROM     chromosome to process                 (default: chr1)
#   RES       value passed as --res to the builders (default: 25000)
#   EPOCHS    passed to train.py --epochs           (default: 300)
#   PATIENCE  passed to train.py --patience         (default: 50)
#   HIDDEN    passed to train.py --hidden           (default: 256)
#   LATENT    passed to train.py --latent           (default: 64)
#   SEED      passed as --seed                      (default: 42)
#   SKIP_FIGURES  any non-empty value skips the figure/comparison steps
set -euo pipefail

usage() {
  echo "Usage: bash experiments/h1_representation/run.sh [--chrom chr1] [--res 25000]"
}

# Parse the flags promised by the usage line. Values land in CHROM / RES so the
# defaulting below treats them exactly like environment-supplied settings.
while (( $# > 0 )); do
  case "$1" in
    --chrom)
      CHROM="${2:?--chrom requires a value}"
      shift 2
      ;;
    --chrom=*)
      CHROM="${1#*=}"
      shift
      ;;
    --res)
      RES="${2:?--res requires a value}"
      shift 2
      ;;
    --res=*)
      RES="${1#*=}"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "error: unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Repository root is two directories above this script.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
export PYTHONPATH="$REPO:${PYTHONPATH:-}"

# Auto-activate conda env if its packages are not on the current Python
if ! python -c "import torch_geometric" 2>/dev/null; then
  echo "Activating conda env chromatin_gnn..."
  if ! command -v conda >/dev/null 2>&1; then
    echo "error: torch_geometric is not importable and conda is not on PATH" >&2
    exit 1
  fi
  # conda's shell hook and environment activation scripts may expand unset
  # variables, which would abort the script under `set -u`; relax it briefly.
  set +u
  eval "$(conda shell.bash hook)"
  conda activate chromatin_gnn
  set -u
fi

CHROM="${CHROM:-chr1}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-50}"
HIDDEN="${HIDDEN:-256}"
LATENT="${LATENT:-64}"
SEED="${SEED:-42}"

DATA="$REPO/data"
RESULTS_BASE="$REPO/results/h1_representation"
RESULTS="$RESULTS_BASE/${CHROM}"               # per-chromosome outputs
COMPARTMENTS_DIR="$RESULTS_BASE/compartments"  # shared (filenames include $CHROM)
EXP="$REPO/experiments/h1_representation"

mkdir -p "$RESULTS/figures" "$COMPARTMENTS_DIR"
|
|
|
|
# ── Step 1: Build graphs ──────────────────────────────────────────────────────
# For each cell line, run chromatin_gnn.build_graph on its .mcool file and three
# bigWig tracks. An output file that already exists is reused, not rebuilt.
# NOTE(review): the cached path depends on $CELL and $CHROM only, so a graph
# built with a different $RES would be reused as-is — confirm this is intended.
for CELL in gm12878 imr90; do
  # Raw input files are named with the upper-cased cell line.
  CELL_UPPER="$(tr '[:lower:]' '[:upper:]' <<<"$CELL")"
  OUT="$DATA/processed/$CELL/${CHROM}.pt"

  if [[ -f "$OUT" ]]; then
    echo "[$CELL] Graph already exists, skipping"
    continue
  fi

  raw_prefix="$DATA/raw/${CELL_UPPER}"
  python -m chromatin_gnn.build_graph \
    --mcool "${raw_prefix}.mcool" \
    --chrom "$CHROM" --res "$RES" \
    --bigwigs \
      "${raw_prefix}_CTCF.bw" \
      "${raw_prefix}_H3K27me3.bw" \
      "${raw_prefix}_H3K4me3.bw" \
    --out "$OUT"
done
|
|
|
|
# ── Step 2: Compute A/B compartments ─────────────────────────────────────────
# Writes one CSV per cell line into the shared compartments directory; the
# filename carries both the cell line and the chromosome. Existing CSVs are kept.
# NOTE(review): the filename does not include $RES, so a CSV computed with a
# different $RES would be reused — confirm this is intended.
for CELL in gm12878 imr90; do
  CELL_UPPER="$(tr '[:lower:]' '[:upper:]' <<<"$CELL")"
  OUT="$COMPARTMENTS_DIR/${CELL}_${CHROM}.csv"

  if [[ -f "$OUT" ]]; then
    echo "[$CELL] Compartments already exist, skipping"
    continue
  fi

  # The CTCF bigWig is handed to the script through its --bigwig_orient option.
  python "$EXP/compute_compartments.py" \
    --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
    --chrom "$CHROM" --res "$RES" \
    --bigwig_orient "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
    --out "$OUT"
done
|
|
|
|
# ── Step 3: Train VGAE on GM12878 ────────────────────────────────────────────
# Training runs only when $RESULTS/model.pt is absent. Sizes, epoch budget,
# patience and seed come from the variables set at the top of the script; the
# encoder name and the --lr/--dropout/--beta/--kl_anneal values are fixed here.
if [[ ! -f "$RESULTS/model.pt" ]]; then
  train_args=(
    --graph "$DATA/processed/gm12878/${CHROM}.pt"
    --encoder deep_gcn
    --hidden "$HIDDEN" --latent "$LATENT"
    --epochs "$EPOCHS" --patience "$PATIENCE"
    --lr 3e-4 --dropout 0.3 --beta 0.5 --kl_anneal 100
    --seed "$SEED"
    --outdir "$RESULTS"
  )
  python "$EXP/train.py" "${train_args[@]}"
else
  echo "[GM12878] Model already exists, skipping training"
fi
|
|
|
|
# ── Step 4: Zero-shot encode IMR90 ───────────────────────────────────────────
# Feeds the IMR90 graph through the model stored in $RESULTS and saves the
# result as imr90_emb.npy. Unlike Steps 1–3 this step has no "already exists"
# check, so it runs on every invocation.
encode_args=(
  --model "$RESULTS/model.pt"
  --graph "$DATA/processed/imr90/${CHROM}.pt"
  --out "$RESULTS/imr90_emb.npy"
)
python "$EXP/encode.py" "${encode_args[@]}"
|
|
|
|
# ── Steps 5–6: Visualise and compare embeddings ──────────────────────────────
# Both steps are skipped when SKIP_FIGURES is set to any non-empty value
# (useful in genome-wide loops).
# NOTE(review): gm12878_emb.npy is not written by any command in this script;
# presumably train.py emits it into --outdir — verify.
if [[ -n "${SKIP_FIGURES:-}" ]]; then
  echo "[SKIP_FIGURES=1] Skipping visualization and comparison steps."
else
  gm_emb="$RESULTS/gm12878_emb.npy"
  imr_emb="$RESULTS/imr90_emb.npy"

  # Step 5: visualize.py receives both embeddings plus the per-cell compartment
  # CSVs; its outputs use the prefix $RESULTS/figures/umap.
  python "$EXP/visualize.py" \
    --emb "$gm_emb" "$imr_emb" \
    --labels GM12878 IMR90 \
    --compartments \
      "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
      "$COMPARTMENTS_DIR/imr90_${CHROM}.csv" \
    --prefix "$RESULTS/figures/umap" \
    --seed "$SEED"

  # Step 6: compare.py receives the same two embeddings; its outputs use the
  # prefix $RESULTS/figures/$CHROM.
  python "$EXP/compare.py" \
    --emb1 "$gm_emb" \
    --emb2 "$imr_emb" \
    --label1 GM12878 --label2 IMR90 \
    --prefix "$RESULTS/figures/${CHROM}"
fi
|
|
|
|
# ── Summary ──────────────────────────────────────────────────────────────────
# Print the completion banner, then the metrics file if there is one.
# metrics.json is not written by this script itself (presumably by one of the
# Python steps — verify). If it is missing, warn on stderr instead of letting
# `cat` fail and abort the run with a non-zero status after the pipeline has
# already finished.
echo ""
echo "=== H1 complete ($CHROM). Results: $RESULTS ==="
if [[ -f "$RESULTS/metrics.json" ]]; then
  cat "$RESULTS/metrics.json"
else
  echo "warning: $RESULTS/metrics.json not found; no metrics to display" >&2
fi
|