Files
chromatin-vgae-hic/experiments/h3_longrange/run.sh

135 lines
5.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# H3: Long-range topology — graph ablation experiment
#
# Tests whether long-range edges (> 1 Mb) encode non-trivial topological
# structure beyond local contact density.
#
# Compares 3 VGAE edge-band variants on the same GM12878 graph (chr1 by
# default; override with the CHROM environment variable):
#   full      — all edges up to 5 Mb (reused from H1; no retraining)
#   local     — only edges < 250 kb (within-TAD scale)
#   longrange — only edges > 1 Mb (sub-compartment scale)
# Each edge band is additionally trained with constant features (Step 2b)
# for the feature × edge cross-ablation.
#
# Usage:
#   bash experiments/h3_longrange/run.sh
set -euo pipefail

# Repository root: two directories above the directory holding this script.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Prepend the repo to PYTHONPATH. The ${VAR:+...} form only adds the ':'
# separator when PYTHONPATH is already non-empty; a trailing ':' would create
# an empty path entry, which Python interprets as the current directory.
export PYTHONPATH="$REPO${PYTHONPATH:+:$PYTHONPATH}"

# Auto-activate conda env if its packages are not on the current Python
if ! python -c "import torch_geometric" 2>/dev/null; then
  echo "Activating conda env chromatin_gnn..."
  if ! command -v conda >/dev/null 2>&1; then
    echo "ERROR: torch_geometric is not importable and 'conda' is not on PATH;" \
      "cannot activate env chromatin_gnn." >&2
    exit 1
  fi
  # conda's shell hook and per-package activate.d scripts may read variables
  # that are unset in a non-interactive shell, which aborts under nounset.
  # Relax -u only for the duration of the activation.
  set +u
  eval "$(conda shell.bash hook)"
  conda activate chromatin_gnn
  set -u
fi

# ── Configuration (each value can be overridden from the environment) ────────
CHROM="${CHROM:-chr1}"
RES="${RES:-25000}"
SEED="${SEED:-42}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-50}"
HIDDEN="${HIDDEN:-256}" # match H1 full-model hyperparameters
LATENT="${LATENT:-64}"
DEVICE="${DEVICE:-auto}"

# ── Paths ─────────────────────────────────────────────────────────────────────
DATA="$REPO/data"
RESULTS_BASE="$REPO/results/h3_longrange"
RESULTS="$RESULTS_BASE/${CHROM}"          # per-chromosome outputs
H1_BASE="$REPO/results/h1_representation"
H1_CHR_DIR="$H1_BASE/${CHROM}"            # per-chrom H1 outputs
COMPARTMENTS_DIR="$H1_BASE/compartments"  # shared, $CHROM in filename
EXP="$REPO/experiments/h3_longrange"
TRAIN="$REPO/experiments/h1_representation/train.py"

# The Step 3 evaluation commands are always run and are handed these H1
# artifacts. Warn up front (before any training) if they are not on disk yet.
# This is deliberately non-fatal: whether the evaluation scripts strictly
# require each file is not visible from this script.
for H1_INPUT in \
  "$H1_CHR_DIR/gm12878_emb.npy" \
  "$H1_CHR_DIR/metrics.json" \
  "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv"; do
  if [[ ! -f "$H1_INPUT" ]]; then
    echo "WARNING: H1 output not found: $H1_INPUT" \
      "(passed to the Step 3 evaluation; run H1 for $CHROM first)" >&2
  fi
done
unset H1_INPUT

mkdir -p "$RESULTS"
# ── Step 1: Build ablation graphs ─────────────────────────────────────────────
# Produces the two edge-band graphs from the per-chromosome base graph.
# The build is skipped only when BOTH output files are already on disk.
LOCAL_GRAPH="$DATA/processed/gm12878/${CHROM}_local.pt"
LONGRANGE_GRAPH="$DATA/processed/gm12878/${CHROM}_longrange.pt"

if [[ ! -f "$LOCAL_GRAPH" || ! -f "$LONGRANGE_GRAPH" ]]; then
  printf '%s\n' "=== Step 1: Building ablation graphs ==="
  # Cutoffs correspond to the 250 kb / 1 Mb bands named in the file header.
  build_args=(
    --graph "$DATA/processed/gm12878/${CHROM}.pt"
    --res "$RES"
    --short_cutoff 250000
    --long_cutoff 1000000
    --out_local "$LOCAL_GRAPH"
    --out_longrange "$LONGRANGE_GRAPH"
  )
  python "$EXP/build_ablation_graphs.py" "${build_args[@]}"
else
  printf '%s\n' "Ablation graphs already exist, skipping build."
fi
# ── Step 2: Train ablation models ─────────────────────────────────────────────
#######################################
# Train one model with the shared H3 hyperparameters, unless a checkpoint
# already exists. Keeping the invocation in one place guarantees that the
# real-feature (2a) and constant-feature (2b) runs cannot drift apart.
# Globals:   TRAIN, HIDDEN, LATENT, EPOCHS, PATIENCE, SEED, DEVICE (read)
# Arguments: $1   - message printed when $4/model.pt already exists
#            $2   - banner printed right before training starts
#            $3   - path of the input graph
#            $4   - output directory (training is skipped if it has model.pt)
#            $5.. - optional extra flags for the training script, placed
#                   before --outdir
# Outputs:   the skip message or the banner on stdout, followed by whatever
#            the training script prints
# Returns:   0 when skipped; otherwise the training script's exit status
#######################################
train_vgae() {
  local skip_msg=$1 start_msg=$2 graph=$3 outdir=$4
  shift 4

  if [[ -f "$outdir/model.pt" ]]; then
    echo "$skip_msg"
    return 0
  fi

  echo "$start_msg"
  python "$TRAIN" \
    --graph "$graph" \
    --encoder deep_gcn \
    --hidden "$HIDDEN" --latent "$LATENT" \
    --epochs "$EPOCHS" --patience "$PATIENCE" \
    --lr 3e-4 --dropout 0.3 --beta 0.5 --kl_anneal 100 \
    --seed "$SEED" \
    --device "$DEVICE" \
    "$@" \
    --outdir "$outdir"
}

# ── Step 2a: Train real-features variants (full is reused from H1) ───────────
for VARIANT in local longrange; do
  train_vgae \
    "[$VARIANT real-features] Already trained, skipping" \
    "=== Step 2a: Training $VARIANT-only (real features) ===" \
    "$DATA/processed/gm12878/${CHROM}_${VARIANT}.pt" \
    "$RESULTS/${VARIANT}_only"
done

# ── Step 2b: Constant-features cross-ablation ────────────────────────────────
# Trains the bottom row of the 2×3 (feature × edge) grid: constant ones features
# on the full graph and on each edge-band subset. With features removed, the
# encoder must rely on graph topology alone — so any compartment signal in the
# resulting embeddings reflects what each edge subset carries topologically.
for CELL in full local longrange; do
  # "full" uses the unfiltered base graph; the other cells use the Step 1
  # edge-band graphs.
  if [[ "$CELL" == "full" ]]; then
    GRAPH="$DATA/processed/gm12878/${CHROM}.pt"
  else
    GRAPH="$DATA/processed/gm12878/${CHROM}_${CELL}.pt"
  fi
  train_vgae \
    "[$CELL const-features] Already trained, skipping" \
    "=== Step 2b: Training $CELL (constant features) ===" \
    "$GRAPH" \
    "$RESULTS/${CELL}_const" \
    --constant_features
done
# ── Step 3a: Edge-only ablation (3-way comparison, real features) ────────────
# Hands the H1 "full" embedding and metrics, plus the two real-feature
# ablation run directories from Step 2a, to the evaluation script.
printf '%s\n' "=== Step 3a: Evaluating edge-only ablation ==="
ablation_args=(
  --full_emb "$H1_CHR_DIR/gm12878_emb.npy"
  --full_metrics "$H1_CHR_DIR/metrics.json"
  --local_dir "$RESULTS/local_only"
  --longrange_dir "$RESULTS/longrange_only"
  --compartments "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv"
  --out "$RESULTS/ablation_comparison.json"
)
python "$EXP/evaluate_ablation.py" "${ablation_args[@]}"
# ── Step 3b: Feature × edge cross-ablation (2×3 grid) ────────────────────────
# Real-feature row: H1 output dir (full) plus the Step 2a run dirs.
# Constant-feature row: the three Step 2b run dirs.
printf '%s\n' "=== Step 3b: Evaluating feature × edge cross-ablation ==="
cross_args=(
  --full_real_dir "$H1_CHR_DIR"
  --local_real_dir "$RESULTS/local_only"
  --longrange_real_dir "$RESULTS/longrange_only"
  --full_const_dir "$RESULTS/full_const"
  --local_const_dir "$RESULTS/local_const"
  --longrange_const_dir "$RESULTS/longrange_const"
  --compartments "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv"
  --out "$RESULTS/cross_ablation.json"
)
python "$EXP/evaluate_cross_ablation.py" "${cross_args[@]}"
# Final banner: blank separator line, then the chromosome and output directory.
printf '\n=== H3 complete (%s). Results: %s ===\n' "$CHROM" "$RESULTS"