#!/usr/bin/env bash
# H3: Long-range topology — graph ablation experiment
#
# Tests whether long-range edges (> 1 Mb) encode non-trivial topological
# structure beyond local contact density.
#
# Compares 3 VGAE edge variants on the same GM12878 graph (chr1 by default):
#   full      — all edges up to 5 Mb  (reused from H1; no retraining)
#   local     — only edges < 250 kb   (within-TAD scale)
#   longrange — only edges > 1 Mb     (sub-compartment scale)
# In addition, full / local / longrange are each trained once more with
# constant node features for the feature × edge cross-ablation.
#
# Usage:
#   bash experiments/h3_longrange/run.sh
#
# Environment overrides (defaults in parentheses):
#   CHROM (chr1)  RES (25000)  SEED (42)  EPOCHS (300)  PATIENCE (50)
#   HIDDEN (256)  LATENT (64)  DEVICE (auto)
#
# Files read by this script (paths relative to the repository root):
#   data/processed/gm12878/<CHROM>.pt
#   results/h1_representation/<CHROM>/gm12878_emb.npy
#   results/h1_representation/<CHROM>/metrics.json
#   results/h1_representation/compartments/gm12878_<CHROM>.csv

# Strict mode: stop on the first failing command, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Repository root = two directories above the directory holding this script.
# CDPATH is emptied for this one `cd` so a CDPATH exported by the user can
# neither redirect the relative lookup nor make `cd` echo the directory into
# the captured output; `--` protects against a path that starts with "-".
REPO="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"

# Prepend the repository root to PYTHONPATH, keeping whatever was already set
# (":-" keeps this safe under `set -u` when PYTHONPATH is undefined).
export PYTHONPATH="$REPO:${PYTHONPATH:-}"

# Auto-activate the conda env if its packages are not on the current Python.
# The probe is "can the current `python` import torch_geometric?"; its stderr
# is discarded on purpose because a failed import is an expected outcome here.
# CONDA_ENV selects the environment to activate (default: chromatin_gnn).
CONDA_ENV="${CONDA_ENV:-chromatin_gnn}"
if ! python -c "import torch_geometric" 2>/dev/null; then
  echo "Activating conda env ${CONDA_ENV}..."

  # Without conda there is nothing to activate; say so explicitly instead of
  # failing later with a bare "command not found".
  if ! command -v conda >/dev/null 2>&1; then
    echo "ERROR: torch_geometric is not importable and 'conda' is not on PATH;" \
      "cannot activate env '${CONDA_ENV}'." >&2
    exit 1
  fi

  # conda's shell hook and environment activation scripts may expand unset
  # variables, which aborts the script under `set -u`. Suspend nounset just
  # for the activation and restore it only if it was on beforehand.
  restore_nounset=0
  if [[ $- == *u* ]]; then
    restore_nounset=1
    set +u
  fi
  eval "$(conda shell.bash hook)"
  conda activate "$CONDA_ENV"
  if (( restore_nounset )); then
    set -u
  fi
fi

# Run parameters. Each one keeps a value already present in the environment
# and otherwise falls back to the default shown.
CHROM="${CHROM:-chr1}"        # selects the graph files and the results subdirectory
RES="${RES:-25000}"           # forwarded as --res to build_ablation_graphs.py
SEED="${SEED:-42}"            # forwarded as --seed to every training run
EPOCHS="${EPOCHS:-300}"       # forwarded as --epochs
PATIENCE="${PATIENCE:-50}"    # forwarded as --patience
HIDDEN="${HIDDEN:-256}"       # match H1 full-model hyperparameters
LATENT="${LATENT:-64}"        # forwarded as --latent
DEVICE="${DEVICE:-auto}"      # forwarded as --device

# Filesystem layout, all rooted at the repository.
DATA="$REPO/data"
RESULTS_BASE="$REPO/results/h3_longrange"
RESULTS="$RESULTS_BASE/${CHROM}"            # per-chromosome outputs
H1_BASE="$REPO/results/h1_representation"
H1_CHR_DIR="$H1_BASE/${CHROM}"              # per-chrom H1 outputs
COMPARTMENTS_DIR="$H1_BASE/compartments"    # shared, $CHROM in filename
EXP="$REPO/experiments/h3_longrange"
TRAIN="$REPO/experiments/h1_representation/train.py"

# Create the per-chromosome results directory (and parents) if missing.
mkdir -p "$RESULTS"

# ── Step 1: Build ablation graphs ─────────────────────────────────────────────
# Produces the two edge-filtered graphs from the full per-chromosome graph.
# The build is skipped only when BOTH outputs already exist; if either one is
# missing, both are rebuilt by a single builder invocation.
LOCAL_GRAPH="$DATA/processed/gm12878/${CHROM}_local.pt"
LONGRANGE_GRAPH="$DATA/processed/gm12878/${CHROM}_longrange.pt"

if [[ -f "$LOCAL_GRAPH" && -f "$LONGRANGE_GRAPH" ]]; then
  echo "Ablation graphs already exist, skipping build."
else
  echo "=== Step 1: Building ablation graphs ==="

  # Fail fast with a readable message when the source graph is absent.
  # NOTE(review): assumes the builder requires this file to exist — confirm.
  full_graph="$DATA/processed/gm12878/${CHROM}.pt"
  if [[ ! -f "$full_graph" ]]; then
    echo "ERROR: full graph not found: $full_graph" >&2
    exit 1
  fi

  # Cutoffs mirror the header: local = < 250 kb, longrange = > 1 Mb.
  python "$EXP/build_ablation_graphs.py" \
    --graph "$full_graph" \
    --res "$RES" \
    --short_cutoff 250000 \
    --long_cutoff 1000000 \
    --out_local "$LOCAL_GRAPH" \
    --out_longrange "$LONGRANGE_GRAPH"
fi

# ── Step 2a: Train real-features variants (full is reused from H1) ───────────
# One training run per edge subset. A variant is considered done, and is
# skipped, when its output directory already contains model.pt.
for VARIANT in local longrange; do
  OUT="$RESULTS/${VARIANT}_only"
  if [[ -f "$OUT/model.pt" ]]; then
    echo "[$VARIANT real-features] Already trained, skipping"
  else
    echo "=== Step 2a: Training $VARIANT-only (real features) ==="
    python "$TRAIN" \
      --graph "$DATA/processed/gm12878/${CHROM}_${VARIANT}.pt" \
      --encoder deep_gcn \
      --hidden "$HIDDEN" --latent "$LATENT" \
      --epochs "$EPOCHS" --patience "$PATIENCE" \
      --lr 3e-4 --dropout 0.3 --beta 0.5 --kl_anneal 100 \
      --seed "$SEED" \
      --device "$DEVICE" \
      --outdir "$OUT"
  fi
done

# ── Step 2b: Constant-features cross-ablation ────────────────────────────────
# Trains the bottom row of the 2×3 (feature × edge) grid: constant ones features
# on the full graph and on each edge-band subset. With features removed, the
# encoder must rely on graph topology alone — so any compartment signal in the
# resulting embeddings reflects what each edge subset carries topologically.
#
# Same training flags as Step 2a plus --constant_features. A cell is skipped
# when its output directory already contains model.pt.
for CELL in full local longrange; do
  OUT="$RESULTS/${CELL}_const"

  # "full" uses the unfiltered graph; the other cells use their Step 1 subset.
  if [[ "$CELL" == "full" ]]; then
    GRAPH="$DATA/processed/gm12878/${CHROM}.pt"
  else
    GRAPH="$DATA/processed/gm12878/${CHROM}_${CELL}.pt"
  fi

  if [[ -f "$OUT/model.pt" ]]; then
    echo "[$CELL const-features] Already trained, skipping"
  else
    echo "=== Step 2b: Training $CELL (constant features) ==="
    python "$TRAIN" \
      --graph "$GRAPH" \
      --encoder deep_gcn \
      --hidden "$HIDDEN" --latent "$LATENT" \
      --epochs "$EPOCHS" --patience "$PATIENCE" \
      --lr 3e-4 --dropout 0.3 --beta 0.5 --kl_anneal 100 \
      --seed "$SEED" \
      --device "$DEVICE" \
      --constant_features \
      --outdir "$OUT"
  fi
done

# ── Step 3a: Edge-only ablation (3-way comparison, real features) ────────────
# The "full" inputs come from the H1 results directory; local and longrange
# come from the Step 2a output directories. Always runs (no skip check) and
# writes ablation_comparison.json into the per-chromosome results directory.
echo "=== Step 3a: Evaluating edge-only ablation ==="
python "$EXP/evaluate_ablation.py" \
  --full_emb "$H1_CHR_DIR/gm12878_emb.npy" \
  --full_metrics "$H1_CHR_DIR/metrics.json" \
  --local_dir "$RESULTS/local_only" \
  --longrange_dir "$RESULTS/longrange_only" \
  --compartments "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
  --out "$RESULTS/ablation_comparison.json"

# ── Step 3b: Feature × edge cross-ablation (2×3 grid) ────────────────────────
# Real-feature row: H1 directory (full) plus the Step 2a directories.
# Constant-feature row: the three Step 2b directories.
# Always runs (no skip check) and writes cross_ablation.json into the
# per-chromosome results directory.
echo "=== Step 3b: Evaluating feature × edge cross-ablation ==="
python "$EXP/evaluate_cross_ablation.py" \
  --full_real_dir "$H1_CHR_DIR" \
  --local_real_dir "$RESULTS/local_only" \
  --longrange_real_dir "$RESULTS/longrange_only" \
  --full_const_dir "$RESULTS/full_const" \
  --local_const_dir "$RESULTS/local_const" \
  --longrange_const_dir "$RESULTS/longrange_const" \
  --compartments "$COMPARTMENTS_DIR/gm12878_${CHROM}.csv" \
  --out "$RESULTS/cross_ablation.json"

# Final banner: blank separator line, then the chromosome processed and the
# directory holding its results.
echo ""
echo "=== H3 complete ($CHROM). Results: $RESULTS ==="