v1.0.0: VGAE applied to GM12878 vs IMR90 chr21 Hi-C at 25kb

Full reproducible pipeline: .mcool + ChIP-seq bigwigs → latent embeddings → A/B compartment calls → cross-cell comparison.

Key results (chr21, 25 kb, latent dim=32):
- Test AUC=0.777, AP=0.759 (converged epoch 31/300)
- GM12878 A/B silhouette (cosine) = 0.775
- IMR90 zero-shot silhouette = 0.443
- A-compartment bins stable across cell types (mean cosine Δ=0.042)
- B-compartment bins shift substantially (mean cosine Δ=0.451)
- 101 B→A and 70 A→B compartment switches GM12878→IMR90
2026-05-15 01:53:04 +02:00
parent 6c91af655d
commit acadbd780c
27 changed files with 6764 additions and 201 deletions
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+#
+# run_pipeline.sh  –  end-to-end pipeline for chromatin-gnn
+#
+# Prerequisites
+# -------------
+# 1. Create and activate the environment:
+#      conda create -n chromatin_gnn python=3.10 -y
+#      conda activate chromatin_gnn
+#      pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
+#      pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
+#                  "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
+#
+# 2. Download raw data into data/raw/ (see README.md § Dataset for URLs):
+#      GM12878.mcool   4DN accession 4DNFIRUMEC32
+#      IMR90.mcool     4DN accession 4DNFIABB3FHQ
+#      GM12878_CTCF.bw ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
+#      GM12878_H3K27me3.bw ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
+#      IMR90_CTCF.bw   ENCODE ENCFF770DUD (experiment ENCSR000EFI)
+#      IMR90_H3K27me3.bw ENCODE ENCFF158HZL (experiment ENCSR431UUY)
+#    The four .bw tracks are fetched by this script when they are missing;
+#    the two .mcool files must be placed in data/raw/ by hand.
+#
+# Usage
+# -----
+#   bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
+#                        [--patience 20] [--hidden 64] [--latent 32] [--seed 42]
+#
+# Each setting can also be supplied as an environment variable (CHROM, RES,
+# EPOCHS, PATIENCE, HIDDEN, LATENT, SEED). A command-line option takes
+# precedence over the environment; the values shown above are the defaults.
+
+set -euo pipefail
+
+# ========== Configuration ==========
+# Defaults, overridable from the environment.
+CHROM="${CHROM:-chr21}"
+RES="${RES:-25000}"
+EPOCHS="${EPOCHS:-300}"
+PATIENCE="${PATIENCE:-20}"
+HIDDEN="${HIDDEN:-64}"
+LATENT="${LATENT:-32}"
+SEED="${SEED:-42}"
+
+# Command-line options override the values above. Without this loop the flags
+# shown under "Usage" would be silently ignored and the defaults used instead.
+# "${2:?...}" aborts with a message when an option is given without a value.
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --chrom)    CHROM="${2:?--chrom requires a value}";       shift 2 ;;
+        --res)      RES="${2:?--res requires a value}";           shift 2 ;;
+        --epochs)   EPOCHS="${2:?--epochs requires a value}";     shift 2 ;;
+        --patience) PATIENCE="${2:?--patience requires a value}"; shift 2 ;;
+        --hidden)   HIDDEN="${2:?--hidden requires a value}";     shift 2 ;;
+        --latent)   LATENT="${2:?--latent requires a value}";     shift 2 ;;
+        --seed)     SEED="${2:?--seed requires a value}";         shift 2 ;;
+        -h|--help)
+            echo "Usage: bash run_pipeline.sh [--chrom CHROM] [--res RES] [--epochs N] [--patience N] [--hidden N] [--latent N] [--seed N]"
+            exit 0
+            ;;
+        *)
+            echo "ERROR: unknown argument: $1 (try --help)" >&2
+            exit 2
+            ;;
+    esac
+done
+
+# All paths are resolved relative to the directory containing this script, so
+# the pipeline can be launched from any working directory.
+REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPTS="$REPO/scripts"
+DATA="$REPO/data"
+RESULTS="$REPO/results"
+
+# ========== Directories ==========
+mkdir -p "$DATA/raw" "$DATA/processed" \
+         "$RESULTS/GM12878" "$RESULTS/IMR90" "$RESULTS/figures"
+
+# ========== Download ENCODE bigWig tracks ==========
+# Each entry is "<local file name>|<download URL>". A track is fetched only
+# when data/raw/<local file name> is missing or empty.
+echo "=== Downloading ENCODE bigWig tracks ==="
+for entry in \
+    "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
+    "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
+    "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
+    "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
+do
+    fname="${entry%%|*}"
+    url="${entry##*|}"
+    out="$DATA/raw/$fname"
+    # -s (non-empty) in addition to -f: a zero-byte file left behind by an
+    # aborted transfer is not a usable track and must be fetched again.
+    if [ -f "$out" ] && [ -s "$out" ]; then
+        echo "  $fname already present, skipping"
+    else
+        echo "  Downloading $fname ..."
+        # Download under a temporary name and rename only on success.
+        # `wget -O` creates its target before any data arrives, so writing
+        # straight to "$out" would leave a truncated file after a failed
+        # transfer, and the presence check above would accept it next run.
+        tmp="$out.part"
+        if ! wget -q --show-progress -O "$tmp" "$url"; then
+            rm -f -- "$tmp"
+            echo "ERROR: failed to download $fname from $url" >&2
+            exit 1
+        fi
+        mv -- "$tmp" "$out"
+    fi
+done
+
+# The Hi-C matrices are not fetched by this script: 4DN downloads require a
+# (free) account, so both files have to be placed in data/raw/ beforehand.
+#   GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
+#   IMR90:   https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
+# Abort with exit status 1 at the first file that is missing.
+for cell in GM12878 IMR90; do
+    mcool="$DATA/raw/${cell}.mcool"
+    [ -f "$mcool" ] && continue
+    echo "ERROR: $mcool not found. Download from 4DN (see README) and retry." >&2
+    exit 1
+done
+
+# ========== Step 1: Build contact graphs ==========
+# Writes data/processed/<cell>_<chrom>.pt for each cell line via
+# build_graph.py; a cell line whose graph file already exists is not rebuilt.
+# NOTE(review): the file name encodes CHROM but not RES, so a graph built at
+# one resolution is reused after RES changes - delete the .pt file to rebuild.
+printf '\n%s\n' "=== Step 1: Building chromatin contact graphs ==="
+for CELL in GM12878 IMR90; do
+    OUT="$DATA/processed/${CELL}_${CHROM}.pt"
+    if [ -f "$OUT" ]; then
+        echo "  ${CELL} graph already exists, skipping"
+        continue
+    fi
+    build_args=(
+        --mcool "$DATA/raw/${CELL}.mcool"
+        --chrom "$CHROM" --res "$RES"
+        --bigwigs "$DATA/raw/${CELL}_CTCF.bw" "$DATA/raw/${CELL}_H3K27me3.bw"
+        --out "$OUT"
+    )
+    python "$SCRIPTS/build_graph.py" "${build_args[@]}"
+done
+
+# ========== Step 2: Compute A/B compartments ==========
+# Writes results/<cell>/compartments_<chrom>.csv for each cell line via
+# compute_compartments.py; an existing CSV is left untouched. The cell line's
+# CTCF track is passed as --bigwig_orient (presumably used to orient the sign
+# of PC1 - confirm in compute_compartments.py).
+printf '\n%s\n' "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
+for CELL in GM12878 IMR90; do
+    OUT="$RESULTS/${CELL}/compartments_${CHROM}.csv"
+    if [ ! -f "$OUT" ]; then
+        compartment_args=(
+            --mcool "$DATA/raw/${CELL}.mcool"
+            --chrom "$CHROM" --res "$RES"
+            --bigwig_orient "$DATA/raw/${CELL}_CTCF.bw"
+            --out "$OUT"
+        )
+        python "$SCRIPTS/compute_compartments.py" "${compartment_args[@]}"
+    else
+        echo "  ${CELL} compartments already exist, skipping"
+    fi
+done
+
+# ========== Step 3: Train VGAE on GM12878 ==========
+# Runs train_vgae.py on the GM12878 graph unless results/GM12878/model.pt is
+# already on disk (presumably written by train_vgae.py into --outdir).
+# NOTE(review): model.pt is not keyed by CHROM or by the hyper-parameters, so
+# an existing checkpoint is reused after any of them change - remove it to
+# force retraining.
+printf '\n%s\n' "=== Step 3: Training VGAE on GM12878 ==="
+if [ ! -f "$RESULTS/GM12878/model.pt" ]; then
+    train_args=(
+        --graph "$DATA/processed/GM12878_${CHROM}.pt"
+        --epochs "$EPOCHS" --patience "$PATIENCE"
+        --hidden "$HIDDEN" --latent "$LATENT"
+        --seed "$SEED"
+        --outdir "$RESULTS/GM12878"
+    )
+    python "$SCRIPTS/train_vgae.py" "${train_args[@]}"
+else
+    echo "  Trained model already exists, skipping"
+fi
+
+# ========== Step 4: Encode IMR90 with GM12878 model ==========
+# Passes the IMR90 graph and the GM12878 checkpoint to encode_graph.py, which
+# is asked to write results/IMR90/emb.npy. Skipped when that file exists.
+# NOTE(review): the skip test looks only at emb.npy, so embeddings produced
+# with an older model.pt are kept after retraining - delete emb.npy to refresh.
+printf '\n%s\n' "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
+imr90_emb="$RESULTS/IMR90/emb.npy"
+if [ ! -f "$imr90_emb" ]; then
+    encode_args=(
+        --model "$RESULTS/GM12878/model.pt"
+        --graph "$DATA/processed/IMR90_${CHROM}.pt"
+        --out   "$imr90_emb"
+    )
+    python "$SCRIPTS/encode_graph.py" "${encode_args[@]}"
+else
+    echo "  IMR90 embeddings already exist, skipping"
+fi
+
+# ========== Step 5: Visualise embeddings ==========
+# Always runs (no skip check). Hands both embedding files, their labels and
+# the matching compartment CSVs to visualize_embeddings.py; outputs use the
+# prefix results/figures/umap.
+printf '\n%s\n' "=== Step 5: Generating UMAP visualisations ==="
+umap_args=(
+    --emb    "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy"
+    --labels GM12878 IMR90
+    --compartments
+        "$RESULTS/GM12878/compartments_${CHROM}.csv"
+        "$RESULTS/IMR90/compartments_${CHROM}.csv"
+    --prefix "$RESULTS/figures/umap"
+    --seed "$SEED"
+)
+python "$SCRIPTS/visualize_embeddings.py" "${umap_args[@]}"
+
+# ========== Step 6: Compare embeddings ==========
+# Always runs (no skip check). Passes the two embedding files to
+# compare_embeddings.py; outputs use the prefix results/figures/<chrom>.
+printf '\n%s\n' "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
+compare_args=(
+    --emb1 "$RESULTS/GM12878/emb.npy"
+    --emb2 "$RESULTS/IMR90/emb.npy"
+    --label1 GM12878 --label2 IMR90
+    --prefix "$RESULTS/figures/${CHROM}"
+)
+python "$SCRIPTS/compare_embeddings.py" "${compare_args[@]}"
+
+# ========== Summary ==========
+# Report where the outputs were written, then print the GM12878 metrics file
+# to stdout. Under `set -e` a missing metrics.json makes the script exit
+# non-zero here.
+printf '%s\n' \
+    "" \
+    "=== Pipeline complete ===" \
+    "Outputs:" \
+    "  Model + embeddings : $RESULTS/GM12878/" \
+    "  Figures            : $RESULTS/figures/" \
+    "  Metrics            : $RESULTS/GM12878/metrics.json" \
+    ""
+cat "$RESULTS/GM12878/metrics.json"