v1.0.0: VGAE applied to GM12878 vs IMR90 chr21 Hi-C at 25kb

Full reproducible pipeline: .mcool + ChIP-seq bigwigs → latent
  embeddings → A/B compartment calls → cross-cell comparison.

  Key results (chr21, 25 kb, latent dim=32):
  - Test AUC=0.777, AP=0.759 (converged epoch 31/300)
  - GM12878 A/B silhouette (cosine) = 0.775
  - IMR90 zero-shot silhouette = 0.443
  - A-compartment bins stable across cell types (mean cosine Δ=0.042)
  - B-compartment bins shift substantially (mean cosine Δ=0.451)
  - 101 B→A and 70 A→B compartment switches GM12878→IMR90
This commit is contained in:
2026-05-15 01:53:04 +02:00
parent 6c91af655d
commit acadbd780c
27 changed files with 6764 additions and 201 deletions

162
run_pipeline.sh Normal file
View File

@@ -0,0 +1,162 @@
#!/usr/bin/env bash
#
# run_pipeline.sh — end-to-end pipeline for chromatin-gnn
#
# Prerequisites
# -------------
# 1. Create and activate the environment:
# conda create -n chromatin_gnn python=3.10 -y
# conda activate chromatin_gnn
# pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
# pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
# "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
#
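# 2. Download raw data into data/raw/ (see README.md § Dataset for URLs).
#    The .mcool files must be placed there manually; the bigWig tracks are
#    fetched by this script when they are missing: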
# GM12878.mcool 4DN accession 4DNFIRUMEC32
# IMR90.mcool 4DN accession 4DNFIABB3FHQ
# GM12878_CTCF.bw ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
# GM12878_H3K27me3.bw ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
# IMR90_CTCF.bw ENCODE ENCFF770DUD (experiment ENCSR000EFI)
# IMR90_H3K27me3.bw ENCODE ENCFF158HZL (experiment ENCSR431UUY)
#
# Usage
# -----
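# bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
#
# Settings are read from the environment variables CHROM, RES, EPOCHS,
# PATIENCE, HIDDEN, LATENT and SEED (defaults: chr21, 25000, 300, 20, 64,
# 32, 42), e.g.:
# CHROM=chr21 RES=25000 EPOCHS=300 bash run_pipeline.sh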
set -euo pipefail

# ========== Configuration ==========
# Each setting starts from the environment variable of the same name (falling
# back to the default shown) and may then be overridden by a command-line
# flag, so both `EPOCHS=50 bash run_pipeline.sh` and
# `bash run_pipeline.sh --epochs 50` work.
CHROM="${CHROM:-chr21}"     # forwarded to the Python scripts as --chrom
RES="${RES:-25000}"         # forwarded as --res
EPOCHS="${EPOCHS:-300}"     # forwarded to train_vgae.py as --epochs
PATIENCE="${PATIENCE:-20}"  # forwarded to train_vgae.py as --patience
HIDDEN="${HIDDEN:-64}"      # forwarded to train_vgae.py as --hidden
LATENT="${LATENT:-32}"      # forwarded to train_vgae.py as --latent
SEED="${SEED:-42}"          # forwarded as --seed

#######################################
# Print the supported command-line flags.
# Outputs: usage text on stdout
#######################################
usage() {
  cat <<'EOF'
Usage: bash run_pipeline.sh [--chrom CHROM] [--res RES] [--epochs N]
                            [--patience N] [--hidden N] [--latent N]
                            [--seed N]
Each flag overrides the environment variable of the same upper-case name.
EOF
}

#######################################
# Apply command-line overrides to the configuration variables.
# Globals:   CHROM, RES, EPOCHS, PATIENCE, HIDDEN, LATENT, SEED (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 0 after -h/--help; exits 2 on an unknown
#            argument or a flag that is missing its value
#######################################
parse_args() {
  local opt val
  while (($# > 0)); do
    opt="$1"
    case "$opt" in
      -h | --help)
        usage
        exit 0
        ;;
      --chrom | --res | --epochs | --patience | --hidden | --latent | --seed)
        if (($# < 2)); then
          echo "ERROR: $opt requires a value" >&2
          exit 2
        fi
        val="$2"
        shift 2
        ;;
      *)
        # Reject anything unrecognised instead of silently running with
        # defaults the caller did not ask for.
        echo "ERROR: unknown argument: $opt" >&2
        usage >&2
        exit 2
        ;;
    esac
    case "$opt" in
      --chrom) CHROM="$val" ;;
      --res) RES="$val" ;;
      --epochs) EPOCHS="$val" ;;
      --patience) PATIENCE="$val" ;;
      --hidden) HIDDEN="$val" ;;
      --latent) LATENT="$val" ;;
      --seed) SEED="$val" ;;
    esac
  done
}

parse_args "$@"

# Absolute directory containing this script; every other path hangs off it,
# so the pipeline can be launched from any working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"
# ========== Directories ==========
# Make sure the raw/processed data folders and every results folder exist
# before any step writes into them (-p: no error if one is already there).
output_dirs=(
  "$DATA/raw"
  "$DATA/processed"
  "$RESULTS/GM12878"
  "$RESULTS/IMR90"
  "$RESULTS/figures"
)
mkdir -p "${output_dirs[@]}"
# ========== Download ENCODE bigWig tracks ==========
# Fetch each ENCODE bigWig into $DATA/raw unless a non-empty copy is already
# there. Each list entry is "<local file name>|<download URL>".
#
# wget -O creates its output file before any data arrives, so a failed or
# interrupted transfer written straight to the final path would leave a
# truncated file that the next run would mistake for a finished download.
# Every file is therefore written to "<name>.part" and renamed into place only
# after wget reports success. The presence test uses -s (exists and non-empty)
# so that zero-byte leftovers from an earlier failed run are fetched again.
echo "=== Downloading ENCODE bigWig tracks ==="
for entry in \
  "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
  "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
  "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
  "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
do
  fname="${entry%%|*}" # text before the first '|'
  url="${entry##*|}"   # text after the last '|'
  out="$DATA/raw/$fname"
  part="$out.part"
  if [[ -s "$out" ]]; then
    echo " $fname already present, skipping"
    continue
  fi
  echo " Downloading $fname ..."
  if ! wget -q --show-progress -O "$part" "$url"; then
    # Do not leave the partial file behind, and stop the pipeline here.
    rm -f -- "$part"
    echo "ERROR: download of $fname failed ($url)" >&2
    exit 1
  fi
  mv -f -- "$part" "$out"
done
# The Hi-C .mcool files are not fetched by this script; they must be
# downloaded manually from 4DN (requires free account):
# GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
# IMR90: https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
# Stop at the first one that is missing from $DATA/raw.
for mcool_name in GM12878.mcool IMR90.mcool; do
  [[ -f "$DATA/raw/$mcool_name" ]] && continue
  echo "ERROR: $DATA/raw/$mcool_name not found. Download from 4DN (see README) and retry." >&2
  exit 1
done
# ========== Step 1: Build contact graphs ==========
# For each cell line, run build_graph.py on its .mcool file and its two bigWig
# tracks (CTCF, H3K27me3), writing $DATA/processed/<cell>_<chrom>.pt.
# An existing output file is reused as-is; delete it to force a rebuild.
# NOTE(review): the cached file name carries the chromosome but not $RES, so
# changing RES alone reuses the previously built graph.
printf '\n%s\n' "=== Step 1: Building chromatin contact graphs ==="
for cell in GM12878 IMR90; do
  graph_pt="$DATA/processed/${cell}_${CHROM}.pt"
  if [[ -f "$graph_pt" ]]; then
    echo " ${cell} graph already exists, skipping"
    continue
  fi
  build_args=(
    --mcool "$DATA/raw/${cell}.mcool"
    --chrom "$CHROM" --res "$RES"
    --bigwigs "$DATA/raw/${cell}_CTCF.bw" "$DATA/raw/${cell}_H3K27me3.bw"
    --out "$graph_pt"
  )
  python "$SCRIPTS/build_graph.py" "${build_args[@]}"
done
# ========== Step 2: Compute A/B compartments ==========
# For each cell line, run compute_compartments.py on its .mcool file, writing
# $RESULTS/<cell>/compartments_<chrom>.csv. The cell line's CTCF bigWig is
# passed as --bigwig_orient (presumably to fix the sign of PC1 — confirm in
# compute_compartments.py). An existing CSV is reused as-is.
printf '\n%s\n' "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for cell in GM12878 IMR90; do
  compartments_csv="$RESULTS/${cell}/compartments_${CHROM}.csv"
  if [[ -f "$compartments_csv" ]]; then
    echo " ${cell} compartments already exist, skipping"
    continue
  fi
  compartment_args=(
    --mcool "$DATA/raw/${cell}.mcool"
    --chrom "$CHROM" --res "$RES"
    --bigwig_orient "$DATA/raw/${cell}_CTCF.bw"
    --out "$compartments_csv"
  )
  python "$SCRIPTS/compute_compartments.py" "${compartment_args[@]}"
done
# ========== Step 3: Train VGAE on GM12878 ==========
# Run train_vgae.py on the GM12878 graph with the configured hyper-parameters,
# writing into $RESULTS/GM12878. Training is skipped when model.pt is already
# there.
# NOTE(review): the skip test looks only at model.pt, which is not named after
# the chromosome; a model trained with other settings would be reused.
printf '\n%s\n' "=== Step 3: Training VGAE on GM12878 ==="
trained_model="$RESULTS/GM12878/model.pt"
if [[ ! -f "$trained_model" ]]; then
  train_args=(
    --graph "$DATA/processed/GM12878_${CHROM}.pt"
    --epochs "$EPOCHS" --patience "$PATIENCE"
    --hidden "$HIDDEN" --latent "$LATENT"
    --seed "$SEED"
    --outdir "$RESULTS/GM12878"
  )
  python "$SCRIPTS/train_vgae.py" "${train_args[@]}"
else
  echo " Trained model already exists, skipping"
fi
# ========== Step 4: Encode IMR90 with GM12878 model ==========
# Run encode_graph.py with the GM12878 model on the IMR90 graph, writing
# $RESULTS/IMR90/emb.npy. Skipped when that file already exists.
printf '\n%s\n' "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
imr90_emb="$RESULTS/IMR90/emb.npy"
if [[ ! -f "$imr90_emb" ]]; then
  encode_args=(
    --model "$RESULTS/GM12878/model.pt"
    --graph "$DATA/processed/IMR90_${CHROM}.pt"
    --out "$imr90_emb"
  )
  python "$SCRIPTS/encode_graph.py" "${encode_args[@]}"
else
  echo " IMR90 embeddings already exist, skipping"
fi
# ========== Step 5: Visualise embeddings ==========
# Run visualize_embeddings.py on both cell lines' embeddings and compartment
# CSVs; output names start with $RESULTS/figures/umap. Always runs (no skip).
# The GM12878 emb.npy is presumably written by train_vgae.py in step 3 —
# confirm, since no step here names that file as an output.
printf '\n%s\n' "=== Step 5: Generating UMAP visualisations ==="
cell_lines=(GM12878 IMR90)
emb_files=()
compartment_files=()
# Build the per-cell file lists in the same order as the labels.
for cell in "${cell_lines[@]}"; do
  emb_files+=("$RESULTS/$cell/emb.npy")
  compartment_files+=("$RESULTS/$cell/compartments_${CHROM}.csv")
done
python "$SCRIPTS/visualize_embeddings.py" \
  --emb "${emb_files[@]}" \
  --labels "${cell_lines[@]}" \
  --compartments "${compartment_files[@]}" \
  --prefix "$RESULTS/figures/umap" \
  --seed "$SEED"
# ========== Step 6: Compare embeddings ==========
# Run compare_embeddings.py on the GM12878 and IMR90 embeddings; output names
# start with $RESULTS/figures/<chrom>. Always runs (no skip).
printf '\n%s\n' "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
compare_args=(
  --emb1 "$RESULTS/GM12878/emb.npy"
  --emb2 "$RESULTS/IMR90/emb.npy"
  --label1 GM12878 --label2 IMR90
  --prefix "$RESULTS/figures/${CHROM}"
)
python "$SCRIPTS/compare_embeddings.py" "${compare_args[@]}"
# ========== Summary ==========
#######################################
# Print where the pipeline outputs live, followed by the training metrics.
# Globals:   RESULTS (read)
# Outputs:   summary and the contents of metrics.json on stdout; a warning
#            on stderr when metrics.json does not exist
# Returns:   0 when metrics.json is missing (reported, not treated as a
#            failure); otherwise the exit status of cat
#######################################
print_summary() {
  local metrics="$RESULTS/GM12878/metrics.json"
  echo ""
  echo "=== Pipeline complete ==="
  echo "Outputs:"
  echo " Model + embeddings : $RESULTS/GM12878/"
  echo " Figures : $RESULTS/figures/"
  echo " Metrics : $metrics"
  echo ""
  # Showing the metrics is informational only: a missing file must not turn
  # an otherwise finished run into a non-zero exit under `set -e`.
  if [[ -f "$metrics" ]]; then
    cat -- "$metrics"
  else
    echo "WARNING: $metrics not found; no metrics to display." >&2
  fi
}

print_summary