Files
chromatin-vgae-hic/run_pipeline.sh
aman acadbd780c v1.0.0: VGAE applied to GM12878 vs IMR90 chr21 Hi-C at 25kb
Full reproducible pipeline: .mcool + ChIP-seq bigwigs → latent
  embeddings → A/B compartment calls → cross-cell comparison.

  Key results (chr21, 25 kb, latent dim=32):
  - Test AUC=0.777, AP=0.759 (converged epoch 31/300)
  - GM12878 A/B silhouette (cosine) = 0.775
  - IMR90 zero-shot silhouette = 0.443
  - A-compartment bins stable across cell types (mean cosine Δ=0.042)
  - B-compartment bins shift substantially (mean cosine Δ=0.451)
  - 101 B→A and 70 A→B compartment switches GM12878→IMR90
2026-05-15 01:53:04 +02:00

163 lines
5.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
#
# run_pipeline.sh: end-to-end pipeline for chromatin-gnn
#
# Prerequisites
# -------------
# 1. Create and activate the environment:
# conda create -n chromatin_gnn python=3.10 -y
# conda activate chromatin_gnn
# pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
# pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
# "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
#
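# 2. Provide the raw data in data/raw/ (see README.md § Dataset for URLs).
# The two .mcool files must be downloaded manually; the four .bw tracks are
# fetched by this script when they are missing: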
# GM12878.mcool 4DN accession 4DNFIRUMEC32
# IMR90.mcool 4DN accession 4DNFIABB3FHQ
# GM12878_CTCF.bw ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
# GM12878_H3K27me3.bw ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
# IMR90_CTCF.bw ENCODE ENCFF770DUD (experiment ENCSR000EFI)
# IMR90_H3K27me3.bw ENCODE ENCFF158HZL (experiment ENCSR431UUY)
#
# Usage
# -----
# bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
set -euo pipefail
# ========== Configuration ==========
# Each setting is taken, in order of precedence, from a command-line flag,
# an environment variable of the same name, or the default shown here.
CHROM="${CHROM:-chr21}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-20}"
HIDDEN="${HIDDEN:-64}"
LATENT="${LATENT:-32}"
SEED="${SEED:-42}"

#######################################
# Print the command-line synopsis to stdout.
#######################################
print_usage() {
  printf '%s\n' \
    "Usage: bash run_pipeline.sh [--chrom CHROM] [--res RES] [--epochs N]" \
    "                            [--patience N] [--hidden N] [--latent N] [--seed N]"
}

#######################################
# Apply command-line overrides to the configuration variables.
# Both "--name value" and "--name=value" are accepted.
# Globals:   CHROM RES EPOCHS PATIENCE HIDDEN LATENT SEED (written)
# Arguments: the script's positional parameters
# Exits:     0 after printing usage for -h/--help; 2 on an unknown option,
#            a stray positional argument, or a flag given without a value.
#######################################
parse_cli_args() {
  local flag value
  while [ "$#" -gt 0 ]; do
    flag="$1"
    case "$flag" in
      -h | --help)
        print_usage
        exit 0
        ;;
      --*=*)
        # "--name=value": split on the first '='.
        value="${flag#*=}"
        flag="${flag%%=*}"
        shift
        ;;
      --*)
        if [ "$#" -lt 2 ]; then
          echo "ERROR: option $flag requires a value" >&2
          print_usage >&2
          exit 2
        fi
        value="$2"
        shift 2
        ;;
      *)
        echo "ERROR: unexpected argument: $flag" >&2
        print_usage >&2
        exit 2
        ;;
    esac
    case "$flag" in
      --chrom) CHROM="$value" ;;
      --res) RES="$value" ;;
      --epochs) EPOCHS="$value" ;;
      --patience) PATIENCE="$value" ;;
      --hidden) HIDDEN="$value" ;;
      --latent) LATENT="$value" ;;
      --seed) SEED="$value" ;;
      *)
        # Fail loudly instead of silently ignoring a mistyped flag.
        echo "ERROR: unknown option: $flag" >&2
        print_usage >&2
        exit 2
        ;;
    esac
  done
}
parse_cli_args "$@"

# All paths are resolved relative to the directory containing this script,
# so the pipeline can be launched from any working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"
# ========== Directories ==========
# Create the input and output directory tree in one call; -p makes this a
# no-op for directories that already exist.
pipeline_dirs=(
  "$DATA/raw"
  "$DATA/processed"
  "$RESULTS/GM12878"
  "$RESULTS/IMR90"
  "$RESULTS/figures"
)
mkdir -p "${pipeline_dirs[@]}"
# ========== Download ENCODE bigWig tracks ==========
# Each entry is "<local file name>|<download URL>". A track is fetched only
# when it is missing (or empty) in $DATA/raw.
echo "=== Downloading ENCODE bigWig tracks ==="
for entry in \
  "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
  "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
  "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
  "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
do
  fname="${entry%%|*}"
  url="${entry##*|}"
  out="$DATA/raw/$fname"
  # -s rather than -f: a zero-byte file left behind by an earlier failed
  # download must not be mistaken for a valid track.
  if [ -s "$out" ]; then
    echo " $fname already present, skipping"
  else
    echo " Downloading $fname ..."
    # wget -O creates the target before any data arrives, so download to a
    # temporary name and move it into place only on success. Otherwise a
    # failed or interrupted transfer would leave a truncated file that the
    # next run would skip as "already present".
    partial="$out.part"
    if ! wget -q --show-progress -O "$partial" "$url"; then
      rm -f -- "$partial"
      echo "ERROR: failed to download $fname from $url" >&2
      exit 1
    fi
    mv -- "$partial" "$out"
  fi
done
# The Hi-C contact matrices cannot be fetched automatically: 4DN requires a
# (free) account, so they must be downloaded by hand and placed in data/raw/.
# GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
# IMR90: https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
# Abort at the first missing matrix, before any processing starts.
for mcool_name in GM12878.mcool IMR90.mcool; do
  if [[ -f "$DATA/raw/$mcool_name" ]]; then
    continue
  fi
  echo "ERROR: $DATA/raw/$mcool_name not found. Download from 4DN (see README) and retry." >&2
  exit 1
done
# ========== Step 1: Build contact graphs ==========
# For each cell type, run build_graph.py on its .mcool file and its two
# bigWig tracks (CTCF, H3K27me3) unless the output graph already exists.
echo ""
echo "=== Step 1: Building chromatin contact graphs ==="
for cell in GM12878 IMR90; do
  graph_pt="$DATA/processed/${cell}_${CHROM}.pt"
  if [[ -f "$graph_pt" ]]; then
    echo " ${cell} graph already exists, skipping"
    continue
  fi
  build_args=(
    --mcool "$DATA/raw/${cell}.mcool"
    --chrom "$CHROM"
    --res "$RES"
    --bigwigs "$DATA/raw/${cell}_CTCF.bw" "$DATA/raw/${cell}_H3K27me3.bw"
    --out "$graph_pt"
  )
  python "$SCRIPTS/build_graph.py" "${build_args[@]}"
done
# ========== Step 2: Compute A/B compartments ==========
# For each cell type, run compute_compartments.py unless its CSV already
# exists. The CTCF track is passed as --bigwig_orient.
# NOTE(review): presumably used to fix the sign of PC1; confirm in the script.
echo ""
echo "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for cell in GM12878 IMR90; do
  compartments_csv="$RESULTS/${cell}/compartments_${CHROM}.csv"
  if [[ -f "$compartments_csv" ]]; then
    echo " ${cell} compartments already exist, skipping"
    continue
  fi
  compartment_args=(
    --mcool "$DATA/raw/${cell}.mcool"
    --chrom "$CHROM"
    --res "$RES"
    --bigwig_orient "$DATA/raw/${cell}_CTCF.bw"
    --out "$compartments_csv"
  )
  python "$SCRIPTS/compute_compartments.py" "${compartment_args[@]}"
done
# ========== Step 3: Train VGAE on GM12878 ==========
# Training is skipped whenever results/GM12878/model.pt exists. The check
# does not depend on CHROM or the hyper-parameters, so remove that file to
# force a retrain after changing them.
echo ""
echo "=== Step 3: Training VGAE on GM12878 ==="
if [[ ! -f "$RESULTS/GM12878/model.pt" ]]; then
  train_args=(
    --graph "$DATA/processed/GM12878_${CHROM}.pt"
    --epochs "$EPOCHS"
    --patience "$PATIENCE"
    --hidden "$HIDDEN"
    --latent "$LATENT"
    --seed "$SEED"
    --outdir "$RESULTS/GM12878"
  )
  python "$SCRIPTS/train_vgae.py" "${train_args[@]}"
else
  echo " Trained model already exists, skipping"
fi
# ========== Step 4: Encode IMR90 with GM12878 model ==========
# Run encode_graph.py with the GM12878 model on the IMR90 graph, unless the
# IMR90 embedding file is already present.
echo ""
echo "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
if [[ ! -f "$RESULTS/IMR90/emb.npy" ]]; then
  encode_args=(
    --model "$RESULTS/GM12878/model.pt"
    --graph "$DATA/processed/IMR90_${CHROM}.pt"
    --out "$RESULTS/IMR90/emb.npy"
  )
  python "$SCRIPTS/encode_graph.py" "${encode_args[@]}"
else
  echo " IMR90 embeddings already exist, skipping"
fi
# ========== Step 5: Visualise embeddings ==========
# Always runs (no skip check): passes both embedding files, their labels and
# the matching compartment CSVs to visualize_embeddings.py, with output
# prefix results/figures/umap.
echo ""
echo "=== Step 5: Generating UMAP visualisations ==="
umap_args=(
  --emb "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy"
  --labels GM12878 IMR90
  --compartments
  "$RESULTS/GM12878/compartments_${CHROM}.csv"
  "$RESULTS/IMR90/compartments_${CHROM}.csv"
  --prefix "$RESULTS/figures/umap"
  --seed "$SEED"
)
python "$SCRIPTS/visualize_embeddings.py" "${umap_args[@]}"
# ========== Step 6: Compare embeddings ==========
# Always runs: passes the two embedding files and their labels to
# compare_embeddings.py, with an output prefix named after the chromosome
# under results/figures/.
echo ""
echo "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
compare_args=(
  --emb1 "$RESULTS/GM12878/emb.npy"
  --emb2 "$RESULTS/IMR90/emb.npy"
  --label1 GM12878
  --label2 IMR90
  --prefix "$RESULTS/figures/${CHROM}"
)
python "$SCRIPTS/compare_embeddings.py" "${compare_args[@]}"
# ========== Summary ==========
# Report where the outputs are, then print the GM12878 metrics file.
printf '%s\n' \
  "" \
  "=== Pipeline complete ===" \
  "Outputs:" \
  " Model + embeddings : $RESULTS/GM12878/" \
  " Figures : $RESULTS/figures/" \
  " Metrics : $RESULTS/GM12878/metrics.json" \
  ""
cat "$RESULTS/GM12878/metrics.json"