v1.0.0: VGAE applied to GM12878 vs IMR90 chr21 Hi-C at 25kb
Full reproducible pipeline: .mcool + ChIP-seq bigWigs → latent embeddings → A/B compartment calls → cross-cell comparison.

Key results (chr21, 25 kb, latent dim = 32):
- Test AUC = 0.777, AP = 0.759 (converged at epoch 31/300)
- GM12878 A/B silhouette (cosine) = 0.775
- IMR90 zero-shot silhouette = 0.443
- A-compartment bins are stable across cell types (mean cosine Δ = 0.042)
- B-compartment bins shift substantially (mean cosine Δ = 0.451)
- 101 B→A and 70 A→B compartment switches from GM12878 to IMR90
This commit is contained in:
162
run_pipeline.sh
Normal file
162
run_pipeline.sh
Normal file
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# run_pipeline.sh – end-to-end pipeline for chromatin-gnn
|
||||
#
|
||||
# Prerequisites
|
||||
# -------------
|
||||
# 1. Create and activate the environment:
|
||||
# conda create -n chromatin_gnn python=3.10 -y
|
||||
# conda activate chromatin_gnn
|
||||
# pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
|
||||
# pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
|
||||
# "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
|
||||
#
|
||||
# 2. Download raw data into data/raw/ (see README.md § Dataset for URLs):
|
||||
# GM12878.mcool 4DN accession 4DNFIRUMEC32
|
||||
# IMR90.mcool 4DN accession 4DNFIABB3FHQ
|
||||
# GM12878_CTCF.bw ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
|
||||
# GM12878_H3K27me3.bw ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
|
||||
# IMR90_CTCF.bw ENCODE ENCFF770DUD (experiment ENCSR000EFI)
|
||||
# IMR90_H3K27me3.bw ENCODE ENCFF158HZL (experiment ENCSR431UUY)
|
||||
#
|
||||
# Usage
|
||||
# -----
|
||||
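# CHROM=chr21 RES=25000 EPOCHS=300 bash run_pipeline.sh
# (settings are read from environment variables; PATIENCE, HIDDEN, LATENT
# and SEED can be overridden the same way)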
|
||||
|
||||
set -euo pipefail

# ========== Configuration ==========
# Every setting starts from the environment variable of the same name, falling
# back to the default shown, and can then be overridden by a command-line flag:
#   bash run_pipeline.sh --chrom chr21 --res 25000 --epochs 300
# A flag takes precedence over the environment variable.
CHROM="${CHROM:-chr21}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-20}"
HIDDEN="${HIDDEN:-64}"
LATENT="${LATENT:-32}"
SEED="${SEED:-42}"

# Print the command-line synopsis on stdout.
print_usage() {
  cat <<'EOF'
Usage: bash run_pipeline.sh [--chrom CHROM] [--res RES] [--epochs EPOCHS]
                            [--patience PATIENCE] [--hidden HIDDEN]
                            [--latent LATENT] [--seed SEED]
Each option defaults to the environment variable of the same (upper-case) name.
EOF
}

# Apply "--name value" overrides to the configuration variables above.
# Arguments: the script's command-line arguments.
# Returns:   0 on success; 2 on an unknown argument or a flag without a value.
#            -h/--help prints the synopsis and exits 0.
parse_cli_overrides() {
  local flag
  while (( $# > 0 )); do
    flag=$1
    case "$flag" in
      -h | --help)
        print_usage
        exit 0
        ;;
      --chrom | --res | --epochs | --patience | --hidden | --latent | --seed)
        # Checked explicitly: under `set -u` a bare "$2" would abort with an
        # unhelpful "unbound variable" message.
        if (( $# < 2 )); then
          echo "ERROR: option $flag requires a value" >&2
          return 2
        fi
        ;;
      *)
        echo "ERROR: unknown argument: $flag" >&2
        print_usage >&2
        return 2
        ;;
    esac
    case "$flag" in
      --chrom) CHROM=$2 ;;
      --res) RES=$2 ;;
      --epochs) EPOCHS=$2 ;;
      --patience) PATIENCE=$2 ;;
      --hidden) HIDDEN=$2 ;;
      --latent) LATENT=$2 ;;
      --seed) SEED=$2 ;;
    esac
    shift 2
  done
}

parse_cli_overrides "$@" || exit 2
|
||||
|
||||
# Absolute path of the directory holding this script, so the pipeline can be
# launched from any working directory.
# CDPATH is emptied for the cd: when CDPATH is exported and the script is run
# via a relative path, cd prints the directory it resolved on stdout, and that
# text would be captured by the command substitution in addition to pwd's.
# The "--" guards against a script path that begins with a dash.
REPO="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Locations used by the rest of the script, all relative to the repository root.
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"
|
||||
|
||||
# ========== Directories ==========
# Create every input and output directory up front. mkdir -p leaves existing
# directories untouched, so re-running the script is harmless.
pipeline_dirs=(
  "$DATA/raw"
  "$DATA/processed"
  "$RESULTS/GM12878"
  "$RESULTS/IMR90"
  "$RESULTS/figures"
)
mkdir -p "${pipeline_dirs[@]}"
|
||||
|
||||
# ========== Download ENCODE bigWig tracks ==========
echo "=== Downloading ENCODE bigWig tracks ==="

# Download URL ($2) to DEST ($1) without ever leaving a truncated file at DEST.
# wget -O creates its output file before the transfer has succeeded, so the
# data is written to "DEST.part" first and renamed only when wget reports
# success. On failure the partial file is removed.
# Returns: 0 on success, otherwise wget's exit status.
fetch_to() {
  local dest=$1 url=$2
  local partial="${dest}.part"
  if wget -q --show-progress -O "$partial" "$url"; then
    mv -f -- "$partial" "$dest"
  else
    local status=$?
    rm -f -- "$partial"
    return "$status"
  fi
}

# Each entry is "<local file name>|<download URL>".
for entry in \
  "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
  "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
  "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
  "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
do
  fname="${entry%%|*}" # text before the first "|"
  url="${entry##*|}"   # text after the last "|"
  out="$DATA/raw/$fname"
  # -s (exists and is non-empty) rather than -f: a zero-byte file is the
  # leftover of an earlier failed download and must be fetched again.
  if [ -s "$out" ]; then
    echo " $fname already present, skipping"
  else
    echo " Downloading $fname ..."
    if ! fetch_to "$out" "$url"; then
      echo "ERROR: failed to download $fname from $url" >&2
      exit 1
    fi
  fi
done
|
||||
|
||||
# The Hi-C matrices are not fetched automatically: they have to be downloaded
# by hand from 4DN (a free account is required) and placed in data/raw/.
#   GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
#   IMR90:   https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
# Abort with exit status 1 at the first matrix that is missing.
for mcool_name in GM12878.mcool IMR90.mcool; do
  mcool_path="$DATA/raw/$mcool_name"
  if [[ -f "$mcool_path" ]]; then
    continue
  fi
  echo "ERROR: $mcool_path not found. Download from 4DN (see README) and retry." >&2
  exit 1
done
|
||||
|
||||
# ========== Step 1: Build contact graphs ==========
# Runs scripts/build_graph.py once per cell type, passing that cell's .mcool
# and its CTCF and H3K27me3 bigWig tracks. An output file that already exists
# is reused and the build is skipped.
# NOTE(review): the output name encodes only cell type and chromosome, so a
# graph built with a different RES would be reused here - confirm intended.
echo ""
echo "=== Step 1: Building chromatin contact graphs ==="
for cell_type in GM12878 IMR90; do
  graph_file="$DATA/processed/${cell_type}_${CHROM}.pt"
  if [[ -f "$graph_file" ]]; then
    echo " ${cell_type} graph already exists, skipping"
    continue
  fi
  build_args=(
    --mcool "$DATA/raw/${cell_type}.mcool"
    --chrom "$CHROM"
    --res "$RES"
    --bigwigs "$DATA/raw/${cell_type}_CTCF.bw" "$DATA/raw/${cell_type}_H3K27me3.bw"
    --out "$graph_file"
  )
  python "$SCRIPTS/build_graph.py" "${build_args[@]}"
done
|
||||
|
||||
# ========== Step 2: Compute A/B compartments ==========
# Runs scripts/compute_compartments.py once per cell type and writes one CSV
# per cell under results/<cell>/. An existing CSV is reused.
# The cell's CTCF track is passed as --bigwig_orient; presumably it fixes the
# sign of PC1 - verify against compute_compartments.py.
echo ""
echo "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for cell_type in GM12878 IMR90; do
  compartment_csv="$RESULTS/${cell_type}/compartments_${CHROM}.csv"
  if [[ ! -f "$compartment_csv" ]]; then
    compartment_args=(
      --mcool "$DATA/raw/${cell_type}.mcool"
      --chrom "$CHROM"
      --res "$RES"
      --bigwig_orient "$DATA/raw/${cell_type}_CTCF.bw"
      --out "$compartment_csv"
    )
    python "$SCRIPTS/compute_compartments.py" "${compartment_args[@]}"
  else
    echo " ${cell_type} compartments already exist, skipping"
  fi
done
|
||||
|
||||
# ========== Step 3: Train VGAE on GM12878 ==========
# Trains on the GM12878 graph from step 1 with the configured hyper-parameters,
# writing into results/GM12878/. Training is skipped when model.pt is present.
# NOTE(review): only model.pt is checked, yet later steps also read emb.npy and
# metrics.json from the same directory - presumably train_vgae.py writes all
# three; confirm.
echo ""
echo "=== Step 3: Training VGAE on GM12878 ==="
trained_model="$RESULTS/GM12878/model.pt"
if [[ ! -f "$trained_model" ]]; then
  train_args=(
    --graph "$DATA/processed/GM12878_${CHROM}.pt"
    --epochs "$EPOCHS"
    --patience "$PATIENCE"
    --hidden "$HIDDEN"
    --latent "$LATENT"
    --seed "$SEED"
    --outdir "$RESULTS/GM12878"
  )
  python "$SCRIPTS/train_vgae.py" "${train_args[@]}"
else
  echo " Trained model already exists, skipping"
fi
|
||||
|
||||
# ========== Step 4: Encode IMR90 with GM12878 model ==========
# Feeds the IMR90 graph through the model trained in step 3, without training
# on IMR90, and stores the result in results/IMR90/emb.npy. An existing
# embedding file is reused.
echo ""
echo "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
imr90_embedding="$RESULTS/IMR90/emb.npy"
if [[ ! -f "$imr90_embedding" ]]; then
  encode_args=(
    --model "$RESULTS/GM12878/model.pt"
    --graph "$DATA/processed/IMR90_${CHROM}.pt"
    --out "$imr90_embedding"
  )
  python "$SCRIPTS/encode_graph.py" "${encode_args[@]}"
else
  echo " IMR90 embeddings already exist, skipping"
fi
|
||||
|
||||
# ========== Step 5: Visualise embeddings ==========
# Always re-run (no skip check). Embeddings, labels and compartment CSVs are
# passed in the same order: GM12878 first, IMR90 second.
echo ""
echo "=== Step 5: Generating UMAP visualisations ==="
umap_args=(
  --emb "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy"
  --labels GM12878 IMR90
  --compartments
  "$RESULTS/GM12878/compartments_${CHROM}.csv"
  "$RESULTS/IMR90/compartments_${CHROM}.csv"
  --prefix "$RESULTS/figures/umap"
  --seed "$SEED"
)
python "$SCRIPTS/visualize_embeddings.py" "${umap_args[@]}"
|
||||
|
||||
# ========== Step 6: Compare embeddings ==========
# Always re-run (no skip check). Compares the two embedding files; outputs use
# the prefix results/figures/<CHROM>.
echo ""
echo "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
compare_args=(
  --emb1 "$RESULTS/GM12878/emb.npy"
  --emb2 "$RESULTS/IMR90/emb.npy"
  --label1 GM12878
  --label2 IMR90
  --prefix "$RESULTS/figures/${CHROM}"
)
python "$SCRIPTS/compare_embeddings.py" "${compare_args[@]}"
|
||||
|
||||
# ========== Summary ==========
# Print where the outputs are, then dump the GM12878 metrics file to stdout.
# The here-document begins and ends with a blank line on purpose.
cat <<EOF

=== Pipeline complete ===
Outputs:
 Model + embeddings : $RESULTS/GM12878/
 Figures : $RESULTS/figures/
 Metrics : $RESULTS/GM12878/metrics.json

EOF
cat "$RESULTS/GM12878/metrics.json"
|
||||
Reference in New Issue
Block a user