Full reproducible pipeline: .mcool + ChIP-seq bigwigs → latent embeddings → A/B compartment calls → cross-cell comparison.

Key results (chr21, 25 kb, latent dim=32):
- Test AUC=0.777, AP=0.759 (converged epoch 31/300)
- GM12878 A/B silhouette (cosine) = 0.775
- IMR90 zero-shot silhouette = 0.443
- A-compartment bins stable across cell types (mean cosine Δ=0.042)
- B-compartment bins shift substantially (mean cosine Δ=0.451)
- 101 B→A and 70 A→B compartment switches GM12878→IMR90
163 lines
5.7 KiB
Bash
163 lines
5.7 KiB
Bash
#!/usr/bin/env bash
#
# run_pipeline.sh – end-to-end pipeline for chromatin-gnn
#
# Prerequisites
# -------------
# 1. Create and activate the environment:
#      conda create -n chromatin_gnn python=3.10 -y
#      conda activate chromatin_gnn
#      pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
#      pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
#          "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
#
# 2. Download raw data into data/raw/ (see README.md § Dataset for URLs):
#      GM12878.mcool          4DN accession 4DNFIRUMEC32
#      IMR90.mcool            4DN accession 4DNFIABB3FHQ
#      GM12878_CTCF.bw        ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
#      GM12878_H3K27me3.bw    ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
#      IMR90_CTCF.bw          ENCODE ENCFF770DUD (experiment ENCSR000EFI)
#      IMR90_H3K27me3.bw      ENCODE ENCFF158HZL (experiment ENCSR431UUY)
#
# Usage
# -----
#   bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]

# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# ========== Configuration ==========
# Every setting defaults to the value of the same-named environment variable
# when that is set and non-empty, otherwise to the literal shown here.
# Command-line flags (parsed below) take precedence over both.
CHROM="${CHROM:-chr21}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-20}"
HIDDEN="${HIDDEN:-64}"
LATENT="${LATENT:-32}"
SEED="${SEED:-42}"

# Print the command-line synopsis to stdout.
usage() {
  cat <<'EOF'
Usage: bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
                            [--patience 20] [--hidden 64] [--latent 32]
                            [--seed 42]

Each option may also be supplied through the environment variable of the same
name in upper case (CHROM, RES, EPOCHS, PATIENCE, HIDDEN, LATENT, SEED).
EOF
}

#######################################
# Apply "--name value" command-line overrides to the configuration globals.
# Globals:   CHROM, RES, EPOCHS, PATIENCE, HIDDEN, LATENT, SEED (written)
# Arguments: the script's positional parameters
# Outputs:   usage text on -h/--help; diagnostics on stderr
# Returns:   0 on success, 2 on an unknown option or a missing value
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --chrom | --res | --epochs | --patience | --hidden | --latent | --seed)
        # Checked explicitly so a trailing flag gives a clear message instead
        # of an "unbound variable" abort from `set -u`.
        if (($# < 2)); then
          echo "ERROR: option $1 requires a value" >&2
          return 2
        fi
        case "$1" in
          --chrom) CHROM="$2" ;;
          --res) RES="$2" ;;
          --epochs) EPOCHS="$2" ;;
          --patience) PATIENCE="$2" ;;
          --hidden) HIDDEN="$2" ;;
          --latent) LATENT="$2" ;;
          --seed) SEED="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "ERROR: unknown argument: $1" >&2
        return 2
        ;;
    esac
  done
}

parse_args "$@"

# All paths are anchored at the directory containing this script, so the
# pipeline can be launched from any working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"

# ========== Directories ==========
# Create every directory the later steps read from or write to; `-p` makes
# this a no-op for directories that already exist.
mkdir -p "$DATA/raw" "$DATA/processed" \
  "$RESULTS/GM12878" "$RESULTS/IMR90" "$RESULTS/figures"

# ========== Download ENCODE bigWig tracks ==========
# Each entry has the form "<local file name>|<download URL>". A track that is
# already present under $DATA/raw is not fetched again.
echo "=== Downloading ENCODE bigWig tracks ==="
for entry in \
  "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
  "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
  "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
  "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
do
  fname="${entry%%|*}" # text before the first '|'
  url="${entry##*|}"   # text after the last '|'
  out="$DATA/raw/$fname"
  if [[ -f "$out" ]]; then
    echo " $fname already present, skipping"
  else
    echo " Downloading $fname ..."
    # Download under a temporary name and rename only on success. Writing
    # straight to "$out" would leave a truncated file behind after a failed
    # or interrupted transfer, and the existence check above would then
    # treat it as a finished download on the next run.
    tmp="${out}.part"
    if wget -q --show-progress -O "$tmp" "$url"; then
      mv -- "$tmp" "$out"
    else
      rm -f -- "$tmp"
      echo "ERROR: failed to download $url" >&2
      exit 1
    fi
  fi
done

# .mcool files must be downloaded manually from 4DN (requires free account):
#   GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
#   IMR90:   https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
#
# Verify both files are in place before any processing starts. Every missing
# file is reported before exiting, so one run lists everything that still
# has to be downloaded.
missing_mcool=0
for f in GM12878.mcool IMR90.mcool; do
  if [[ ! -f "$DATA/raw/$f" ]]; then
    echo "ERROR: $DATA/raw/$f not found. Download from 4DN (see README) and retry." >&2
    missing_mcool=1
  fi
done
if ((missing_mcool)); then
  exit 1
fi

# ========== Step 1: Build contact graphs ==========
# For each cell line, run build_graph.py on its .mcool file and its CTCF and
# H3K27me3 bigWig tracks, writing the graph to $DATA/processed.
# NOTE: the skip check is keyed on cell line and chromosome only. After
# changing RES, delete the existing .pt file to force a rebuild.
echo ""
echo "=== Step 1: Building chromatin contact graphs ==="
for CELL in GM12878 IMR90; do
  OUT="$DATA/processed/${CELL}_${CHROM}.pt"
  if [[ -f "$OUT" ]]; then
    echo " ${CELL} graph already exists, skipping"
  else
    python "$SCRIPTS/build_graph.py" \
      --mcool "$DATA/raw/${CELL}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwigs "$DATA/raw/${CELL}_CTCF.bw" "$DATA/raw/${CELL}_H3K27me3.bw" \
      --out "$OUT"
  fi
done

# ========== Step 2: Compute A/B compartments ==========
# For each cell line, run compute_compartments.py on its .mcool file, passing
# the CTCF bigWig as --bigwig_orient, and write a per-chromosome CSV.
# NOTE: the skip check is keyed on cell line and chromosome only. After
# changing RES, delete the existing CSV to force recomputation.
echo ""
echo "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for CELL in GM12878 IMR90; do
  OUT="$RESULTS/${CELL}/compartments_${CHROM}.csv"
  if [[ -f "$OUT" ]]; then
    echo " ${CELL} compartments already exist, skipping"
  else
    python "$SCRIPTS/compute_compartments.py" \
      --mcool "$DATA/raw/${CELL}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwig_orient "$DATA/raw/${CELL}_CTCF.bw" \
      --out "$OUT"
  fi
done

# ========== Step 3: Train VGAE on GM12878 ==========
# Run train_vgae.py on the GM12878 graph with the configured hyper-parameters,
# using $RESULTS/GM12878 as its output directory.
# Later steps read model.pt, emb.npy and metrics.json from that directory;
# presumably train_vgae.py writes all three — confirm against the script.
# NOTE: the skip check looks only at model.pt, whose name encodes neither
# CHROM nor any hyper-parameter. Delete it to retrain after changing them.
echo ""
echo "=== Step 3: Training VGAE on GM12878 ==="
if [[ -f "$RESULTS/GM12878/model.pt" ]]; then
  echo " Trained model already exists, skipping"
else
  python "$SCRIPTS/train_vgae.py" \
    --graph "$DATA/processed/GM12878_${CHROM}.pt" \
    --epochs "$EPOCHS" --patience "$PATIENCE" \
    --hidden "$HIDDEN" --latent "$LATENT" \
    --seed "$SEED" \
    --outdir "$RESULTS/GM12878"
fi

# ========== Step 4: Encode IMR90 with GM12878 model ==========
# Run encode_graph.py with the GM12878 model on the IMR90 graph and store the
# result as $RESULTS/IMR90/emb.npy.
# NOTE: the skip check looks only at emb.npy. Delete it to re-encode after
# retraining the model or rebuilding the IMR90 graph.
echo ""
echo "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
if [[ -f "$RESULTS/IMR90/emb.npy" ]]; then
  echo " IMR90 embeddings already exist, skipping"
else
  python "$SCRIPTS/encode_graph.py" \
    --model "$RESULTS/GM12878/model.pt" \
    --graph "$DATA/processed/IMR90_${CHROM}.pt" \
    --out "$RESULTS/IMR90/emb.npy"
fi

# ========== Step 5: Visualise embeddings ==========
# Run visualize_embeddings.py on both embedding files together with their
# compartment CSVs, using "$RESULTS/figures/umap" as the output prefix.
# Unlike the earlier steps this has no skip check and runs on every invocation.
echo ""
echo "=== Step 5: Generating UMAP visualisations ==="
python "$SCRIPTS/visualize_embeddings.py" \
  --emb "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy" \
  --labels GM12878 IMR90 \
  --compartments \
  "$RESULTS/GM12878/compartments_${CHROM}.csv" \
  "$RESULTS/IMR90/compartments_${CHROM}.csv" \
  --prefix "$RESULTS/figures/umap" \
  --seed "$SEED"

# ========== Step 6: Compare embeddings ==========
# Run compare_embeddings.py on the GM12878 and IMR90 embedding files, using
# "$RESULTS/figures/$CHROM" as the output prefix. Runs on every invocation.
echo ""
echo "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
python "$SCRIPTS/compare_embeddings.py" \
  --emb1 "$RESULTS/GM12878/emb.npy" \
  --emb2 "$RESULTS/IMR90/emb.npy" \
  --label1 GM12878 --label2 IMR90 \
  --prefix "$RESULTS/figures/${CHROM}"

# ========== Summary ==========
# List the output locations, then print the GM12878 metrics file. Under
# `set -e` a missing metrics.json makes the final `cat` fail, so the script
# exits non-zero in that case.
echo ""
echo "=== Pipeline complete ==="
echo "Outputs:"
echo " Model + embeddings : $RESULTS/GM12878/"
echo " Figures : $RESULTS/figures/"
echo " Metrics : $RESULTS/GM12878/metrics.json"
echo ""
cat "$RESULTS/GM12878/metrics.json"