#!/usr/bin/env bash
#
# run_pipeline.sh – end-to-end pipeline for chromatin-gnn
#
# Prerequisites
# -------------
# 1. Create and activate the environment:
#      conda create -n chromatin_gnn python=3.10 -y
#      conda activate chromatin_gnn
#      pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
#      pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
#          "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
#
# 2. Download raw data into data/raw/ (see README.md § Dataset for URLs):
#      GM12878.mcool         4DN accession 4DNFIRUMEC32
#      IMR90.mcool           4DN accession 4DNFIABB3FHQ
#      GM12878_CTCF.bw       ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
#      GM12878_H3K27me3.bw   ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
#      IMR90_CTCF.bw         ENCODE ENCFF770DUD (experiment ENCSR000EFI)
#      IMR90_H3K27me3.bw     ENCODE ENCFF158HZL (experiment ENCSR431UUY)
#    The four bigWig tracks are fetched automatically by this script when they
#    are absent; the two .mcool files must be placed in data/raw/ by hand.
#
# Usage
# -----
#   bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
#                        [--patience 20] [--hidden 64] [--latent 32] [--seed 42]
#
# Every option can also be supplied through the environment variable of the
# same upper-case name (CHROM, RES, EPOCHS, PATIENCE, HIDDEN, LATENT, SEED).
# A command-line flag takes precedence over the environment variable.

set -euo pipefail

# Print an error message to stderr and abort the pipeline.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print a short usage summary to stdout.
usage() {
  cat <<EOF
Usage: bash ${0##*/} [--chrom CHROM] [--res RES] [--epochs EPOCHS]
                     [--patience N] [--hidden N] [--latent N] [--seed N]
EOF
}

# ========== Configuration ==========
# Defaults; each may be pre-set in the environment and overridden by a flag.
CHROM="${CHROM:-chr21}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-20}"
HIDDEN="${HIDDEN:-64}"
LATENT="${LATENT:-32}"
SEED="${SEED:-42}"

# Parse the flags advertised in the usage text. Without this loop the flags
# would be accepted but silently ignored.
while (( $# > 0 )); do
  case "$1" in
    --chrom | --res | --epochs | --patience | --hidden | --latent | --seed)
      # Checked explicitly so a missing value gives a readable message rather
      # than an "unbound variable" failure from `set -u`.
      if (( $# < 2 )); then
        die "option $1 requires a value"
      fi
      case "$1" in
        --chrom) CHROM="$2" ;;
        --res) RES="$2" ;;
        --epochs) EPOCHS="$2" ;;
        --patience) PATIENCE="$2" ;;
        --hidden) HIDDEN="$2" ;;
        --latent) LATENT="$2" ;;
        --seed) SEED="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      usage >&2
      die "unknown argument: $1"
      ;;
  esac
done

# All paths are resolved relative to the directory containing this script, so
# the pipeline can be launched from any working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"

# ========== Directories ==========
mkdir -p "$DATA/raw" "$DATA/processed" \
  "$RESULTS/GM12878" "$RESULTS/IMR90" "$RESULTS/figures"

# ========== Download ENCODE bigWig tracks ==========

# Download one track into data/raw/ unless a non-empty copy already exists.
# Arguments: $1 - destination file name, $2 - source URL
download_track() {
  local fname="$1"
  local url="$2"
  local out="$DATA/raw/$fname"
  local tmp="$out.part"

  # -s rather than -f: a zero-byte file left by an interrupted download must
  # not be mistaken for a complete track.
  if [[ -s "$out" ]]; then
    echo "  $fname already present, skipping"
    return 0
  fi

  command -v wget >/dev/null || die "wget is required to download $fname"

  echo "  Downloading $fname ..."
  # wget -O creates/truncates its target before transferring, so download to a
  # side file and rename only on success; a failed run then never leaves a
  # partial file under the final name.
  if ! wget -q --show-progress -O "$tmp" "$url"; then
    rm -f -- "$tmp"
    die "download of $fname failed ($url)"
  fi
  mv -- "$tmp" "$out"
}

echo "=== Downloading ENCODE bigWig tracks ==="
for entry in \
  "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
  "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
  "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
  "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"; do
  # Each entry is "<file name>|<url>"; split on the single '|' separator.
  download_track "${entry%%|*}" "${entry##*|}"
done

# .mcool files must be downloaded manually from 4DN (requires free account):
#   GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
#   IMR90:   https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
for f in GM12878.mcool IMR90.mcool; do
  if [[ ! -f "$DATA/raw/$f" ]]; then
    die "$DATA/raw/$f not found. Download from 4DN (see README) and retry."
  fi
done

# ========== Step 1: Build contact graphs ==========
echo ""
echo "=== Step 1: Building chromatin contact graphs ==="
for CELL in GM12878 IMR90; do
  # NOTE(review): the cache key includes CHROM but not RES, so changing the
  # resolution reuses an existing graph — delete the .pt file to rebuild.
  OUT="$DATA/processed/${CELL}_${CHROM}.pt"
  if [[ -f "$OUT" ]]; then
    echo "  ${CELL} graph already exists, skipping"
  else
    python "$SCRIPTS/build_graph.py" \
      --mcool "$DATA/raw/${CELL}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwigs "$DATA/raw/${CELL}_CTCF.bw" "$DATA/raw/${CELL}_H3K27me3.bw" \
      --out "$OUT"
  fi
done

# ========== Step 2: Compute A/B compartments ==========
echo ""
echo "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for CELL in GM12878 IMR90; do
  OUT="$RESULTS/${CELL}/compartments_${CHROM}.csv"
  if [[ -f "$OUT" ]]; then
    echo "  ${CELL} compartments already exist, skipping"
  else
    python "$SCRIPTS/compute_compartments.py" \
      --mcool "$DATA/raw/${CELL}.mcool" \
      --chrom "$CHROM" --res "$RES" \
      --bigwig_orient "$DATA/raw/${CELL}_CTCF.bw" \
      --out "$OUT"
  fi
done

# ========== Step 3: Train VGAE on GM12878 ==========
echo ""
echo "=== Step 3: Training VGAE on GM12878 ==="
# NOTE(review): model.pt is not keyed by CHROM/RES/hyper-parameters, so a model
# trained with other settings is reused as-is; remove it to force retraining.
if [[ -f "$RESULTS/GM12878/model.pt" ]]; then
  echo "  Trained model already exists, skipping"
else
  python "$SCRIPTS/train_vgae.py" \
    --graph "$DATA/processed/GM12878_${CHROM}.pt" \
    --epochs "$EPOCHS" --patience "$PATIENCE" \
    --hidden "$HIDDEN" --latent "$LATENT" \
    --seed "$SEED" \
    --outdir "$RESULTS/GM12878"
fi

# ========== Step 4: Encode IMR90 with GM12878 model ==========
echo ""
echo "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
if [[ -f "$RESULTS/IMR90/emb.npy" ]]; then
  echo "  IMR90 embeddings already exist, skipping"
else
  python "$SCRIPTS/encode_graph.py" \
    --model "$RESULTS/GM12878/model.pt" \
    --graph "$DATA/processed/IMR90_${CHROM}.pt" \
    --out "$RESULTS/IMR90/emb.npy"
fi

# ========== Step 5: Visualise embeddings ==========
echo ""
echo "=== Step 5: Generating UMAP visualisations ==="
# NOTE(review): GM12878/emb.npy is not written explicitly by any step here;
# presumably train_vgae.py emits it into --outdir — confirm.
python "$SCRIPTS/visualize_embeddings.py" \
  --emb "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy" \
  --labels GM12878 IMR90 \
  --compartments \
  "$RESULTS/GM12878/compartments_${CHROM}.csv" \
  "$RESULTS/IMR90/compartments_${CHROM}.csv" \
  --prefix "$RESULTS/figures/umap" \
  --seed "$SEED"

# ========== Step 6: Compare embeddings ==========
echo ""
echo "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
python "$SCRIPTS/compare_embeddings.py" \
  --emb1 "$RESULTS/GM12878/emb.npy" \
  --emb2 "$RESULTS/IMR90/emb.npy" \
  --label1 GM12878 --label2 IMR90 \
  --prefix "$RESULTS/figures/${CHROM}"

# ========== Summary ==========
echo ""
echo "=== Pipeline complete ==="
echo "Outputs:"
echo "  Model + embeddings : $RESULTS/GM12878/"
echo "  Figures            : $RESULTS/figures/"
echo "  Metrics            : $RESULTS/GM12878/metrics.json"
echo ""
# Guarded so that a missing metrics file produces a clear warning instead of a
# bare `cat` failure (and a `set -e` abort) after the success banner.
if [[ -f "$RESULTS/GM12878/metrics.json" ]]; then
  cat "$RESULTS/GM12878/metrics.json"
else
  printf 'WARNING: %s not found\n' "$RESULTS/GM12878/metrics.json" >&2
fi