#!/usr/bin/env bash
#
# run_pipeline.sh  –  end-to-end pipeline for chromatin-gnn
#
# Prerequisites
# -------------
# 1. Create and activate the environment:
#      conda create -n chromatin_gnn python=3.10 -y
#      conda activate chromatin_gnn
#      pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu
#      pip install torch-geometric==2.5.3 cooler==0.9.3 pyBigWig pandas \
#                  "numpy>=1.24,<2.0" scikit-learn matplotlib umap-learn scipy seaborn tqdm
#
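# 2. Provide the raw data in data/raw/ (see README.md § Dataset for URLs).
#    The .mcool files must be downloaded manually; the ENCODE bigWig tracks
#    are fetched by this script when they are missing: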
#      GM12878.mcool   4DN accession 4DNFIRUMEC32
#      IMR90.mcool     4DN accession 4DNFIABB3FHQ
#      GM12878_CTCF.bw ENCODE ENCFF741BAQ (experiment ENCSR000AKB)
#      GM12878_H3K27me3.bw ENCODE ENCFF736CNQ (experiment ENCSR000AKD)
#      IMR90_CTCF.bw   ENCODE ENCFF770DUD (experiment ENCSR000EFI)
#      IMR90_H3K27me3.bw ENCODE ENCFF158HZL (experiment ENCSR431UUY)
#
# Usage
# -----
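#   bash run_pipeline.sh [--chrom chr21] [--res 25000] [--epochs 300]
#
# Settings are read from the environment variables CHROM, RES, EPOCHS,
# PATIENCE, HIDDEN, LATENT and SEED (e.g. `CHROM=chr1 bash run_pipeline.sh`).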

set -euo pipefail

# ========== Configuration ==========
# Defaults. Each value can be preset through the environment variable of the
# same name and is then overridden by the matching command-line flag.
CHROM="${CHROM:-chr21}"
RES="${RES:-25000}"
EPOCHS="${EPOCHS:-300}"
PATIENCE="${PATIENCE:-20}"
HIDDEN="${HIDDEN:-64}"
LATENT="${LATENT:-32}"
SEED="${SEED:-42}"

# Print a short usage summary to stdout.
usage() {
    cat <<'EOF'
Usage: bash run_pipeline.sh [--chrom CHR] [--res BP] [--epochs N]
                            [--patience N] [--hidden N] [--latent N] [--seed N]

Each flag overrides the environment variable of the same upper-case name
(CHROM, RES, EPOCHS, PATIENCE, HIDDEN, LATENT, SEED).
EOF
}

#######################################
# Apply command-line overrides to the configuration variables.
# Accepts both "--flag value" and "--flag=value".
# Globals:   CHROM RES EPOCHS PATIENCE HIDDEN LATENT SEED (written)
# Arguments: the script's positional parameters
# Exits:     0 after printing usage for -h/--help;
#            2 on an unknown argument or a flag without a value
#######################################
parse_args() {
    local flag var val
    while (( $# > 0 )); do
        flag="${1%%=*}"
        case "$flag" in
            -h|--help)
                usage
                exit 0
                ;;
            --chrom)    var=CHROM ;;
            --res)      var=RES ;;
            --epochs)   var=EPOCHS ;;
            --patience) var=PATIENCE ;;
            --hidden)   var=HIDDEN ;;
            --latent)   var=LATENT ;;
            --seed)     var=SEED ;;
            *)
                echo "ERROR: unknown argument: $1" >&2
                usage >&2
                exit 2
                ;;
        esac
        if [[ "$1" == *=* ]]; then
            val="${1#*=}"
            shift
        elif (( $# >= 2 )); then
            val="$2"
            shift 2
        else
            # Checked explicitly: under `set -u` a bare "$2" would abort with
            # an unhelpful "unbound variable" message.
            echo "ERROR: option $flag requires a value" >&2
            exit 2
        fi
        # Assign to the global named by $var without resorting to eval.
        printf -v "$var" '%s' "$val"
    done
}
parse_args "$@"

# All paths are resolved relative to the directory containing this script, so
# the pipeline can be launched from any working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPTS="$REPO/scripts"
DATA="$REPO/data"
RESULTS="$REPO/results"

# ========== Directories ==========
# Create the input and output directory tree up front; -p makes this a no-op
# for directories that already exist.
required_dirs=(
    "$DATA/raw"
    "$DATA/processed"
    "$RESULTS/GM12878"
    "$RESULTS/IMR90"
    "$RESULTS/figures"
)
mkdir -p "${required_dirs[@]}"

# ========== Download ENCODE bigWig tracks ==========
# Fetch each ENCODE track into $DATA/raw unless a non-empty copy is already
# there. Entries are "<local file name>|<URL>".
#
# Each transfer is written to "<name>.part" and renamed only after wget
# succeeds. `wget -O` creates its output file before any data arrives, so
# downloading straight to the final name would leave an empty or truncated
# file after a failure, and the next run would skip it as "already present".
echo "=== Downloading ENCODE bigWig tracks ==="
for entry in \
    "GM12878_CTCF.bw|https://www.encodeproject.org/files/ENCFF741BAQ/@@download/ENCFF741BAQ.bigWig" \
    "GM12878_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF736CNQ/@@download/ENCFF736CNQ.bigWig" \
    "IMR90_CTCF.bw|https://www.encodeproject.org/files/ENCFF770DUD/@@download/ENCFF770DUD.bigWig" \
    "IMR90_H3K27me3.bw|https://www.encodeproject.org/files/ENCFF158HZL/@@download/ENCFF158HZL.bigWig"
do
    fname="${entry%%|*}"
    url="${entry##*|}"
    out="$DATA/raw/$fname"
    # -s rather than -f: a zero-byte file left by an earlier failed download
    # is treated as missing and fetched again.
    if [ -s "$out" ]; then
        echo "  $fname already present, skipping"
        continue
    fi
    echo "  Downloading $fname ..."
    part="$out.part"
    if wget -q --show-progress -O "$part" "$url"; then
        mv -f -- "$part" "$out"
    else
        rc=$?
        rm -f -- "$part"
        echo "ERROR: download of $fname failed (wget exit status $rc): $url" >&2
        exit "$rc"
    fi
done

# The .mcool contact matrices are not fetched by this script; they have to be
# downloaded by hand from 4DN (a free account is required):
#   GM12878: https://data.4dnucleome.org/files-processed/4DNFIRUMEC32/@@download/4DNFIRUMEC32.mcool
#   IMR90:   https://data.4dnucleome.org/files-processed/4DNFIABB3FHQ/@@download/4DNFIABB3FHQ.mcool
# Abort at the first one that is missing from $DATA/raw.
for f in GM12878.mcool IMR90.mcool; do
    if [[ -f "$DATA/raw/$f" ]]; then
        continue
    fi
    echo "ERROR: $DATA/raw/$f not found. Download from 4DN (see README) and retry." >&2
    exit 1
done

# ========== Step 1: Build contact graphs ==========
# Run build_graph.py once per cell line, passing its .mcool file and its CTCF
# and H3K27me3 bigWig tracks. A cell line whose output file already exists is
# skipped.
# NOTE(review): the output name contains CHROM but not RES, so a graph built
# at one resolution is reused when only RES changes — confirm this is intended.
printf '%s\n' "" "=== Step 1: Building chromatin contact graphs ==="
for CELL in GM12878 IMR90; do
    OUT="$DATA/processed/${CELL}_${CHROM}.pt"
    if [[ -f "$OUT" ]]; then
        echo "  ${CELL} graph already exists, skipping"
        continue
    fi
    graph_args=(
        --mcool "$DATA/raw/${CELL}.mcool"
        --chrom "$CHROM"
        --res "$RES"
        --bigwigs "$DATA/raw/${CELL}_CTCF.bw" "$DATA/raw/${CELL}_H3K27me3.bw"
        --out "$OUT"
    )
    python "$SCRIPTS/build_graph.py" "${graph_args[@]}"
done

# ========== Step 2: Compute A/B compartments ==========
# Run compute_compartments.py once per cell line, passing its .mcool file and
# its CTCF bigWig as the --bigwig_orient track. A cell line whose CSV already
# exists is skipped.
# NOTE(review): the CSV name contains CHROM but not RES, so results from one
# resolution are reused when only RES changes — confirm this is intended.
printf '%s\n' "" "=== Step 2: Computing A/B compartments (PC1 of O/E Pearson correlation) ==="
for CELL in GM12878 IMR90; do
    OUT="$RESULTS/${CELL}/compartments_${CHROM}.csv"
    if [[ -f "$OUT" ]]; then
        echo "  ${CELL} compartments already exist, skipping"
        continue
    fi
    compartment_args=(
        --mcool "$DATA/raw/${CELL}.mcool"
        --chrom "$CHROM"
        --res "$RES"
        --bigwig_orient "$DATA/raw/${CELL}_CTCF.bw"
        --out "$OUT"
    )
    python "$SCRIPTS/compute_compartments.py" "${compartment_args[@]}"
done

# ========== Step 3: Train VGAE on GM12878 ==========
# Run train_vgae.py on the GM12878 graph with the configured hyperparameters,
# writing into $RESULTS/GM12878. Training is skipped when model.pt is present.
# NOTE(review): the model.pt path includes neither CHROM nor any
# hyperparameter, so changing those does not retrain until the file is removed.
printf '%s\n' "" "=== Step 3: Training VGAE on GM12878 ==="
if [[ ! -f "$RESULTS/GM12878/model.pt" ]]; then
    train_args=(
        --graph "$DATA/processed/GM12878_${CHROM}.pt"
        --epochs "$EPOCHS"
        --patience "$PATIENCE"
        --hidden "$HIDDEN"
        --latent "$LATENT"
        --seed "$SEED"
        --outdir "$RESULTS/GM12878"
    )
    python "$SCRIPTS/train_vgae.py" "${train_args[@]}"
else
    echo "  Trained model already exists, skipping"
fi

# ========== Step 4: Encode IMR90 with GM12878 model ==========
# Run encode_graph.py with the GM12878 model file and the IMR90 graph, writing
# $RESULTS/IMR90/emb.npy. Skipped when that file is already present.
printf '%s\n' "" "=== Step 4: Encoding IMR90 graph with trained GM12878 model ==="
if [[ ! -f "$RESULTS/IMR90/emb.npy" ]]; then
    encode_args=(
        --model "$RESULTS/GM12878/model.pt"
        --graph "$DATA/processed/IMR90_${CHROM}.pt"
        --out "$RESULTS/IMR90/emb.npy"
    )
    python "$SCRIPTS/encode_graph.py" "${encode_args[@]}"
else
    echo "  IMR90 embeddings already exist, skipping"
fi

# ========== Step 5: Visualise embeddings ==========
# Run visualize_embeddings.py on both embedding files together with the
# per-cell-line compartment CSVs. This step has no skip check and runs on
# every invocation.
printf '%s\n' "" "=== Step 5: Generating UMAP visualisations ==="
umap_args=(
    --emb "$RESULTS/GM12878/emb.npy" "$RESULTS/IMR90/emb.npy"
    --labels GM12878 IMR90
    --compartments
    "$RESULTS/GM12878/compartments_${CHROM}.csv"
    "$RESULTS/IMR90/compartments_${CHROM}.csv"
    --prefix "$RESULTS/figures/umap"
    --seed "$SEED"
)
python "$SCRIPTS/visualize_embeddings.py" "${umap_args[@]}"

# ========== Step 6: Compare embeddings ==========
# Run compare_embeddings.py on the two embedding files, using CHROM in the
# output prefix under $RESULTS/figures. This step has no skip check and runs
# on every invocation.
printf '%s\n' "" "=== Step 6: Comparing GM12878 vs IMR90 embeddings ==="
compare_args=(
    --emb1 "$RESULTS/GM12878/emb.npy"
    --emb2 "$RESULTS/IMR90/emb.npy"
    --label1 GM12878
    --label2 IMR90
    --prefix "$RESULTS/figures/${CHROM}"
)
python "$SCRIPTS/compare_embeddings.py" "${compare_args[@]}"

# ========== Summary ==========
# Report where the outputs were written, then print the metrics file to
# stdout. Under `set -e` a missing metrics file makes the script exit non-zero.
printf '%s\n' \
    "" \
    "=== Pipeline complete ===" \
    "Outputs:" \
    "  Model + embeddings : $RESULTS/GM12878/" \
    "  Figures            : $RESULTS/figures/" \
    "  Metrics            : $RESULTS/GM12878/metrics.json" \
    ""
cat "$RESULTS/GM12878/metrics.json"