Files
chromatin-vgae-hic/scripts/build_genome_graphs.sh

94 lines
3.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# Build all processed graphs needed for H1 and H2 genome-wide trainings.
# Runs only the build_graph + compartment steps — no model training.
# Designed to be run on the data-rich box (e.g. Contabo) so only the small
# .pt files need to be transferred to the GPU box.
#
# Usage: bash scripts/build_genome_graphs.sh [chr2 chr5 ...]
# (no args = chr1..chr22)
# Environment: RES overrides the value passed as --res to both build steps
# (default: 25000).
set -euo pipefail

# Repository root: the parent of the directory that contains this script.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Put the repo first on PYTHONPATH. The ":$PYTHONPATH" suffix is appended only
# when PYTHONPATH is already non-empty, so no trailing empty entry is created
# (an empty entry is generally resolved as the current directory).
export PYTHONPATH="$REPO${PYTHONPATH:+:$PYTHONPATH}"

# Auto-activate conda env, but only when torch_geometric is not importable
# with the python currently on PATH.
if ! python -c "import torch_geometric" 2>/dev/null; then
  if ! command -v conda >/dev/null 2>&1; then
    echo "error: torch_geometric is not importable and conda is not on PATH" >&2
    exit 1
  fi
  # conda's shell hook and env activation scripts may expand unset variables,
  # which would abort the script under `set -u`; relax it just for this step.
  set +u
  eval "$(conda shell.bash hook)"
  conda activate chromatin_gnn
  set -u
fi

# Value forwarded to --res by the build functions; overridable via environment.
RES="${RES:-25000}"
DATA="$REPO/data"
COMPARTMENTS_DIR="$REPO/results/h1_representation/compartments"
mkdir -p "$COMPARTMENTS_DIR"

# Chromosomes to process: the command-line arguments, or chr1..chr22 if none.
if [ $# -gt 0 ]; then
  CHROMS=("$@")
else
  CHROMS=(chr{1..22})
fi
#######################################
# Build the processed graph for one cell line and chromosome.
# Skips the build when the output file already exists. If the build command
# fails, any output it left behind is removed so that a later run does not
# mistake a partial file for a finished one.
# Globals:
#   DATA (read) - data root; inputs are read from "$DATA/raw"
#   RES  (read) - value forwarded to --res
# Arguments:
#   $1 - cell-line name as it appears in raw file names (e.g. GM12878)
#   $2 - cell-line label, used only in log messages (e.g. gm12878)
#   $3 - chromosome name (e.g. chr1)
#   $4 - path of the graph file to write
# Returns:
#   0 if the output already existed or was built; otherwise the exit status
#   of the build_graph command.
#######################################
build_one() {
  local CELL_UPPER="$1" CELL_LOWER="$2" CHROM="$3" OUT="$4"
  local rc=0

  if [ -f "$OUT" ]; then
    echo " [$CELL_LOWER $CHROM] graph exists, skip"
    return 0
  fi

  echo " [$CELL_LOWER $CHROM] building graph..."
  python -m chromatin_gnn.build_graph \
    --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
    --chrom "$CHROM" --res "$RES" \
    --bigwigs \
    "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
    "$DATA/raw/${CELL_UPPER}_H3K27me3.bw" \
    "$DATA/raw/${CELL_UPPER}_H3K4me3.bw" \
    --out "$OUT" || rc=$?

  if [ "$rc" -ne 0 ]; then
    # "$OUT" did not exist before this call (checked above), so whatever is
    # there now came from the failed run and must not satisfy the skip check.
    rm -f -- "$OUT"
    echo " [$CELL_LOWER $CHROM] build_graph failed (exit $rc); removed any partial output" >&2
  fi
  return "$rc"
}
#######################################
# Compute the compartments file for one cell line and chromosome.
# Skips the step when the output file already exists. If the command fails,
# any output it left behind is removed so that a later run does not mistake a
# partial file for a finished one.
# Globals:
#   REPO (read) - repository root; locates compute_compartments.py
#   DATA (read) - data root; inputs are read from "$DATA/raw"
#   RES  (read) - value forwarded to --res
# Arguments:
#   $1 - cell-line name as it appears in raw file names (e.g. GM12878)
#   $2 - cell-line label, used only in log messages (e.g. gm12878)
#   $3 - chromosome name (e.g. chr1)
#   $4 - path of the compartments file to write
# Returns:
#   0 if the output already existed or was written; otherwise the exit status
#   of compute_compartments.py.
#######################################
build_compartments() {
  local CELL_UPPER="$1" CELL_LOWER="$2" CHROM="$3" OUT="$4"
  local rc=0

  if [ -f "$OUT" ]; then
    echo " [$CELL_LOWER $CHROM] compartments exist, skip"
    return 0
  fi

  echo " [$CELL_LOWER $CHROM] computing compartments..."
  python "$REPO/experiments/h1_representation/compute_compartments.py" \
    --mcool "$DATA/raw/${CELL_UPPER}.mcool" \
    --chrom "$CHROM" --res "$RES" \
    --bigwig_orient "$DATA/raw/${CELL_UPPER}_CTCF.bw" \
    --out "$OUT" || rc=$?

  if [ "$rc" -ne 0 ]; then
    # "$OUT" did not exist before this call (checked above), so whatever is
    # there now came from the failed run and must not satisfy the skip check.
    rm -f -- "$OUT"
    echo " [$CELL_LOWER $CHROM] compute_compartments failed (exit $rc); removed any partial output" >&2
  fi
  return "$rc"
}
#######################################
# Build the processed HCT116 graph for one condition and chromosome.
# The output path is "$DATA/processed/hct116/<condition>_<chromosome>.pt".
# Skips the build when that file already exists. If the build command fails,
# any output it left behind is removed so that a later run does not mistake a
# partial file for a finished one.
# Globals:
#   DATA (read) - data root; inputs are read from "$DATA/raw"
#   RES  (read) - value forwarded to --res
# Arguments:
#   $1 - condition name as it appears in the .mcool file name (e.g. control)
#   $2 - chromosome name (e.g. chr1)
# Returns:
#   0 if the output already existed or was built; otherwise the exit status
#   of the build_graph command.
#######################################
build_hct116() {
  local COND="$1" CHROM="$2"
  local OUT="$DATA/processed/hct116/${COND}_${CHROM}.pt"
  local rc=0

  if [ -f "$OUT" ]; then
    echo " [hct116 $COND $CHROM] graph exists, skip"
    return 0
  fi

  echo " [hct116 $COND $CHROM] building graph..."
  # The bigwig tracks are shared by all conditions; only the .mcool differs.
  python -m chromatin_gnn.build_graph \
    --mcool "$DATA/raw/HCT116_${COND}.mcool" \
    --chrom "$CHROM" --res "$RES" \
    --bigwigs \
    "$DATA/raw/HCT116_CTCF.bw" \
    "$DATA/raw/HCT116_H3K27me3.bw" \
    "$DATA/raw/HCT116_H3K27ac.bw" \
    "$DATA/raw/HCT116_H3K4me3.bw" \
    --out "$OUT" || rc=$?

  if [ "$rc" -ne 0 ]; then
    # "$OUT" did not exist before this call (checked above), so whatever is
    # there now came from the failed run and must not satisfy the skip check.
    rm -f -- "$OUT"
    echo " [hct116 $COND $CHROM] build_graph failed (exit $rc); removed any partial output" >&2
  fi
  return "$rc"
}
# Epoch seconds at the start of the run, used for the elapsed-time summary.
START=$(date +%s)

for CHROM in "${CHROMS[@]}"; do
  printf '\n=== %s ===\n' "$CHROM"
  mkdir -p "$DATA/processed/gm12878" "$DATA/processed/imr90" "$DATA/processed/hct116"

  # H1 — GM12878 + IMR90: both graphs first, then both compartment files.
  # Each entry is "<name in raw files>:<label and output directory>".
  for cell_pair in GM12878:gm12878 IMR90:imr90; do
    build_one "${cell_pair%%:*}" "${cell_pair##*:}" "$CHROM" \
      "$DATA/processed/${cell_pair##*:}/${CHROM}.pt"
  done
  for cell_pair in GM12878:gm12878 IMR90:imr90; do
    build_compartments "${cell_pair%%:*}" "${cell_pair##*:}" "$CHROM" \
      "$COMPARTMENTS_DIR/${cell_pair##*:}_${CHROM}.csv"
  done

  # H2 — HCT-116 control + treated
  for hct_cond in control treated_6h; do
    build_hct116 "$hct_cond" "$CHROM"
  done
done

ELAPSED=$(( $(date +%s) - START ))
printf '\n=== Done in %ss. Processed graphs in %s/processed/, compartments in %s ===\n' \
  "$ELAPSED" "$DATA" "$COMPARTMENTS_DIR"
printf '%s\n' "Now rsync data/processed/ and results/h1_representation/compartments/ to Vast."