#!/usr/bin/env python3 """ H1 genome-wide aggregation. Reads per-chromosome H1 outputs and computes: • Link prediction test AUC/AP per chrom (from metrics.json) • Compartment recovery AUC per chrom (5-fold CV LogReg on GM12878 embeddings) • IMR90 zero-shot compartment AUC per chrom • Spearman r (PC1 vs best latent dim) • Mean ± SD across autosomes Inputs (auto-discovered): results/h1_representation/chr*/metrics.json results/h1_representation/chr*/gm12878_emb.npy results/h1_representation/chr*/imr90_emb.npy results/h1_representation/compartments/{gm12878,imr90}_chr*.csv Outputs: results/h1_representation/genome_summary.csv Usage: python experiments/h1_representation/aggregate.py python experiments/h1_representation/aggregate.py --results_base results/h1_representation """ import argparse import glob import json import os import re import numpy as np import pandas as pd from scipy.stats import spearmanr from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import StandardScaler def cv_auc_ap(X, y, n_splits=5, seed=42): skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) aucs, aps = [], [] for tr, te in skf.split(X, y): sc = StandardScaler().fit(X[tr]) lr = LogisticRegression(max_iter=1000, random_state=seed) lr.fit(sc.transform(X[tr]), y[tr]) prob = lr.predict_proba(sc.transform(X[te]))[:, 1] aucs.append(roc_auc_score(y[te], prob)) aps.append(average_precision_score(y[te], prob)) return float(np.mean(aucs)), float(np.mean(aps)) def best_spearman(emb, pc1): rs = [abs(spearmanr(emb[:, d], pc1).statistic) for d in range(emb.shape[1])] return float(np.max(rs)) def _load_compartments(path): df = pd.read_csv(path) valid = df["compartment"].isin(["A", "B"]) & df["pc1"].notna() if valid.sum() < 100: return None return { "mask": valid.values, "pc1": df.loc[valid, "pc1"].values, "y": (df.loc[valid, "compartment"] == "A").astype(int).values, } def evaluate_chrom(chrom, results_base, comp_dir): chr_dir = os.path.join(results_base, chrom) metrics_path = os.path.join(chr_dir, "metrics.json") if not os.path.exists(metrics_path): return None with open(metrics_path) as f: m = json.load(f) row = { "chrom": chrom, "link_auc": m.get("test_auc"), "link_ap": m.get("test_ap"), "epochs_ran": m.get("epochs_ran"), } # GM12878 in-domain compartment recovery. # Genome-wide runs save "emb.npy"; migrated chr1 has "gm12878_emb.npy". gm_emb_path = os.path.join(chr_dir, "emb.npy") if not os.path.exists(gm_emb_path): gm_emb_path = os.path.join(chr_dir, "gm12878_emb.npy") gm_comp = _load_compartments(os.path.join(comp_dir, f"gm12878_{chrom}.csv")) if os.path.exists(gm_emb_path) and gm_comp is not None: emb = np.load(gm_emb_path)[gm_comp["mask"]] auc, ap = cv_auc_ap(emb, gm_comp["y"]) row["gm12878_n_bins"] = int(gm_comp["mask"].sum()) row["gm12878_comp_auc"] = auc row["gm12878_comp_ap"] = ap row["gm12878_spearman"] = best_spearman(emb, gm_comp["pc1"]) # IMR90 zero-shot imr_emb_path = os.path.join(chr_dir, "imr90_emb.npy") imr_comp = _load_compartments(os.path.join(comp_dir, f"imr90_{chrom}.csv")) if os.path.exists(imr_emb_path) and imr_comp is not None: emb = np.load(imr_emb_path)[imr_comp["mask"]] auc, ap = cv_auc_ap(emb, imr_comp["y"]) row["imr90_n_bins"] = int(imr_comp["mask"].sum()) row["imr90_zs_comp_auc"] = auc row["imr90_zs_spearman"] = best_spearman(emb, imr_comp["pc1"]) return row def _natural_chrom_key(name): m = re.match(r"chr(\d+)$", name) return int(m.group(1)) if m else 99 def main(): ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument("--results_base", default="results/h1_representation") ap.add_argument("--out", default=None, help="Output CSV. Default: /genome_summary.csv") args = ap.parse_args() out_path = args.out or os.path.join(args.results_base, "genome_summary.csv") comp_dir = os.path.join(args.results_base, "compartments") chr_dirs = sorted( [os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*")) if os.path.isdir(p)], key=_natural_chrom_key, ) if not chr_dirs: print(f"No per-chrom dirs found in {args.results_base}. " "Run experiments/h1_representation/run_genome.sh first.") return rows = [] for chrom in chr_dirs: print(f"Evaluating {chrom}...") row = evaluate_chrom(chrom, args.results_base, comp_dir) if row: rows.append(row) df = pd.DataFrame(rows) # Mean ± SD summary row summary = {"chrom": "MEAN ± SD"} for col in df.columns: if col == "chrom": continue v = pd.to_numeric(df[col], errors="coerce").dropna() if col.endswith("_n_bins") or col == "epochs_ran": summary[col] = f"sum={int(v.sum())}" if len(v) else "" elif len(v): summary[col] = f"{v.mean():.3f} ± {v.std():.3f}" else: summary[col] = "" df_out = pd.concat([df, pd.DataFrame([summary])], ignore_index=True) df_out.to_csv(out_path, index=False) print(f"\nSaved → {out_path}") print("\n" + df_out.to_string(index=False)) if __name__ == "__main__": main()