Files

169 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
H1 genome-wide aggregation.
Reads per-chromosome H1 outputs and computes:
• Link prediction test AUC/AP per chrom (from metrics.json)
• Compartment recovery AUC per chrom (5-fold CV LogReg on GM12878 embeddings)
• IMR90 zero-shot compartment AUC per chrom
• Spearman r (PC1 vs best latent dim)
• Mean ± SD across autosomes
Inputs (auto-discovered):
results/h1_representation/chr*/metrics.json
results/h1_representation/chr*/gm12878_emb.npy
results/h1_representation/chr*/imr90_emb.npy
results/h1_representation/compartments/{gm12878,imr90}_chr*.csv
Outputs:
results/h1_representation/genome_summary.csv
Usage:
python experiments/h1_representation/aggregate.py
python experiments/h1_representation/aggregate.py --results_base results/h1_representation
"""
import argparse
import glob
import json
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
def cv_auc_ap(X, y, n_splits=5, seed=42):
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
aucs, aps = [], []
for tr, te in skf.split(X, y):
sc = StandardScaler().fit(X[tr])
lr = LogisticRegression(max_iter=1000, random_state=seed)
lr.fit(sc.transform(X[tr]), y[tr])
prob = lr.predict_proba(sc.transform(X[te]))[:, 1]
aucs.append(roc_auc_score(y[te], prob))
aps.append(average_precision_score(y[te], prob))
return float(np.mean(aucs)), float(np.mean(aps))
def best_spearman(emb, pc1):
rs = [abs(spearmanr(emb[:, d], pc1).statistic) for d in range(emb.shape[1])]
return float(np.max(rs))
def _load_compartments(path):
df = pd.read_csv(path)
valid = df["compartment"].isin(["A", "B"]) & df["pc1"].notna()
if valid.sum() < 100:
return None
return {
"mask": valid.values,
"pc1": df.loc[valid, "pc1"].values,
"y": (df.loc[valid, "compartment"] == "A").astype(int).values,
}
def evaluate_chrom(chrom, results_base, comp_dir):
chr_dir = os.path.join(results_base, chrom)
metrics_path = os.path.join(chr_dir, "metrics.json")
if not os.path.exists(metrics_path):
return None
with open(metrics_path) as f:
m = json.load(f)
row = {
"chrom": chrom,
"link_auc": m.get("test_auc"),
"link_ap": m.get("test_ap"),
"epochs_ran": m.get("epochs_ran"),
}
# GM12878 in-domain compartment recovery.
# Genome-wide runs save "emb.npy"; migrated chr1 has "gm12878_emb.npy".
gm_emb_path = os.path.join(chr_dir, "emb.npy")
if not os.path.exists(gm_emb_path):
gm_emb_path = os.path.join(chr_dir, "gm12878_emb.npy")
gm_comp = _load_compartments(os.path.join(comp_dir, f"gm12878_{chrom}.csv"))
if os.path.exists(gm_emb_path) and gm_comp is not None:
emb = np.load(gm_emb_path)[gm_comp["mask"]]
auc, ap = cv_auc_ap(emb, gm_comp["y"])
row["gm12878_n_bins"] = int(gm_comp["mask"].sum())
row["gm12878_comp_auc"] = auc
row["gm12878_comp_ap"] = ap
row["gm12878_spearman"] = best_spearman(emb, gm_comp["pc1"])
# IMR90 zero-shot
imr_emb_path = os.path.join(chr_dir, "imr90_emb.npy")
imr_comp = _load_compartments(os.path.join(comp_dir, f"imr90_{chrom}.csv"))
if os.path.exists(imr_emb_path) and imr_comp is not None:
emb = np.load(imr_emb_path)[imr_comp["mask"]]
auc, ap = cv_auc_ap(emb, imr_comp["y"])
row["imr90_n_bins"] = int(imr_comp["mask"].sum())
row["imr90_zs_comp_auc"] = auc
row["imr90_zs_spearman"] = best_spearman(emb, imr_comp["pc1"])
return row
def _natural_chrom_key(name):
m = re.match(r"chr(\d+)$", name)
return int(m.group(1)) if m else 99
def main():
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument("--results_base", default="results/h1_representation")
ap.add_argument("--out", default=None,
help="Output CSV. Default: <results_base>/genome_summary.csv")
args = ap.parse_args()
out_path = args.out or os.path.join(args.results_base, "genome_summary.csv")
comp_dir = os.path.join(args.results_base, "compartments")
chr_dirs = sorted(
[os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*"))
if os.path.isdir(p)],
key=_natural_chrom_key,
)
if not chr_dirs:
print(f"No per-chrom dirs found in {args.results_base}. "
"Run experiments/h1_representation/run_genome.sh first.")
return
rows = []
for chrom in chr_dirs:
print(f"Evaluating {chrom}...")
row = evaluate_chrom(chrom, args.results_base, comp_dir)
if row:
rows.append(row)
df = pd.DataFrame(rows)
# Mean ± SD summary row
summary = {"chrom": "MEAN ± SD"}
for col in df.columns:
if col == "chrom":
continue
v = pd.to_numeric(df[col], errors="coerce").dropna()
if col.endswith("_n_bins") or col == "epochs_ran":
summary[col] = f"sum={int(v.sum())}" if len(v) else ""
elif len(v):
summary[col] = f"{v.mean():.3f} ± {v.std():.3f}"
else:
summary[col] = ""
df_out = pd.concat([df, pd.DataFrame([summary])], ignore_index=True)
df_out.to_csv(out_path, index=False)
print(f"\nSaved → {out_path}")
print("\n" + df_out.to_string(index=False))
if __name__ == "__main__":
main()