169 lines
5.7 KiB
Python
Executable File
169 lines
5.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
H1 genome-wide aggregation.
|
|
|
|
Reads per-chromosome H1 outputs and computes:
|
|
• Link prediction test AUC/AP per chrom (from metrics.json)
|
|
• Compartment recovery AUC per chrom (5-fold CV LogReg on GM12878 embeddings)
|
|
• IMR90 zero-shot compartment AUC per chrom
|
|
• Spearman r (PC1 vs best latent dim)
|
|
• Mean ± SD across autosomes
|
|
|
|
Inputs (auto-discovered):
|
|
results/h1_representation/chr*/metrics.json
|
|
results/h1_representation/chr*/gm12878_emb.npy
|
|
results/h1_representation/chr*/imr90_emb.npy
|
|
results/h1_representation/compartments/{gm12878,imr90}_chr*.csv
|
|
|
|
Outputs:
|
|
results/h1_representation/genome_summary.csv
|
|
|
|
Usage:
|
|
python experiments/h1_representation/aggregate.py
|
|
python experiments/h1_representation/aggregate.py --results_base results/h1_representation
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from scipy.stats import spearmanr
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.metrics import roc_auc_score, average_precision_score
|
|
from sklearn.model_selection import StratifiedKFold
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
def cv_auc_ap(X, y, n_splits=5, seed=42):
|
|
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
|
|
aucs, aps = [], []
|
|
for tr, te in skf.split(X, y):
|
|
sc = StandardScaler().fit(X[tr])
|
|
lr = LogisticRegression(max_iter=1000, random_state=seed)
|
|
lr.fit(sc.transform(X[tr]), y[tr])
|
|
prob = lr.predict_proba(sc.transform(X[te]))[:, 1]
|
|
aucs.append(roc_auc_score(y[te], prob))
|
|
aps.append(average_precision_score(y[te], prob))
|
|
return float(np.mean(aucs)), float(np.mean(aps))
|
|
|
|
|
|
def best_spearman(emb, pc1):
|
|
rs = [abs(spearmanr(emb[:, d], pc1).statistic) for d in range(emb.shape[1])]
|
|
return float(np.max(rs))
|
|
|
|
|
|
def _load_compartments(path):
|
|
df = pd.read_csv(path)
|
|
valid = df["compartment"].isin(["A", "B"]) & df["pc1"].notna()
|
|
if valid.sum() < 100:
|
|
return None
|
|
return {
|
|
"mask": valid.values,
|
|
"pc1": df.loc[valid, "pc1"].values,
|
|
"y": (df.loc[valid, "compartment"] == "A").astype(int).values,
|
|
}
|
|
|
|
|
|
def evaluate_chrom(chrom, results_base, comp_dir):
|
|
chr_dir = os.path.join(results_base, chrom)
|
|
metrics_path = os.path.join(chr_dir, "metrics.json")
|
|
if not os.path.exists(metrics_path):
|
|
return None
|
|
with open(metrics_path) as f:
|
|
m = json.load(f)
|
|
|
|
row = {
|
|
"chrom": chrom,
|
|
"link_auc": m.get("test_auc"),
|
|
"link_ap": m.get("test_ap"),
|
|
"epochs_ran": m.get("epochs_ran"),
|
|
}
|
|
|
|
# GM12878 in-domain compartment recovery.
|
|
# Genome-wide runs save "emb.npy"; migrated chr1 has "gm12878_emb.npy".
|
|
gm_emb_path = os.path.join(chr_dir, "emb.npy")
|
|
if not os.path.exists(gm_emb_path):
|
|
gm_emb_path = os.path.join(chr_dir, "gm12878_emb.npy")
|
|
gm_comp = _load_compartments(os.path.join(comp_dir, f"gm12878_{chrom}.csv"))
|
|
if os.path.exists(gm_emb_path) and gm_comp is not None:
|
|
emb = np.load(gm_emb_path)[gm_comp["mask"]]
|
|
auc, ap = cv_auc_ap(emb, gm_comp["y"])
|
|
row["gm12878_n_bins"] = int(gm_comp["mask"].sum())
|
|
row["gm12878_comp_auc"] = auc
|
|
row["gm12878_comp_ap"] = ap
|
|
row["gm12878_spearman"] = best_spearman(emb, gm_comp["pc1"])
|
|
|
|
# IMR90 zero-shot
|
|
imr_emb_path = os.path.join(chr_dir, "imr90_emb.npy")
|
|
imr_comp = _load_compartments(os.path.join(comp_dir, f"imr90_{chrom}.csv"))
|
|
if os.path.exists(imr_emb_path) and imr_comp is not None:
|
|
emb = np.load(imr_emb_path)[imr_comp["mask"]]
|
|
auc, ap = cv_auc_ap(emb, imr_comp["y"])
|
|
row["imr90_n_bins"] = int(imr_comp["mask"].sum())
|
|
row["imr90_zs_comp_auc"] = auc
|
|
row["imr90_zs_spearman"] = best_spearman(emb, imr_comp["pc1"])
|
|
|
|
return row
|
|
|
|
|
|
def _natural_chrom_key(name):
|
|
m = re.match(r"chr(\d+)$", name)
|
|
return int(m.group(1)) if m else 99
|
|
|
|
|
|
def main():
|
|
ap = argparse.ArgumentParser(description=__doc__,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
ap.add_argument("--results_base", default="results/h1_representation")
|
|
ap.add_argument("--out", default=None,
|
|
help="Output CSV. Default: <results_base>/genome_summary.csv")
|
|
args = ap.parse_args()
|
|
|
|
out_path = args.out or os.path.join(args.results_base, "genome_summary.csv")
|
|
comp_dir = os.path.join(args.results_base, "compartments")
|
|
|
|
chr_dirs = sorted(
|
|
[os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*"))
|
|
if os.path.isdir(p)],
|
|
key=_natural_chrom_key,
|
|
)
|
|
if not chr_dirs:
|
|
print(f"No per-chrom dirs found in {args.results_base}. "
|
|
"Run experiments/h1_representation/run_genome.sh first.")
|
|
return
|
|
|
|
rows = []
|
|
for chrom in chr_dirs:
|
|
print(f"Evaluating {chrom}...")
|
|
row = evaluate_chrom(chrom, args.results_base, comp_dir)
|
|
if row:
|
|
rows.append(row)
|
|
|
|
df = pd.DataFrame(rows)
|
|
|
|
# Mean ± SD summary row
|
|
summary = {"chrom": "MEAN ± SD"}
|
|
for col in df.columns:
|
|
if col == "chrom":
|
|
continue
|
|
v = pd.to_numeric(df[col], errors="coerce").dropna()
|
|
if col.endswith("_n_bins") or col == "epochs_ran":
|
|
summary[col] = f"sum={int(v.sum())}" if len(v) else ""
|
|
elif len(v):
|
|
summary[col] = f"{v.mean():.3f} ± {v.std():.3f}"
|
|
else:
|
|
summary[col] = ""
|
|
df_out = pd.concat([df, pd.DataFrame([summary])], ignore_index=True)
|
|
|
|
df_out.to_csv(out_path, index=False)
|
|
print(f"\nSaved → {out_path}")
|
|
print("\n" + df_out.to_string(index=False))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|