chromatin-vgae-hic/experiments/h3_longrange/evaluate_cross_ablation.py

#!/usr/bin/env python3
"""
H3 feature × edge cross-ablation evaluation.

Builds a 2×3 grid:

                       full edges    local (<250 kb)   long-range (>1 Mb)
    real features         A                B                   C
    constant features     D                E                   F

A–C come from the existing H1/H3 trainings (real features × edge subset).
D–F are new trainings with `--constant_features` on the same edge subsets.

For each cell we report:
  • Link prediction test AUC/AP  (from cell's metrics.json)
  • Compartment classification AUC/AP (5-fold CV LogReg on embeddings)
  • Spearman r(PC1, best latent dim)

The cross-ablation separates two confounded sources of signal in H3:
  • Feature signal: epigenetic marks themselves correlate with compartment;
    message-passing smooths them over adjacent bins regardless of topology.
  • Topology signal: information genuinely carried by graph structure.

Reading the table:
  • Δ (real − const) at fixed edges = contribution of features given that topology.
  • Δ across edges at fixed features = contribution of edge-band topology.
  • The bottom row alone shows what each edge-band carries from topology alone.

Usage
-----
  python experiments/h3_longrange/evaluate_cross_ablation.py \\
      --full_real_dir       results/h1_representation \\
      --local_real_dir      results/h3_longrange/local_only \\
      --longrange_real_dir  results/h3_longrange/longrange_only \\
      --full_const_dir      results/h3_longrange/full_const \\
      --local_const_dir     results/h3_longrange/local_const \\
      --longrange_const_dir results/h3_longrange/longrange_const \\
      --compartments        results/h1_representation/compartments/gm12878_chr1.csv \\
      --out                 results/h3_longrange/cross_ablation.json
"""

import argparse
import json
import os

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


def cv_auc(X, y, n_splits=5, seed=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    aucs, aps = [], []
    for tr, te in skf.split(X, y):
        sc = StandardScaler().fit(X[tr])
        lr = LogisticRegression(max_iter=1000, random_state=seed)
        lr.fit(sc.transform(X[tr]), y[tr])
        prob = lr.predict_proba(sc.transform(X[te]))[:, 1]
        aucs.append(roc_auc_score(y[te], prob))
        aps.append(average_precision_score(y[te], prob))
    return float(np.mean(aucs)), float(np.mean(aps))


def best_spearman(emb, pc1):
    rs = [abs(spearmanr(emb[:, d], pc1).statistic) for d in range(emb.shape[1])]
    return float(np.max(rs)), int(np.argmax(rs))


def _emb_name(d):
    # H1 uses gm12878_emb.npy; H3 cells use emb.npy
    for name in ("emb.npy", "gm12878_emb.npy"):
        p = os.path.join(d, name)
        if os.path.exists(p):
            return p
    raise FileNotFoundError(f"No emb file in {d}")


def evaluate_cell(emb_dir, pc1, y, mask):
    emb = np.load(_emb_name(emb_dir))
    with open(os.path.join(emb_dir, "metrics.json")) as f:
        m = json.load(f)
    emb_m = emb[mask]
    auc, ap = cv_auc(emb_m, y)
    r, dim  = best_spearman(emb_m, pc1)
    return {
        "link_pred_test_auc":   m.get("test_auc"),
        "link_pred_test_ap":    m.get("test_ap"),
        "compartment_auc_5fold": auc,
        "compartment_ap_5fold":  ap,
        "spearman_r_best":      r,
        "spearman_best_dim":    dim,
        "epochs_ran":           m.get("epochs_ran"),
        "hidden":               m.get("hidden"),
        "latent":               m.get("latent"),
        "constant_features":    m.get("constant_features", False),
        "in_features":          m.get("in_features"),
    }


def main():
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--full_real_dir",       required=True)
    ap.add_argument("--local_real_dir",      required=True)
    ap.add_argument("--longrange_real_dir",  required=True)
    ap.add_argument("--full_const_dir",      required=True)
    ap.add_argument("--local_const_dir",     required=True)
    ap.add_argument("--longrange_const_dir", required=True)
    ap.add_argument("--compartments",        required=True)
    ap.add_argument("--out",                 required=True)
    args = ap.parse_args()

    df = pd.read_csv(args.compartments)
    valid = df["compartment"].isin(["A", "B"]) & df["pc1"].notna()
    mask = valid.values
    pc1  = df.loc[valid, "pc1"].values
    y    = (df.loc[valid, "compartment"] == "A").astype(int).values
    print(f"Compartment labels: {mask.sum()} bins (A={y.sum()}, B={(~y.astype(bool)).sum()})")

    grid = {
        ("real",  "full"):       args.full_real_dir,
        ("real",  "local"):      args.local_real_dir,
        ("real",  "longrange"):  args.longrange_real_dir,
        ("const", "full"):       args.full_const_dir,
        ("const", "local"):      args.local_const_dir,
        ("const", "longrange"):  args.longrange_const_dir,
    }

    results = {}
    for (feat, edge), d in grid.items():
        key = f"{feat}_{edge}"
        results[key] = evaluate_cell(d, pc1, y, mask)
        r = results[key]
        print(f"\n=== {key} ({d}) ===")
        print(f"  in_features={r['in_features']}  hidden={r['hidden']}  epochs={r['epochs_ran']}")
        print(f"  Link AUC {r['link_pred_test_auc']:.4f}   Comp AUC {r['compartment_auc_5fold']:.4f}   "
              f"Spearman r {r['spearman_r_best']:.4f}")

    with open(args.out, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved → {args.out}")

    # 2×3 table
    print("\n=== Cross-ablation: compartment AUC (5-fold CV LogReg on embeddings) ===")
    print(f"{'features ↓ / edges →':<22} {'full':>10} {'local':>10} {'longrange':>12}")
    print("-" * 56)
    for feat in ("real", "const"):
        row = [results[f"{feat}_{e}"]["compartment_auc_5fold"] for e in ("full", "local", "longrange")]
        print(f"{feat:<22} {row[0]:>10.4f} {row[1]:>10.4f} {row[2]:>12.4f}")

    print("\n=== Feature contribution at fixed topology (real − const) ===")
    for e in ("full", "local", "longrange"):
        d = results[f"real_{e}"]["compartment_auc_5fold"] - results[f"const_{e}"]["compartment_auc_5fold"]
        print(f"  {e:<10}  Δ = {d:+.4f}")

    print("\n=== Topology contribution at constant features (vs random ~0.50) ===")
    for e in ("full", "local", "longrange"):
        v = results[f"const_{e}"]["compartment_auc_5fold"]
        d = v - 0.50
        print(f"  {e:<10}  AUC = {v:.4f}   Δ vs random = {d:+.4f}")

    print("\n=== Interpretation ===")
    lc = results["const_local"]["compartment_auc_5fold"]
    rc = results["const_longrange"]["compartment_auc_5fold"]
    if rc > lc + 0.02:
        print("  → Long-range topology > local topology under constant features.")
        print("    Original H3 result (local > longrange) was driven by feature smoothing,")
        print("    not by topology. H3 is supported once features are controlled for.")
    elif lc > rc + 0.02:
        print("  → Local topology > long-range topology even under constant features.")
        print("    H3 not supported. Local contacts dominate by topology, not by feature smoothing.")
    else:
        print("  → Local and long-range topology contribute equally under constant features.")
        print(f"    Δ = {rc - lc:+.4f} (within ±0.02 noise band).")


if __name__ == "__main__":
    main()