#!/usr/bin/env python3 """ H2 genome-wide aggregation. Reads per-chrom drift_stats.json files and produces: • Per-chromosome table (n_bins, anchor counts, drift means, perm p-value) • Mean drift contrast across chromosomes • Fisher's combined p-value across the 22 per-chrom permutation tests • Count of chromosomes individually significant at p < 0.05 Inputs (auto-discovered): results/h2_rewiring/chr*/drift_stats.json Outputs: results/h2_rewiring/genome_summary.csv # per-chrom rows + MEAN row results/h2_rewiring/genome_summary.json # combined statistics Usage: python experiments/h2_rewiring/aggregate.py """ import argparse import glob import json import os import re import numpy as np import pandas as pd from scipy.stats import combine_pvalues def _natural_chrom_key(name): m = re.match(r"chr(\d+)$", name) return int(m.group(1)) if m else 99 def main(): ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument("--results_base", default="results/h2_rewiring") ap.add_argument("--out_csv", default=None, help="Default: /genome_summary.csv") ap.add_argument("--out_json", default=None, help="Default: /genome_summary.json") args = ap.parse_args() out_csv = args.out_csv or os.path.join(args.results_base, "genome_summary.csv") out_json = args.out_json or os.path.join(args.results_base, "genome_summary.json") chr_dirs = sorted( [os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*")) if os.path.isdir(p)], key=_natural_chrom_key, ) if not chr_dirs: print(f"No per-chrom dirs found in {args.results_base}. " "Run experiments/h2_rewiring/run_genome.sh first.") return rows = [] for chrom in chr_dirs: path = os.path.join(args.results_base, chrom, "drift_stats.json") if not os.path.exists(path): print(f" {chrom}: drift_stats.json missing, skipping") continue with open(path) as f: s = json.load(f) rows.append({ "chrom": chrom, "n_bins": s.get("n_bins"), "n_anchor_bins": s.get("n_anchor_bins"), "drift_full_mean": s.get("drift_full_mean"), "drift_short_mean": s.get("drift_short_mean"), "drift_long_mean": s.get("drift_long_mean"), "anchor_sr_mean": s.get("obs_anchor_sr_mean"), "non_anchor_sr_mean": s.get("obs_non_sr_mean"), "perm_p_value": s.get("perm_p_value"), "n_perm": s.get("n_perm"), }) if not rows: print("No drift_stats.json files found.") return df = pd.DataFrame(rows) df["sr_anchor_minus_non"] = df["anchor_sr_mean"] - df["non_anchor_sr_mean"] # Fisher's combined p across per-chrom permutation tests. # Each chrom is an independent test (independent permutation null on # independent embeddings) → Fisher's method is appropriate. # Clip p at 1/n_perm to avoid log(0) when an observed value is more extreme # than all permutations. raw_p = df["perm_p_value"].dropna().values if len(raw_p): # If n_perm varies, use the smallest n_perm to set the lower clip n_perm = df["n_perm"].dropna() floor = 1.0 / (n_perm.min() if len(n_perm) else 1000) clipped = np.clip(raw_p, floor, 1.0) fisher_stat, fisher_p = combine_pvalues(clipped, method="fisher") else: fisher_stat, fisher_p, floor = np.nan, np.nan, np.nan # Mean ± SD summary row summary_row = {"chrom": "MEAN ± SD"} for col in df.columns: if col == "chrom": continue v = pd.to_numeric(df[col], errors="coerce").dropna() if col in ("n_bins", "n_anchor_bins", "n_perm"): summary_row[col] = f"sum={int(v.sum())}" if len(v) else "" elif len(v): summary_row[col] = f"{v.mean():.4f} ± {v.std():.4f}" else: summary_row[col] = "" df_out = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True) df_out.to_csv(out_csv, index=False) summary = { "n_chroms": int(len(df)), "n_chroms_signif_p<0.05": int((df["perm_p_value"] < 0.05).sum()), "n_chroms_signif_p<0.01": int((df["perm_p_value"] < 0.01).sum()), "fisher_combined_p_value": float(fisher_p), "fisher_chi2_statistic": float(fisher_stat), "p_value_floor_applied": float(floor), "mean_drift_full": float(df["drift_full_mean"].mean()), "mean_drift_short": float(df["drift_short_mean"].mean()), "mean_drift_long": float(df["drift_long_mean"].mean()), "mean_anchor_short_range": float(df["anchor_sr_mean"].mean()), "mean_non_anchor_short_range": float(df["non_anchor_sr_mean"].mean()), "mean_sr_anchor_minus_non": float(df["sr_anchor_minus_non"].mean()), "n_chroms_with_positive_contrast": int((df["sr_anchor_minus_non"] > 0).sum()), } with open(out_json, "w") as f: json.dump(summary, f, indent=2) print(f"Per-chrom CSV → {out_csv}") print(f"Summary JSON → {out_json}") print("\n" + df_out.to_string(index=False)) print("\n=== Genome-wide summary ===") print(json.dumps(summary, indent=2)) if __name__ == "__main__": main()