Files

147 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
H2 genome-wide aggregation.
Reads per-chrom drift_stats.json files and produces:
• Per-chromosome table (n_bins, anchor counts, drift means, perm p-value)
• Mean drift contrast across chromosomes
• Fisher's combined p-value across the 22 per-chrom permutation tests
• Count of chromosomes individually significant at p < 0.05
Inputs (auto-discovered):
results/h2_rewiring/chr*/drift_stats.json
Outputs:
results/h2_rewiring/genome_summary.csv # per-chrom rows + MEAN row
results/h2_rewiring/genome_summary.json # combined statistics
Usage:
python experiments/h2_rewiring/aggregate.py
"""
import argparse
import glob
import json
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import combine_pvalues
def _natural_chrom_key(name):
m = re.match(r"chr(\d+)$", name)
return int(m.group(1)) if m else 99
def main():
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument("--results_base", default="results/h2_rewiring")
ap.add_argument("--out_csv", default=None,
help="Default: <results_base>/genome_summary.csv")
ap.add_argument("--out_json", default=None,
help="Default: <results_base>/genome_summary.json")
args = ap.parse_args()
out_csv = args.out_csv or os.path.join(args.results_base, "genome_summary.csv")
out_json = args.out_json or os.path.join(args.results_base, "genome_summary.json")
chr_dirs = sorted(
[os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*"))
if os.path.isdir(p)],
key=_natural_chrom_key,
)
if not chr_dirs:
print(f"No per-chrom dirs found in {args.results_base}. "
"Run experiments/h2_rewiring/run_genome.sh first.")
return
rows = []
for chrom in chr_dirs:
path = os.path.join(args.results_base, chrom, "drift_stats.json")
if not os.path.exists(path):
print(f" {chrom}: drift_stats.json missing, skipping")
continue
with open(path) as f:
s = json.load(f)
rows.append({
"chrom": chrom,
"n_bins": s.get("n_bins"),
"n_anchor_bins": s.get("n_anchor_bins"),
"drift_full_mean": s.get("drift_full_mean"),
"drift_short_mean": s.get("drift_short_mean"),
"drift_long_mean": s.get("drift_long_mean"),
"anchor_sr_mean": s.get("obs_anchor_sr_mean"),
"non_anchor_sr_mean": s.get("obs_non_sr_mean"),
"perm_p_value": s.get("perm_p_value"),
"n_perm": s.get("n_perm"),
})
if not rows:
print("No drift_stats.json files found.")
return
df = pd.DataFrame(rows)
df["sr_anchor_minus_non"] = df["anchor_sr_mean"] - df["non_anchor_sr_mean"]
# Fisher's combined p across per-chrom permutation tests.
# Each chrom is an independent test (independent permutation null on
# independent embeddings) → Fisher's method is appropriate.
# Clip p at 1/n_perm to avoid log(0) when an observed value is more extreme
# than all permutations.
raw_p = df["perm_p_value"].dropna().values
if len(raw_p):
# If n_perm varies, use the smallest n_perm to set the lower clip
n_perm = df["n_perm"].dropna()
floor = 1.0 / (n_perm.min() if len(n_perm) else 1000)
clipped = np.clip(raw_p, floor, 1.0)
fisher_stat, fisher_p = combine_pvalues(clipped, method="fisher")
else:
fisher_stat, fisher_p, floor = np.nan, np.nan, np.nan
# Mean ± SD summary row
summary_row = {"chrom": "MEAN ± SD"}
for col in df.columns:
if col == "chrom":
continue
v = pd.to_numeric(df[col], errors="coerce").dropna()
if col in ("n_bins", "n_anchor_bins", "n_perm"):
summary_row[col] = f"sum={int(v.sum())}" if len(v) else ""
elif len(v):
summary_row[col] = f"{v.mean():.4f} ± {v.std():.4f}"
else:
summary_row[col] = ""
df_out = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)
df_out.to_csv(out_csv, index=False)
summary = {
"n_chroms": int(len(df)),
"n_chroms_signif_p<0.05": int((df["perm_p_value"] < 0.05).sum()),
"n_chroms_signif_p<0.01": int((df["perm_p_value"] < 0.01).sum()),
"fisher_combined_p_value": float(fisher_p),
"fisher_chi2_statistic": float(fisher_stat),
"p_value_floor_applied": float(floor),
"mean_drift_full": float(df["drift_full_mean"].mean()),
"mean_drift_short": float(df["drift_short_mean"].mean()),
"mean_drift_long": float(df["drift_long_mean"].mean()),
"mean_anchor_short_range": float(df["anchor_sr_mean"].mean()),
"mean_non_anchor_short_range": float(df["non_anchor_sr_mean"].mean()),
"mean_sr_anchor_minus_non": float(df["sr_anchor_minus_non"].mean()),
"n_chroms_with_positive_contrast":
int((df["sr_anchor_minus_non"] > 0).sum()),
}
with open(out_json, "w") as f:
json.dump(summary, f, indent=2)
print(f"Per-chrom CSV → {out_csv}")
print(f"Summary JSON → {out_json}")
print("\n" + df_out.to_string(index=False))
print("\n=== Genome-wide summary ===")
print(json.dumps(summary, indent=2))
if __name__ == "__main__":
main()