147 lines
5.5 KiB
Python
Executable File
147 lines
5.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
H2 genome-wide aggregation.
|
|
|
|
Reads per-chrom drift_stats.json files and produces:
|
|
• Per-chromosome table (n_bins, anchor counts, drift means, perm p-value)
|
|
• Mean drift contrast across chromosomes
|
|
• Fisher's combined p-value across the 22 per-chrom permutation tests
|
|
• Count of chromosomes individually significant at p < 0.05
|
|
|
|
Inputs (auto-discovered):
|
|
results/h2_rewiring/chr*/drift_stats.json
|
|
|
|
Outputs:
|
|
results/h2_rewiring/genome_summary.csv # per-chrom rows + MEAN row
|
|
results/h2_rewiring/genome_summary.json # combined statistics
|
|
|
|
Usage:
|
|
python experiments/h2_rewiring/aggregate.py
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from scipy.stats import combine_pvalues
|
|
|
|
|
|
def _natural_chrom_key(name):
|
|
m = re.match(r"chr(\d+)$", name)
|
|
return int(m.group(1)) if m else 99
|
|
|
|
|
|
def main():
|
|
ap = argparse.ArgumentParser(description=__doc__,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
ap.add_argument("--results_base", default="results/h2_rewiring")
|
|
ap.add_argument("--out_csv", default=None,
|
|
help="Default: <results_base>/genome_summary.csv")
|
|
ap.add_argument("--out_json", default=None,
|
|
help="Default: <results_base>/genome_summary.json")
|
|
args = ap.parse_args()
|
|
|
|
out_csv = args.out_csv or os.path.join(args.results_base, "genome_summary.csv")
|
|
out_json = args.out_json or os.path.join(args.results_base, "genome_summary.json")
|
|
|
|
chr_dirs = sorted(
|
|
[os.path.basename(p) for p in glob.glob(os.path.join(args.results_base, "chr*"))
|
|
if os.path.isdir(p)],
|
|
key=_natural_chrom_key,
|
|
)
|
|
if not chr_dirs:
|
|
print(f"No per-chrom dirs found in {args.results_base}. "
|
|
"Run experiments/h2_rewiring/run_genome.sh first.")
|
|
return
|
|
|
|
rows = []
|
|
for chrom in chr_dirs:
|
|
path = os.path.join(args.results_base, chrom, "drift_stats.json")
|
|
if not os.path.exists(path):
|
|
print(f" {chrom}: drift_stats.json missing, skipping")
|
|
continue
|
|
with open(path) as f:
|
|
s = json.load(f)
|
|
rows.append({
|
|
"chrom": chrom,
|
|
"n_bins": s.get("n_bins"),
|
|
"n_anchor_bins": s.get("n_anchor_bins"),
|
|
"drift_full_mean": s.get("drift_full_mean"),
|
|
"drift_short_mean": s.get("drift_short_mean"),
|
|
"drift_long_mean": s.get("drift_long_mean"),
|
|
"anchor_sr_mean": s.get("obs_anchor_sr_mean"),
|
|
"non_anchor_sr_mean": s.get("obs_non_sr_mean"),
|
|
"perm_p_value": s.get("perm_p_value"),
|
|
"n_perm": s.get("n_perm"),
|
|
})
|
|
|
|
if not rows:
|
|
print("No drift_stats.json files found.")
|
|
return
|
|
|
|
df = pd.DataFrame(rows)
|
|
df["sr_anchor_minus_non"] = df["anchor_sr_mean"] - df["non_anchor_sr_mean"]
|
|
|
|
# Fisher's combined p across per-chrom permutation tests.
|
|
# Each chrom is an independent test (independent permutation null on
|
|
# independent embeddings) → Fisher's method is appropriate.
|
|
# Clip p at 1/n_perm to avoid log(0) when an observed value is more extreme
|
|
# than all permutations.
|
|
raw_p = df["perm_p_value"].dropna().values
|
|
if len(raw_p):
|
|
# If n_perm varies, use the smallest n_perm to set the lower clip
|
|
n_perm = df["n_perm"].dropna()
|
|
floor = 1.0 / (n_perm.min() if len(n_perm) else 1000)
|
|
clipped = np.clip(raw_p, floor, 1.0)
|
|
fisher_stat, fisher_p = combine_pvalues(clipped, method="fisher")
|
|
else:
|
|
fisher_stat, fisher_p, floor = np.nan, np.nan, np.nan
|
|
|
|
# Mean ± SD summary row
|
|
summary_row = {"chrom": "MEAN ± SD"}
|
|
for col in df.columns:
|
|
if col == "chrom":
|
|
continue
|
|
v = pd.to_numeric(df[col], errors="coerce").dropna()
|
|
if col in ("n_bins", "n_anchor_bins", "n_perm"):
|
|
summary_row[col] = f"sum={int(v.sum())}" if len(v) else ""
|
|
elif len(v):
|
|
summary_row[col] = f"{v.mean():.4f} ± {v.std():.4f}"
|
|
else:
|
|
summary_row[col] = ""
|
|
df_out = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)
|
|
df_out.to_csv(out_csv, index=False)
|
|
|
|
summary = {
|
|
"n_chroms": int(len(df)),
|
|
"n_chroms_signif_p<0.05": int((df["perm_p_value"] < 0.05).sum()),
|
|
"n_chroms_signif_p<0.01": int((df["perm_p_value"] < 0.01).sum()),
|
|
"fisher_combined_p_value": float(fisher_p),
|
|
"fisher_chi2_statistic": float(fisher_stat),
|
|
"p_value_floor_applied": float(floor),
|
|
"mean_drift_full": float(df["drift_full_mean"].mean()),
|
|
"mean_drift_short": float(df["drift_short_mean"].mean()),
|
|
"mean_drift_long": float(df["drift_long_mean"].mean()),
|
|
"mean_anchor_short_range": float(df["anchor_sr_mean"].mean()),
|
|
"mean_non_anchor_short_range": float(df["non_anchor_sr_mean"].mean()),
|
|
"mean_sr_anchor_minus_non": float(df["sr_anchor_minus_non"].mean()),
|
|
"n_chroms_with_positive_contrast":
|
|
int((df["sr_anchor_minus_non"] > 0).sum()),
|
|
}
|
|
with open(out_json, "w") as f:
|
|
json.dump(summary, f, indent=2)
|
|
|
|
print(f"Per-chrom CSV → {out_csv}")
|
|
print(f"Summary JSON → {out_json}")
|
|
print("\n" + df_out.to_string(index=False))
|
|
print("\n=== Genome-wide summary ===")
|
|
print(json.dumps(summary, indent=2))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|