#!/usr/bin/env python3
"""
Call chromatin loops from a control mcool file using cooltools dots.

Loop calls are used as the "known lost anchor" positive set for perturbation
drift validation. Calling loops ourselves is justified because Rao et al. 2017
(Cell 171:305-320) demonstrates that ALL cohesin-dependent loops are eliminated
upon RAD21 depletion — therefore every HiCCUPS/dots-called control loop is a
valid member of the perturbed set.

Outputs a bedpe-format file with loop coordinates.
"""

import argparse
import sys

import cooler
import numpy as np
import pandas as pd
import cooltools.api.expected as cte
import cooltools.api.dotfinder as ctd

# Coordinate columns that are written first, in this order, when present.
_BEDPE_COLS = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]

# Expected-value columns in order of preference; the first one present wins.
# cooltools 0.7.x: smoothed expected column name
_EXPECTED_COL_PREFERENCE = ("balanced.avg.smoothed.agg", "balanced.avg.smoothed")
_EXPECTED_COL_FALLBACK = "balanced.avg"


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; the module docstring is the help text."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--mcool", required=True, help="Control .mcool file")
    ap.add_argument("--chrom", default="chr1")
    ap.add_argument("--res", type=int, default=10_000,
                    help="Resolution in bp (default 10 kb)")
    ap.add_argument("--max_dist", type=int, default=3_000_000,
                    help="Max loop span in bp (default 3 Mb)")
    ap.add_argument("--min_dist", type=int, default=100_000,
                    help="Min loop span in bp; shorter calls are dropped "
                         "(default 100 kb)")
    ap.add_argument("--fdr", type=float, default=0.1,
                    help="FDR threshold for loop calls (default 0.1)")
    ap.add_argument("--nproc", type=int, default=4)
    ap.add_argument("--out", required=True,
                    help="Output bedpe file (tab-separated)")
    return ap


def _pick_expected_column(columns) -> str:
    """Return the most-smoothed expected column in *columns*, else the fallback."""
    return next(
        (name for name in _EXPECTED_COL_PREFERENCE if name in columns),
        _EXPECTED_COL_FALLBACK,
    )


def _restrict_to_chrom(dots_df: pd.DataFrame, chrom: str) -> pd.DataFrame:
    """Keep only rows whose two anchors both lie on *chrom*."""
    if "chrom1" not in dots_df.columns:
        return dots_df
    on_chrom = (dots_df["chrom1"] == chrom) & (dots_df["chrom2"] == chrom)
    return dots_df[on_chrom].copy()


def _drop_short_loops(dots_df: pd.DataFrame, min_dist: int) -> pd.DataFrame:
    """Drop rows whose anchor start-to-start distance is below *min_dist* bp."""
    span = (dots_df["start2"] - dots_df["start1"]).abs()
    return dots_df[span >= min_dist]


def _ordered_columns(dots_df: pd.DataFrame) -> list:
    """Return column names with the bedpe coordinate columns first."""
    coords = [c for c in _BEDPE_COLS if c in dots_df.columns]
    others = [c for c in dots_df.columns if c not in _BEDPE_COLS]
    return coords + others


def main():
    """Call loops on one chromosome of an mcool file and write them to --out.

    Steps: open the cooler at the requested resolution, compute cis expected
    over the whole chromosome, run the cooltools dot caller, keep calls on
    the requested chromosome spanning at least --min_dist, and write a
    tab-separated table.

    Exits with status 1 if --chrom is not present in the cooler.
    """
    args = _build_parser().parse_args()

    uri = f"{args.mcool}::resolutions/{args.res}"
    print(f"Loading {uri} ...")
    clr = cooler.Cooler(uri)

    if args.chrom not in clr.chromsizes.index:
        print(f"ERROR: {args.chrom} not in mcool. Available: "
              f"{list(clr.chromsizes.index[:5])}", file=sys.stderr)
        sys.exit(1)

    # Single-region view covering the full requested chromosome.
    view_df = pd.DataFrame([{
        "chrom": args.chrom,
        "start": 0,
        "end": int(clr.chromsizes[args.chrom]),
        "name": args.chrom,
    }])

    print(f"Computing expected contacts ({args.chrom}, {args.nproc} proc) ...")
    expected = cte.expected_cis(clr, view_df=view_df, nproc=args.nproc)
    print(f"Expected columns: {list(expected.columns)}")

    exp_col = _pick_expected_column(expected.columns)
    print(f"Using expected value column: {exp_col}")

    print(f"Calling loops (max_dist={args.max_dist} bp, FDR≤{args.fdr}) ...")
    dots_df = ctd.dots(
        clr,
        expected,
        expected_value_col=exp_col,
        view_df=view_df,
        max_loci_separation=args.max_dist,
        lambda_bin_fdr=args.fdr,
        nproc=args.nproc,
    )

    if dots_df is None or len(dots_df) == 0:
        # Diagnostics go to stderr, matching the error path above.
        print("WARNING: no loops called — try relaxing --fdr or --max_dist",
              file=sys.stderr)
        # Empty frame with the coordinate columns so the output still has
        # a header.
        dots_df = pd.DataFrame(columns=_BEDPE_COLS)
    else:
        # Keep only chr-chr pairs on the requested chromosome
        dots_df = _restrict_to_chrom(dots_df, args.chrom)

    print(f"Loops called: {len(dots_df)}")

    # Minimal distance filter
    if "start1" in dots_df.columns and "start2" in dots_df.columns:
        dots_df = _drop_short_loops(dots_df, args.min_dist)
        print(f"After ≥{args.min_dist / 1000:g} kb distance filter: "
              f"{len(dots_df)}")

    # to_csv writes a header row by default; coordinates come first and any
    # extra caller columns follow.
    dots_df[_ordered_columns(dots_df)].to_csv(args.out, sep="\t", index=False)
    print(f"Saved loop calls → {args.out}")


if __name__ == "__main__":
    main()