Files
chromatin-vgae-hic/experiments/h2_rewiring/call_loops.py

110 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""
Call chromatin loops from a control mcool file using cooltools dots.
Loop calls are used as the "known lost anchor" positive set for perturbation
drift validation. Calling loops ourselves is justified because Rao et al. 2017
(Cell 171:305-320) demonstrates that ALL cohesin-dependent loops are eliminated
upon RAD21 depletion — therefore every HiCCUPS/dots-called control loop is a
valid member of the perturbed set.
Outputs a bedpe-format file with loop coordinates.
"""
import argparse
import sys
import cooler
import numpy as np
import pandas as pd
import cooltools.api.expected as cte
import cooltools.api.dotfinder as ctd
def _parse_args(argv=None):
    """Parse and validate the command-line options.

    Args:
        argv: Sequence of argument strings, or None to let argparse read
            ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: With code 2 (via ``ArgumentParser.error``) when an option
            is missing, malformed, or outside its valid range.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--mcool", required=True, help="Control .mcool file")
    ap.add_argument("--chrom", default="chr1")
    ap.add_argument("--res", type=int, default=10_000,
                    help="Resolution in bp (default 10 kb)")
    ap.add_argument("--min_dist", type=int, default=100_000,
                    help="Min loop span in bp (default 100 kb)")
    ap.add_argument("--max_dist", type=int, default=3_000_000,
                    help="Max loop span in bp (default 3 Mb)")
    ap.add_argument("--fdr", type=float, default=0.1,
                    help="FDR threshold for loop calls (default 0.1)")
    ap.add_argument("--nproc", type=int, default=4)
    ap.add_argument("--out", required=True,
                    help="Output bedpe file (tab-separated)")
    args = ap.parse_args(argv)

    # Fail fast on nonsensical values: loading the cooler and computing the
    # expected track is slow, and bad values otherwise surface much later.
    if args.res <= 0:
        ap.error("--res must be a positive integer")
    if args.max_dist <= 0:
        ap.error("--max_dist must be a positive integer")
    if args.min_dist < 0:
        ap.error("--min_dist must be non-negative")
    # Written as a negated chained comparison so that NaN is rejected too.
    if not (0 < args.fdr <= 1):
        ap.error("--fdr must be in the interval (0, 1]")
    if args.nproc < 1:
        ap.error("--nproc must be at least 1")
    if args.min_dist > args.max_dist:
        # Not fatal: the run still completes, it just cannot keep any loop.
        print(f"WARNING: --min_dist ({args.min_dist}) exceeds --max_dist "
              f"({args.max_dist}); no loop can pass both limits",
              file=sys.stderr)
    return args


def main(argv=None):
    """Call loops on one chromosome of a control mcool and save them.

    Steps: open the cooler at the requested resolution, compute the cis
    expected track for the chosen chromosome, run the cooltools dot caller,
    keep calls whose two anchors are both on that chromosome and at least
    ``--min_dist`` apart, and write the result as a tab-separated table
    (with a header row) whose leading columns are in bedpe order.

    Args:
        argv: Sequence of argument strings, or None to use ``sys.argv[1:]``.

    Raises:
        SystemExit: Code 2 for invalid options; code 1 when the cooler cannot
            be opened or the requested chromosome is not present in it.
    """
    args = _parse_args(argv)

    uri = f"{args.mcool}::resolutions/{args.res}"
    print(f"Loading {uri} ...")
    try:
        clr = cooler.Cooler(uri)
    except (OSError, KeyError) as err:
        # Missing file or missing resolution group: report it the same way as
        # the chromosome check below instead of dumping a traceback.
        print(f"ERROR: cannot open {uri}: {err}", file=sys.stderr)
        sys.exit(1)

    if args.chrom not in clr.chromsizes.index:
        print(f"ERROR: {args.chrom} not in mcool. Available: "
              f"{list(clr.chromsizes.index[:5])}", file=sys.stderr)
        sys.exit(1)

    # Single-region view spanning the whole requested chromosome.
    view_df = pd.DataFrame([{
        "chrom": args.chrom,
        "start": 0,
        "end": int(clr.chromsizes[args.chrom]),
        "name": args.chrom,
    }])

    print(f"Computing expected contacts ({args.chrom}, {args.nproc} proc) ...")
    expected = cte.expected_cis(clr, view_df=view_df, nproc=args.nproc)
    print(f"Expected columns: {list(expected.columns)}")

    # cooltools 0.7.x: smoothed expected column name.  Take the first of the
    # smoothed columns that is present, else fall back to the raw average.
    exp_col = next(
        (col for col in ("balanced.avg.smoothed.agg", "balanced.avg.smoothed")
         if col in expected.columns),
        "balanced.avg",
    )
    print(f"Using expected value column: {exp_col}")

    print(f"Calling loops (max_dist={args.max_dist} bp, FDR≤{args.fdr}) ...")
    dots_df = ctd.dots(
        clr,
        expected,
        expected_value_col=exp_col,
        view_df=view_df,
        max_loci_separation=args.max_dist,
        lambda_bin_fdr=args.fdr,
        nproc=args.nproc,
    )

    out_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    if dots_df is None or len(dots_df) == 0:
        print("WARNING: no loops called — try relaxing --fdr or --max_dist",
              file=sys.stderr)
        # Empty frame with the bedpe columns so a valid header is still written.
        dots_df = pd.DataFrame(columns=out_cols)
    elif "chrom1" in dots_df.columns:
        # Keep only chr-chr pairs on the requested chromosome.
        on_chrom = ((dots_df["chrom1"] == args.chrom) &
                    (dots_df["chrom2"] == args.chrom))
        dots_df = dots_df[on_chrom].copy()
    print(f"Loops called: {len(dots_df)}")

    # Minimal distance filter on the anchor start coordinates.
    if "start1" in dots_df.columns and "start2" in dots_df.columns:
        span = (dots_df["start2"] - dots_df["start1"]).abs()
        dots_df = dots_df[span >= args.min_dist]
        print(f"After ≥{args.min_dist / 1000:g} kb distance filter: "
              f"{len(dots_df)}")

    # bedpe coordinate columns first, every other column after them.
    leading = [col for col in out_cols if col in dots_df.columns]
    trailing = [col for col in dots_df.columns if col not in out_cols]
    dots_df[leading + trailing].to_csv(args.out, sep="\t", index=False)
    print(f"Saved loop calls → {args.out}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()