#!/usr/bin/env python3
"""
|
|
Call chromatin loops from a control mcool file using cooltools dots.
|
|
|
|
Loop calls are used as the "known lost anchor" positive set for perturbation
|
|
drift validation. Calling loops ourselves is justified because Rao et al. 2017
|
|
(Cell 171:305-320) demonstrates that ALL cohesin-dependent loops are eliminated
|
|
upon RAD21 depletion — therefore every HiCCUPS/dots-called control loop is a
|
|
valid member of the perturbed set.
|
|
|
|
Outputs a bedpe-format file with loop coordinates.
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
import cooler
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
import cooltools.api.expected as cte
|
|
import cooltools.api.dotfinder as ctd
|
|
|
|
|
|
def _build_parser():
    """Return the argument parser describing this script's command line."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--mcool", required=True, help="Control .mcool file")
    ap.add_argument("--chrom", default="chr1",
                    help="Chromosome to call loops on (default chr1)")
    ap.add_argument("--res", type=int, default=10_000,
                    help="Resolution in bp (default 10 kb)")
    ap.add_argument("--max_dist", type=int, default=3_000_000,
                    help="Max loop span in bp (default 3 Mb)")
    ap.add_argument("--min_dist", type=int, default=100_000,
                    help="Min loop span in bp (default 100 kb)")
    ap.add_argument("--fdr", type=float, default=0.1,
                    help="FDR threshold for loop calls (default 0.1)")
    ap.add_argument("--nproc", type=int, default=4,
                    help="Number of worker processes (default 4)")
    ap.add_argument("--out", required=True,
                    help="Output bedpe file (tab-separated)")
    return ap


def _pick_expected_column(columns):
    """Return the preferred expected-value column name present in *columns*.

    Preference order: ``balanced.avg.smoothed.agg``, then
    ``balanced.avg.smoothed``; ``balanced.avg`` is the fallback when neither
    smoothed column exists.
    """
    for candidate in ("balanced.avg.smoothed.agg", "balanced.avg.smoothed"):
        if candidate in columns:
            return candidate
    return "balanced.avg"


def _filter_loops(dots_df, chrom, min_dist):
    """Keep loops with both anchors on *chrom* and a span of >= *min_dist* bp.

    Each filter is applied only when the columns it needs are present; the
    row count is printed after each step.
    """
    # Both columns are indexed below, so both must exist before filtering.
    if {"chrom1", "chrom2"}.issubset(dots_df.columns):
        on_chrom = (dots_df["chrom1"] == chrom) & (dots_df["chrom2"] == chrom)
        dots_df = dots_df[on_chrom].copy()
    print(f"Loops called: {len(dots_df)}")

    if "start1" in dots_df.columns and "start2" in dots_df.columns:
        span = (dots_df["start2"] - dots_df["start1"]).abs()
        dots_df = dots_df[span >= min_dist]
        print(f"After ≥{min_dist / 1000:g} kb distance filter: {len(dots_df)}")
    return dots_df


def main():
    """Call loops on one chromosome of a control mcool and save them.

    Opens the cooler at the requested resolution, computes cis expected
    contacts over a whole-chromosome view, runs the cooltools dot caller,
    filters the calls by chromosome and minimum span, and writes the result
    as a tab-separated table with the six bedpe coordinate columns first.

    Exits with status 2 (via argparse) on invalid options, including a
    ``--min_dist`` larger than ``--max_dist``, and with status 1 when the
    requested chromosome is absent from the cooler.
    """
    ap = _build_parser()
    args = ap.parse_args()

    # Validate the span window before any expensive work: a minimum span above
    # the maximum can only ever produce an empty result.
    if args.min_dist < 0:
        ap.error("--min_dist must be non-negative")
    if args.min_dist > args.max_dist:
        ap.error(f"--min_dist ({args.min_dist}) must not exceed "
                 f"--max_dist ({args.max_dist})")

    uri = f"{args.mcool}::resolutions/{args.res}"
    print(f"Loading {uri} ...")
    clr = cooler.Cooler(uri)

    if args.chrom not in clr.chromsizes.index:
        print(f"ERROR: {args.chrom} not in mcool. Available: "
              f"{list(clr.chromsizes.index[:5])}", file=sys.stderr)
        sys.exit(1)

    # Single-region view spanning the whole requested chromosome.
    view_df = pd.DataFrame([{
        "chrom": args.chrom,
        "start": 0,
        "end": int(clr.chromsizes[args.chrom]),
        "name": args.chrom,
    }])

    print(f"Computing expected contacts ({args.chrom}, {args.nproc} proc) ...")
    expected = cte.expected_cis(clr, view_df=view_df, nproc=args.nproc)
    print(f"Expected columns: {list(expected.columns)}")

    # cooltools 0.7.x: smoothed expected column name
    exp_col = _pick_expected_column(expected.columns)
    print(f"Using expected value column: {exp_col}")

    print(f"Calling loops (max_dist={args.max_dist} bp, FDR≤{args.fdr}) ...")
    dots_df = ctd.dots(
        clr,
        expected,
        expected_value_col=exp_col,
        view_df=view_df,
        max_loci_separation=args.max_dist,
        lambda_bin_fdr=args.fdr,
        nproc=args.nproc,
    )

    out_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]

    if dots_df is None or len(dots_df) == 0:
        # Diagnostics go to stderr, matching the ERROR message above.
        print("WARNING: no loops called — try relaxing --fdr or --max_dist",
              file=sys.stderr)
        # Still emit a header-only file so downstream steps find the columns.
        dots_df = pd.DataFrame(columns=out_cols)

    dots_df = _filter_loops(dots_df, args.chrom, args.min_dist)

    # Coordinate columns first, then whatever extra columns the caller added.
    available = [c for c in out_cols if c in dots_df.columns]
    extra = [c for c in dots_df.columns if c not in out_cols]
    dots_df[available + extra].to_csv(args.out, sep="\t", index=False)
    print(f"Saved loop calls → {args.out}")
|
|
|
|
|
|
# Run the loop caller only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|