From 430e0a10bafe6f2b1761cd76ce01aebdc4dd7911 Mon Sep 17 00:00:00 2001 From: aman Date: Thu, 23 Oct 2025 08:56:46 +0200 Subject: [PATCH] initial framework; to be extended --- scripts/compare_embeddings.py | 11 +++++------ scripts/train_vgae.py | 24 ++++++++++-------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/scripts/compare_embeddings.py b/scripts/compare_embeddings.py index 4cbd8ab..51a6d68 100644 --- a/scripts/compare_embeddings.py +++ b/scripts/compare_embeddings.py @@ -1,8 +1,7 @@ #!/usr/bin/env python3 """ -Compares two latent embedding matrices (e.g., CTRL vs EED-i), -computes similarity metrics (cosine, Euclidean, L1), -and saves both a CSV and an optional line plot. +Compares two latent embedding matrices, +computes similarity metrics (cosine, Euclidean, L1) Usage: python scripts/compare_embeddings_general.py \ @@ -45,7 +44,7 @@ def main(): p.add_argument("--no-plot", action="store_true", help="Skip generating the plot") args = p.parse_args() - # ---- Load ---- + # Load emb1 = np.load(args.emb1) emb2 = np.load(args.emb2) if emb1.shape != emb2.shape: @@ -55,7 +54,7 @@ def main(): n_bins, n_dim = emb1.shape print(f"Loaded embeddings: {n_bins} bins × {n_dim} dims") - # ---- Compute metrics ---- + # Compute metrics cos_sims, cos_dists, l2_dists, l1_dists = compute_metrics(emb1, emb2) df = pd.DataFrame({ @@ -69,7 +68,7 @@ def main(): df.to_csv(csv_path, index=False) print(f"Saved metrics → {csv_path}") - # ---- Plot ---- + # Plot if not args.no_plot: plt.figure(figsize=(12, 4)) plt.plot(df["bin_id"], df["cosine_distance"], lw=0.8, color="steelblue") diff --git a/scripts/train_vgae.py b/scripts/train_vgae.py index 4bb2ba3..5086449 100644 --- a/scripts/train_vgae.py +++ b/scripts/train_vgae.py @@ -1,13 +1,9 @@ #!/usr/bin/env python3 """ Train a Variational Graph Autoencoder (VGAE) on a chromatin contact graph. - +--- Inputs: - - A PyTorch Geometric Data object saved with torch.save(...) containing: - x : [num_nodes, num_features] node features - edge_index : [2, num_edges] undirected edges (will be coalesced) - edge_weight : [num_edges] (optional, unused by VGAE) - + - A PyTorch Geometric Data object saved with torch.save() - from build_graph.py --- Outputs (under results/): @@ -80,14 +76,14 @@ def main(): np.random.seed(args.seed) os.makedirs(args.outdir, exist_ok=True) - # ---- Load graph ---- + # Load graph data = torch.load(args.graph) # Coalesce/clean edges ei, _ = remove_self_loops(data.edge_index) data.edge_index = to_undirected(ei, num_nodes=data.num_nodes) x = data.x.float() - # ---- Split edges for link prediction ---- + # Split edges for link prediction splitter = RandomLinkSplit( num_val=0.1, num_test=0.1, @@ -112,12 +108,12 @@ def main(): ) - # ---- Model ---- + # Model enc = Encoder(in_dim=x.size(1), hidden=args.hidden, latent=args.latent, dropout=args.dropout) model = VGAE(enc) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - # ---- Training loop ---- + # Training loop best_val_auc = -1.0 best_state = None for epoch in range(1, args.epochs + 1): @@ -133,7 +129,7 @@ def main(): loss.backward() optimizer.step() - # ---- Validation ---- + # Validation model.eval() with torch.no_grad(): z_full = model.encode(x, data.edge_index) # use full graph for eval embeddings @@ -146,18 +142,18 @@ def main(): if epoch % 10 == 0 or epoch == 1: print(f"[{epoch:03d}/{args.epochs}] loss={loss.item():.4f} | val AUC={val_auc:.4f} AP={val_ap:.4f}") - # ---- Save best model ---- + # Save best model model.load_state_dict(best_state) model_path = os.path.join(args.outdir, "model.pt") torch.save(model.state_dict(), model_path) - # ---- Final test metrics ---- + # Final test metrics model.eval() with torch.no_grad(): z_final = model.encode(x, data.edge_index) test_auc, test_ap = eval_linkpred(model, test_data, z_final) - # ---- Save embeddings & metrics ---- + # Save embeddings & metrics emb_path = os.path.join(args.outdir, "emb.npy") np.save(emb_path, z_final.cpu().numpy())