From 430e0a10bafe6f2b1761cd76ce01aebdc4dd7911 Mon Sep 17 00:00:00 2001
From: aman <aman.nalakath@tum.de>
Date: Thu, 23 Oct 2025 08:56:46 +0200
Subject: [PATCH] initial framework; to be extended

---
 scripts/compare_embeddings.py | 11 +++++------
 scripts/train_vgae.py         | 24 ++++++++++--------------
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/scripts/compare_embeddings.py b/scripts/compare_embeddings.py
index 4cbd8ab..51a6d68 100644
--- a/scripts/compare_embeddings.py
+++ b/scripts/compare_embeddings.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
 """
-Compares two latent embedding matrices (e.g., CTRL vs EED-i),
-computes similarity metrics (cosine, Euclidean, L1),
-and saves both a CSV and an optional line plot.
+Compares two latent embedding matrices and
+computes similarity metrics (cosine, Euclidean, L1).
 
 Usage:
   python scripts/compare_embeddings_general.py \
@@ -45,7 +44,7 @@ def main():
     p.add_argument("--no-plot", action="store_true", help="Skip generating the plot")
     args = p.parse_args()
 
-    # ---- Load ----
+    # Load
     emb1 = np.load(args.emb1)
     emb2 = np.load(args.emb2)
     if emb1.shape != emb2.shape:
@@ -55,7 +54,7 @@ def main():
     n_bins, n_dim = emb1.shape
     print(f"Loaded embeddings: {n_bins} bins × {n_dim} dims")
 
-    # ---- Compute metrics ----
+    # Compute metrics
     cos_sims, cos_dists, l2_dists, l1_dists = compute_metrics(emb1, emb2)
 
     df = pd.DataFrame({
@@ -69,7 +68,7 @@ def main():
     df.to_csv(csv_path, index=False)
     print(f"Saved metrics → {csv_path}")
 
-    # ---- Plot ----
+    # Plot
     if not args.no_plot:
         plt.figure(figsize=(12, 4))
         plt.plot(df["bin_id"], df["cosine_distance"], lw=0.8, color="steelblue")
diff --git a/scripts/train_vgae.py b/scripts/train_vgae.py
index 4bb2ba3..5086449 100644
--- a/scripts/train_vgae.py
+++ b/scripts/train_vgae.py
@@ -1,13 +1,9 @@
 #!/usr/bin/env python3
 """
 Train a Variational Graph Autoencoder (VGAE) on a chromatin contact graph.
-
+---
 Inputs:
-  - A PyTorch Geometric Data object saved with torch.save(...) containing:
-        x            : [num_nodes, num_features] node features
-        edge_index   : [2, num_edges] undirected edges (will be coalesced)
-        edge_weight  : [num_edges]   (optional, unused by VGAE)
-
+  - A PyTorch Geometric Data object (providing x and edge_index) saved with torch.save()
   - from build_graph.py
 ---        
 Outputs (under results/):
@@ -80,14 +76,14 @@ def main():
     np.random.seed(args.seed)
     os.makedirs(args.outdir, exist_ok=True)
 
-    # ---- Load graph ----
+    # Load graph
     data = torch.load(args.graph)
     # Coalesce/clean edges
     ei, _ = remove_self_loops(data.edge_index)
     data.edge_index = to_undirected(ei, num_nodes=data.num_nodes)
     x = data.x.float()
 
-    # ---- Split edges for link prediction ----
+    # Split edges for link prediction
     splitter = RandomLinkSplit(
         num_val=0.1,
         num_test=0.1,
@@ -112,12 +108,12 @@ def main():
         )
 
 
-    # ---- Model ----
+    # Model
     enc = Encoder(in_dim=x.size(1), hidden=args.hidden, latent=args.latent, dropout=args.dropout)
     model = VGAE(enc)
     optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
 
-    # ---- Training loop ----
+    # Training loop
     best_val_auc = -1.0
     best_state = None
     for epoch in range(1, args.epochs + 1):
@@ -133,7 +129,7 @@ def main():
         loss.backward()
         optimizer.step()
 
-        # ---- Validation ----
+        # Validation
         model.eval()
         with torch.no_grad():
             z_full = model.encode(x, data.edge_index)  # use full graph for eval embeddings
@@ -146,18 +142,18 @@ def main():
         if epoch % 10 == 0 or epoch == 1:
             print(f"[{epoch:03d}/{args.epochs}] loss={loss.item():.4f} | val AUC={val_auc:.4f} AP={val_ap:.4f}")
 
-    # ---- Save best model ----
+    # Save best model
     model.load_state_dict(best_state)
     model_path = os.path.join(args.outdir, "model.pt")
     torch.save(model.state_dict(), model_path)
 
-    # ---- Final test metrics ----
+    # Final test metrics
     model.eval()
     with torch.no_grad():
         z_final = model.encode(x, data.edge_index)
         test_auc, test_ap = eval_linkpred(model, test_data, z_final)
 
-    # ---- Save embeddings & metrics ----
+    # Save embeddings & metrics
     emb_path = os.path.join(args.outdir, "emb.npy")
     np.save(emb_path, z_final.cpu().numpy())