Skip to content

Commit 8b3b67f

Browse files
[BUG] Mask out unwanted vertices during negative sampling (#303)
Masks out unwanted vertices during heterogeneous negative sampling, which was previously not being done. This caused de-offsetting to fail and produce negative values for `edge_label_index`, which exposed the bug. Anything sampled with these negatives edge would have been invalid anyways even if de-offsetting returned positive values. Also fixes a bug affecting triplet sampling by concatenating from a random subset of src instead of dst. Closes #304 Partially resolves nvbug#5502562 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Tingyu Wang (https://github.com/tingyu66) URL: #303
1 parent 525ca06 commit 8b3b67f

File tree

3 files changed

+177
-7
lines changed

3 files changed

+177
-7
lines changed

python/cugraph-pyg/cugraph_pyg/sampler/sampler.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ def sample_from_edges(
818818
self.__graph_store,
819819
index.row,
820820
index.col,
821+
index.input_type,
821822
self.__batch_size,
822823
neg_sampling,
823824
None, # src_time,
@@ -826,9 +827,13 @@ def sample_from_edges(
826827
if neg_sampling.is_binary():
827828
src, _ = neg_cat(src.cuda(), src_neg, self.__batch_size)
828829
else:
829-
# triplet, cat dst to src so length is the same; will
830-
# result in the same set of unique vertices
831-
src, _ = neg_cat(src.cuda(), dst_neg, self.__batch_size)
830+
# triplet, cat random subset of src to src so length is the
831+
# same; will result in the same set of unique vertices
832+
scu = src.cuda()
833+
per = torch.randint(
834+
0, scu.numel(), (dst_neg.numel(),), device=scu.device
835+
)
836+
src, _ = neg_cat(scu, scu[per], self.__batch_size)
832837
dst, neg_batch_size = neg_cat(dst.cuda(), dst_neg, self.__batch_size)
833838

834839
# Concatenate -1s so the input id tensor lines up and can

python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def neg_sample(
7979
graph_store: GraphStore,
8080
seed_src: "torch.Tensor",
8181
seed_dst: "torch.Tensor",
82+
input_type: Tuple[str, str, str],
8283
batch_size: int,
8384
neg_sampling: "torch_geometric.sampler.NegativeSampling",
8485
time: "torch.Tensor",
@@ -91,22 +92,85 @@ def neg_sample(
9192
except AttributeError:
9293
src_weight = neg_sampling.weight
9394
dst_weight = neg_sampling.weight
94-
unweighted = src_weight is None and dst_weight is None
9595

9696
# Require at least one negative edge per batch
9797
num_neg = max(
9898
int(ceil(neg_sampling.amount * seed_src.numel())),
9999
int(ceil(seed_src.numel() / batch_size)),
100100
)
101101

102+
# The weights need to match the expected number of nodes
103+
if graph_store.is_homogeneous:
104+
num_src_nodes = num_dst_nodes = list(graph_store._num_vertices().values())[0]
105+
else:
106+
num_src_nodes = graph_store._num_vertices()[input_type[0]]
107+
num_dst_nodes = graph_store._num_vertices()[input_type[2]]
108+
109+
if src_weight is not None and dst_weight is not None:
110+
if src_weight.dtype != dst_weight.dtype:
111+
raise ValueError(
112+
f"The 'src_weight' and 'dst_weight' attributes need to have the same"
113+
f" dtype (got {src_weight.dtype} and {dst_weight.dtype})"
114+
)
115+
weight_dtype = (
116+
torch.float32
117+
if (src_weight is None and dst_weight is None)
118+
else (src_weight.dtype if src_weight is not None else dst_weight.dtype)
119+
)
120+
121+
if src_weight is None:
122+
src_weight = torch.ones(num_src_nodes, dtype=weight_dtype, device="cuda")
123+
else:
124+
if src_weight.numel() != num_src_nodes:
125+
raise ValueError(
126+
f"The 'src_weight' attribute needs to match the number of source nodes"
127+
f" {num_src_nodes} (got {src_weight.numel()})"
128+
)
129+
130+
if dst_weight is None:
131+
dst_weight = torch.ones(num_dst_nodes, dtype=weight_dtype, device="cuda")
132+
else:
133+
if dst_weight.numel() != num_dst_nodes:
134+
raise ValueError(
135+
f"The 'dst_weight' attribute needs to match the number of destination"
136+
f" nodes {num_dst_nodes} (got {dst_weight.numel()})"
137+
)
138+
139+
# If the graph is heterogeneous, the weights need to be concatenated together
140+
# and offset.
141+
if not graph_store.is_homogeneous:
142+
if input_type[0] != input_type[2]:
143+
vertices = torch.concat(
144+
[
145+
torch.arange(num_src_nodes, dtype=torch.int64, device="cuda")
146+
+ graph_store._vertex_offsets[input_type[0]],
147+
torch.arange(num_dst_nodes, dtype=torch.int64, device="cuda")
148+
+ graph_store._vertex_offsets[input_type[2]],
149+
]
150+
)
151+
else:
152+
vertices = (
153+
torch.arange(num_src_nodes, dtype=torch.int64, device="cuda")
154+
+ graph_store._vertex_offsets[input_type[0]]
155+
)
156+
157+
src_weight = torch.concat(
158+
[src_weight, torch.zeros(num_dst_nodes, dtype=weight_dtype, device="cuda")]
159+
)
160+
dst_weight = torch.concat(
161+
[torch.zeros(num_src_nodes, dtype=weight_dtype, device="cuda"), dst_weight]
162+
)
163+
elif src_weight is None and dst_weight is None:
164+
vertices = None
165+
else:
166+
vertices = torch.arange(num_src_nodes, dtype=torch.int64, device="cuda")
167+
102168
if node_time is None:
103169
result_dict = pylibcugraph.negative_sampling(
104170
graph_store._resource_handle,
105171
graph_store._graph,
106172
num_neg,
107-
vertices=None
108-
if unweighted
109-
else cupy.arange(src_weight.numel(), dtype="int64"),
173+
vertices=None if vertices is None else cupy.asarray(vertices),
110174
src_bias=None if src_weight is None else cupy.asarray(src_weight),
111175
dst_bias=None if dst_weight is None else cupy.asarray(dst_weight),
112176
remove_duplicates=False,

python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,3 +597,104 @@ def test_neighbor_loader_hetero_linkpred_bidirectional_three_types(
597597
assert (r_i == eli_i).all()
598598

599599
assert i == 7
600+
601+
602+
@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
603+
@pytest.mark.sg
604+
@pytest.mark.parametrize("batch_size", [1, 2])
605+
@pytest.mark.parametrize("neg_sampling_mode", ["binary", "triplet"])
606+
@pytest.mark.parametrize("amount", [1, 2])
607+
def test_link_neighbor_loader_hetero_negative_sampling(
608+
batch_size, neg_sampling_mode, amount, single_pytorch_worker
609+
):
610+
"""
611+
Test negative sampling for heterogeneous graphs with different edge types.
612+
"""
613+
# Create a heterogeneous graph with paper-author relationships
614+
src_paper = torch.tensor([0, 1, 2, 4, 3, 4, 5, 5]) # paper
615+
dst_paper = torch.tensor([4, 5, 4, 3, 2, 1, 0, 1]) # paper
616+
617+
asrc = torch.tensor([0, 1, 2, 3, 3, 0]) # author
618+
adst = torch.tensor([0, 1, 2, 3, 4, 5]) # paper
619+
620+
num_authors = 4
621+
num_papers = 6
622+
623+
graph_store = GraphStore()
624+
feature_store = FeatureStore()
625+
626+
# Add paper-paper citations
627+
graph_store[("paper", "cites", "paper"), "coo", False, (num_papers, num_papers)] = [
628+
src_paper,
629+
dst_paper,
630+
]
631+
# Add author-paper relationships
632+
graph_store[
633+
("author", "writes", "paper"), "coo", False, (num_authors, num_papers)
634+
] = [asrc, adst]
635+
636+
# Create edge label index for author-paper relationships
637+
edge_label_index = torch.stack([asrc, adst])
638+
639+
# Test both binary and triplet negative sampling
640+
if neg_sampling_mode == "binary":
641+
neg_sampling = torch_geometric.sampler.NegativeSampling(
642+
"binary", amount=float(amount)
643+
)
644+
else:
645+
neg_sampling = torch_geometric.sampler.NegativeSampling(
646+
"triplet", amount=float(amount)
647+
)
648+
649+
loader = cugraph_pyg.loader.LinkNeighborLoader(
650+
(feature_store, graph_store),
651+
num_neighbors={
652+
("paper", "cites", "paper"): [2, 2],
653+
("author", "writes", "paper"): [2, 2],
654+
},
655+
edge_label_index=(("author", "writes", "paper"), edge_label_index),
656+
batch_size=batch_size,
657+
neg_sampling=neg_sampling,
658+
shuffle=False,
659+
)
660+
661+
# Test that the loader produces batches with proper negative sampling
662+
for i, batch in enumerate(loader):
663+
# Check that we have the expected edge label index structure
664+
assert [("author", "writes", "paper")] == list(
665+
batch.edge_label_index_dict.keys()
666+
)
667+
assert [("author", "writes", "paper")] == list(batch.edge_label_dict.keys())
668+
669+
# Should have both positive (1.0) and negative (0.0) labels
670+
edge_labels = batch["author", "writes", "paper"].edge_label
671+
assert torch.any(edge_labels == 1.0)
672+
assert torch.any(edge_labels == 0.0)
673+
assert (edge_labels == 0.0).sum() == amount * (edge_labels == 1.0).sum()
674+
675+
# Verify that the edge label index has the correct shape
676+
edge_label_idx = batch["author", "writes", "paper"].edge_label_index
677+
assert edge_label_idx.shape[0] == 2 # Should be [2, num_edges]
678+
assert edge_label_idx.shape[1] > 0 # Should have some edges
679+
680+
# Verify that the edge labels correspond to the edge label index
681+
assert edge_labels.shape[0] == edge_label_idx.shape[1]
682+
683+
# Check that node IDs are valid
684+
assert batch["author"].n_id.numel() > 0
685+
assert batch["paper"].n_id.numel() > 0
686+
687+
# Verify that edge label index uses valid node IDs
688+
author_n_ids = batch["author"].n_id
689+
paper_n_ids = batch["paper"].n_id
690+
691+
# All source nodes in edge_label_index should be in author.n_id
692+
src_nodes = edge_label_idx[0]
693+
assert torch.all(torch.isin(src_nodes.cpu(), torch.arange(len(author_n_ids))))
694+
695+
# All destination nodes in edge_label_index should be in paper.n_id
696+
dst_nodes = edge_label_idx[1]
697+
assert torch.all(torch.isin(dst_nodes.cpu(), torch.arange(len(paper_n_ids))))
698+
699+
# Verify we processed all batches
700+
assert i >= 0 # At least one batch should be processed

0 commit comments

Comments
 (0)