Skip to content

Commit 2a120a6

Browse files
drivanov, pre-commit-ci[bot], akihironitta
authored
Improvement of the txt2kg example. (#10623)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <akihiro@kumo.ai>
1 parent fb44ee4 commit 2a120a6

File tree

8 files changed

+677
-67
lines changed

8 files changed

+677
-67
lines changed

test/llm/models/test_g_retriever.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import gc
2+
from contextlib import nullcontext
3+
from types import SimpleNamespace
24

5+
import pytest
36
import torch
7+
from torch import nn
48

59
from torch_geometric.llm.models import LLM, GRetriever
610
from torch_geometric.nn import GAT
@@ -100,3 +104,88 @@ def test_g_retriever_many_tokens() -> None:
100104
del model, llm, gnn
101105
gc.collect()
102106
torch.cuda.empty_cache()
107+
108+
109+
class DummyHFModel(nn.Module):
    """Stand-in for a HuggingFace causal LM.

    For any ``inputs_embeds`` batch it emits random logits over a small
    vocabulary and a constant zero loss.
    """
    def __init__(self, vocab_size=10):
        super().__init__()
        self.vocab_size = vocab_size
        # Single parameter so the module has something for optimizers.
        self.dummy = nn.Parameter(torch.zeros(1))

    def forward(self, inputs_embeds=None, **kwargs):
        batch, seq_len, _ = inputs_embeds.shape
        logits = torch.randn(batch, seq_len, self.vocab_size,
                             device=inputs_embeds.device)
        loss = torch.tensor(0.0, device=inputs_embeds.device)
        # NOTE(review): logits are also attached as an attribute of the
        # loss tensor itself — presumably some caller reads ``loss.logits``;
        # confirm before removing.
        loss.logits = logits
        return SimpleNamespace(logits=logits, loss=loss)
125+
126+
127+
class DummyLLM:
    """Minimal stub mimicking the interface GRetriever expects of ``LLM``."""
    def __init__(self, hidden_dim):
        self.word_embedding = nn.Embedding(100, hidden_dim)
        self.llm = DummyHFModel()
        self.device = torch.device("cpu")
        self.autocast_context = nullcontext()

    def _get_embeds(self, question, *args):
        """Return fixed-length random embeddings for each question.

        Extra positional arguments (context, graph embeddings, ...) are
        accepted but ignored by this stub.
        """
        num_questions = len(question)
        fixed_len = 4
        dim = self.word_embedding.embedding_dim

        embeds = torch.randn(num_questions, fixed_len, dim)
        mask = torch.ones(num_questions, fixed_len, dtype=torch.long)
        return embeds, mask, None
143+
144+
145+
class DummyGNN(nn.Module):
    """Simple GNN stub returning node embeddings via a single linear map."""
    def __init__(self, in_channels=4, out_channels=8):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.lin = nn.Linear(in_channels, out_channels)

    def forward(self, *args, **kwargs):
        # Only the node features (first positional arg) matter; the rest
        # (edge_index, batch, ...) are accepted for API compatibility.
        node_feats = args[0]
        return self.lin(node_feats)
156+
157+
158+
@pytest.mark.parametrize("batch_size", [1, 3])
def test_gretriever_prefix_embedding_injection(batch_size):
    """GRetriever wired with dummy LLM/GNN must emit per-sample logits."""
    hidden_dim = 8
    num_nodes = 5

    model = GRetriever(
        llm=DummyLLM(hidden_dim),
        gnn=DummyGNN(in_channels=4, out_channels=8),
        mlp_out_tokens=2,
    )

    # Tiny single-graph input.
    node_feats = torch.randn(num_nodes, 4)
    edges = torch.tensor([[0, 1, 2], [1, 2, 3]])
    node_to_graph = torch.zeros(num_nodes, dtype=torch.long)

    prompts = ["What is this graph?"] * batch_size
    answers = ["dummy answer"] * batch_size

    out = model(
        x=node_feats,
        edge_index=edges,
        batch=node_to_graph,
        question=prompts,
        label=answers,
    )

    # Basic correctness: output carries logits with one row per sample.
    assert hasattr(out, "logits")
    assert out.logits.shape[0] == batch_size

test/llm/models/test_llm.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import gc
22

3+
import pytest
34
import torch
45
from torch import Tensor
56

@@ -28,3 +29,96 @@ def test_llm() -> None:
2829
del model
2930
gc.collect()
3031
torch.cuda.empty_cache()
32+
33+
34+
class DummyBatch(dict):
    """Dict subclass mimicking HuggingFace ``BatchEncoding``.

    ``to`` is a no-op that returns the same object, since this stub only
    ever lives on CPU.
    """
    def to(self, device):
        return self
38+
39+
40+
class DummyTokenizer:
    """Left-padding tokenizer stub.

    Each input string maps to token ids ``1..len(s)``; every sequence is
    left-padded with ``pad_token_id`` (0) up to the longest string in the
    batch, mirroring HuggingFace's ``padding_side = 'left'`` behavior.
    """
    pad_token_id = 0
    padding_side = "left"

    def __call__(self, texts, return_tensors=None, padding=True):
        # ``return_tensors`` and ``padding`` are accepted for API
        # compatibility only; this stub always returns padded tensors.
        lengths = [len(t) for t in texts]
        max_len = max(lengths)

        ids = []
        mask = []

        for seq_len in lengths:
            # Fix: renamed from ``padding``, which shadowed the keyword
            # argument of the same name above.
            pad_len = max_len - seq_len
            ids.append([0] * pad_len + list(range(1, seq_len + 1)))
            mask.append([0] * pad_len + [1] * seq_len)

        return DummyBatch({
            "input_ids": torch.tensor(ids),
            "attention_mask": torch.tensor(mask),
        })
60+
61+
62+
class DummyModel(torch.nn.Module):
    """HF-model stub: embedding table on demand, all-zero logits out."""
    def get_input_embeddings(self):
        # NOTE: a new (randomly initialized) table is built on every call.
        return torch.nn.Embedding(100, 8)

    def forward(self, inputs_embeds=None, attention_mask=None, **kwargs):
        num_seqs, seq_len, _ = inputs_embeds.shape

        class _Output:
            pass

        result = _Output()
        result.logits = torch.zeros(num_seqs, seq_len, 10)
        return result
75+
76+
77+
@pytest.fixture
def dummy_llm():
    """Build an ``LLM`` instance without running its (heavy) ``__init__``."""
    obj = LLM.__new__(LLM)  # skip model download / tokenizer setup
    torch.nn.Module.__init__(obj)
    obj.device = torch.device("cpu")
    obj.tokenizer = DummyTokenizer()
    obj.model = DummyModel()
    return obj
85+
86+
87+
@onlyRAG
def test_llm_prepare_inputs(dummy_llm):
    """Tokenize, embed and forward a two-prompt batch end to end."""
    encoded = dummy_llm.tokenizer(["hello", "hi"])
    ids = encoded["input_ids"]
    mask = encoded["attention_mask"]

    embeds = dummy_llm.model.get_input_embeddings()(ids)
    result = dummy_llm.model(inputs_embeds=embeds, attention_mask=mask)

    assert embeds.shape[0] == 2
    assert mask.shape == ids.shape
    assert hasattr(result, "logits")
    assert result.logits.shape[:2] == embeds.shape[:2]
106+
107+
108+
@onlyRAG
def test_llm_single_prompt(dummy_llm):
    """A single prompt yields a batch of size one."""
    batch = dummy_llm.tokenizer(["test"])
    assert batch["input_ids"].shape[0] == 1
113+
114+
115+
@onlyRAG
def test_llm_variable_lengths(dummy_llm):
    """Sequences are padded to the length of the longest prompt."""
    prompts = ["a", "abcdef", "abc"]

    ids = dummy_llm.tokenizer(prompts)["input_ids"]

    assert ids.shape[0] == 3
    assert ids.shape[1] == max(len(p) for p in prompts)

test/llm/models/test_sentence_transformer.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,26 +10,30 @@
1010
@pytest.mark.parametrize('batch_size', [None, 1])
@pytest.mark.parametrize('pooling_strategy', ['mean', 'last', 'cls'])
def test_sentence_transformer(batch_size, pooling_strategy, device):
    """Encode a small batch and check device placement and output shape."""
    model_name = 'bert-base-uncased'
    model = SentenceTransformer(
        model_name=model_name,
        pooling_strategy=pooling_strategy,
    ).to(device)
    assert model.device == device
    assert str(model) == f'SentenceTransformer(model_name={model_name})'

    sentences = [
        "this is a basic english text",
        "PyG is the best open-source GNN library :)",
    ]

    # Expected embedding width comes from the underlying HF config.
    embed_dim = model.model.config.hidden_size

    out = model.encode(sentences, batch_size=batch_size)
    assert out.device == device
    assert out.shape == (2, embed_dim)

    out = model.encode(sentences, batch_size=batch_size, output_device='cpu')
    assert out.is_cpu
    assert out.shape == (2, embed_dim)

    # Empty input must still yield a correctly-shaped empty tensor.
    out = model.encode([], batch_size=batch_size)
    assert out.device == device
    assert out.shape == (0, embed_dim)

0 commit comments

Comments
 (0)