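+# Unit tests for the tfkit CLM and seq2seq task models, exercised with
+# lightweight dummy tokenizer and pretrained-model stubs.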
+from types import SimpleNamespace
+
+import torch
+from torch import nn
+
+from tfkit.task.clm.model import Model as CLMModel
+from tfkit.task.seq2seq.model import Model as Seq2SeqModel
+
+
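+# Minimal tokenizer stub: the task models only need a vocabulary size (via
+# len()) and a way to map token ids back to strings.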
+class DummyTokenizer:
+    def __init__(self, vocab_size):
+        self.vocab_size = vocab_size
+
+    def __len__(self):
+        return self.vocab_size
+
+    def convert_ids_to_tokens(self, idx):
+        return f"token-{idx}"
+
+
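+# Stand-in for a causal language model: it exposes an output-embedding head,
+# records the keyword arguments of its last forward call, and returns a loss
+# whenever labels are passed in.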
+class DummyCausalPretrained(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
+        self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
+        self.last_kwargs = None
+
+    def get_output_embeddings(self):
+        return self.output_layer
+
+    def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
+        self.last_kwargs = kwargs
+        batch_size, seq_len = input_ids.shape
+        logits = torch.zeros(batch_size, seq_len, self.config.vocab_size)
+        outputs = {
+            "logits": logits,
+            "last_hidden_state": torch.zeros(batch_size, seq_len, self.config.hidden_size),
+        }
+        if "labels" in kwargs:
+            outputs["loss"] = torch.tensor(0.0)
+        return outputs
+
+
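+# Stand-in for an encoder-only backbone: get_output_embeddings() returns None,
+# mimicking a pretrained model without its own language-model head.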
+class DummyEncoderPretrained(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.config = SimpleNamespace(vocab_size=5, hidden_size=4)
+        self.last_kwargs = None
+
+    def get_output_embeddings(self):
+        return None
+
+    def forward(self, input_ids, attention_mask=None, return_dict=True, **kwargs):
+        self.last_kwargs = kwargs
+        batch_size, seq_len = input_ids.shape
+        hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
+        return {"last_hidden_state": hidden}
+
+
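+# Stand-in for an encoder-decoder model: it carries a decoder attribute, an
+# output-embedding head, and returns decoder hidden states shaped like the
+# decoder input.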
+class DummySeq2SeqPretrained(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.config = SimpleNamespace(vocab_size=3, hidden_size=4)
+        self.decoder = nn.Module()
+        self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size)
+
+    def get_output_embeddings(self):
+        return self.output_layer
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        output_hidden_states=False,
+        use_cache=False,
+        return_dict=True,
+        **kwargs,
+    ):
+        batch_size, seq_len = decoder_input_ids.shape
+        hidden = torch.zeros(batch_size, seq_len, self.config.hidden_size)
+        outputs = {
+            "last_hidden_state": hidden,
+            "decoder_hidden_states": (hidden,),
+        }
+        return outputs
+
+
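+# When the pretrained backbone exposes an LM head, the CLM task model should
+# compute the loss through it: the test expects the -1 padding target to be
+# remapped to -100 in the labels, and eval mode to return a dict of
+# decoded predictions.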
+def test_clm_model_uses_pretrained_head_for_loss():
+    tokenizer = DummyTokenizer(vocab_size=5)
+    pretrained = DummyCausalPretrained()
+    model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
+
+    batch = {
+        "input": torch.zeros((1, 2), dtype=torch.long),
+        "mask": torch.ones((1, 2), dtype=torch.long),
+        "target": torch.tensor([[0, -1]]),
+    }
+
+    loss = model.forward(batch, eval=False)
+    assert torch.is_tensor(loss)
+    assert "labels" in pretrained.last_kwargs
+    assert pretrained.last_kwargs["labels"].tolist() == [[0, -100]]
+
+    eval_batch = {
+        **batch,
+        "start": [0],
+    }
+    result = model.forward(eval_batch, eval=True)
+    assert isinstance(result, dict)
+    assert "max_item" in result
+
+
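+# When the pretrained backbone has no LM head, the CLM task model should still
+# produce a loss via its own linear head and call the backbone without labels.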
+def test_clm_model_falls_back_to_linear_head():
+    tokenizer = DummyTokenizer(vocab_size=5)
+    pretrained = DummyEncoderPretrained()
+    model = CLMModel(tokenizer=tokenizer, pretrained=pretrained)
+
+    batch = {
+        "input": torch.zeros((1, 2), dtype=torch.long),
+        "mask": torch.ones((1, 2), dtype=torch.long),
+        "target": torch.tensor([[0, -1]]),
+    }
+
+    loss = model.forward(batch, eval=False)
+    assert torch.is_tensor(loss)
+    assert pretrained.last_kwargs == {}
+
+
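+# The seq2seq task model should reuse the pretrained model's output-embedding
+# layer as its projection head rather than creating a new one.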
+def test_seq2seq_model_uses_pretrained_output_head():
+    tokenizer = DummyTokenizer(vocab_size=3)
+    pretrained = DummySeq2SeqPretrained()
+    model = Seq2SeqModel(tokenizer=tokenizer, pretrained=pretrained)
+
+    batch = {
+        "input": torch.zeros((1, 1), dtype=torch.long),
+        "prev": torch.zeros((1, 1), dtype=torch.long),
+        "encoder_mask": torch.ones((1, 1), dtype=torch.long),
+        "decoder_mask": torch.ones((1, 1), dtype=torch.long),
+        "target": torch.zeros((1, 1), dtype=torch.long),
+        "ntarget": torch.full((1, 1), -1),
+    }
+
+    loss = model.forward(batch, eval=False)
+    assert torch.is_tensor(loss)
+    assert model.model is pretrained.output_layer