Skip to content

Commit 3fe24ff

Browse files
Fix output transform, add test to enforce tokenizer consistency (#73)
*Description of changes:* The bin indexes were shifted by one between the input transform and the output transform. Subtracting 1 from the sampled tokens in the output transform leads to the correct reconstruction of the signal. A test is added to ensure the consistency of the Chronos tokenizer. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. Co-authored-by: Lorenzo Stella <stellalo@amazon.com> and Abdul Fatir Ansari <ansarnd@amazon.com>
1 parent 02d1a1d commit 3fe24ff

File tree

2 files changed

+40
-2
lines changed

2 files changed

+40
-2
lines changed

src/chronos/chronos.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def output_transform(
185185
) -> torch.Tensor:
186186
scale_unsqueezed = scale.unsqueeze(-1).unsqueeze(-1)
187187
indices = torch.clamp(
188-
samples - self.config.n_special_tokens,
188+
samples - self.config.n_special_tokens - 1,
189189
min=0,
190190
max=len(self.centers) - 1,
191191
)

test/test_chronos.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,45 @@
77
import torch
88
import pytest
99

10-
from chronos import ChronosConfig, ChronosPipeline
10+
from chronos import ChronosConfig, ChronosPipeline, MeanScaleUniformBins
11+
12+
13+
@pytest.mark.parametrize("n_numerical_tokens", [5, 10, 27])
@pytest.mark.parametrize("n_special_tokens", [2, 5, 13])
def test_tokenizer_consistency(n_numerical_tokens: int, n_special_tokens: int):
    """Round-trip check for the MeanScaleUniformBins tokenizer.

    Feeding the tokenizer its own bin centers through ``input_transform``
    and decoding with ``output_transform`` (with scaling disabled) must
    reproduce the centers exactly — this guards against the off-by-one
    bin-index shift fixed in this commit.
    """
    vocab_size = n_numerical_tokens + n_special_tokens

    cfg = ChronosConfig(
        tokenizer_class="MeanScaleUniformBins",
        tokenizer_kwargs=dict(low_limit=-1.0, high_limit=1.0),
        n_tokens=vocab_size,
        n_special_tokens=n_special_tokens,
        pad_token_id=0,
        eos_token_id=1,
        use_eos_token=True,
        model_type="seq2seq",
        context_length=512,
        prediction_length=64,
        num_samples=20,
        temperature=1.0,
        top_k=50,
        top_p=1.0,
    )

    tok = cfg.create_tokenizer()
    assert isinstance(tok, MeanScaleUniformBins)

    # Use the bin centers themselves as the input series (batched), and pin
    # the scale to one so that mean-scaling becomes a no-op.
    series = tok.centers.unsqueeze(0)
    unit_scale = torch.ones((1,))

    ids, _, _ = tok.input_transform(series, scale=unit_scale)

    # Strip the trailing EOS token and insert a sample dimension, then decode.
    decoded = tok.output_transform(
        ids[:, :-1].unsqueeze(1),
        scale=unit_scale,
    )

    assert (decoded[0, 0, :] == series).all()
1149

1250

1351
@pytest.mark.xfail

0 commit comments

Comments
 (0)