 title: English-to-Spanish translation with a sequence-to-sequence Transformer
 author: '[fchollet](https://twitter.com/fchollet)'
 date-created: 2021/05/26
-last-modified: 2023/02/25
+last-modified: 2024/11/18
 description: Implementing a sequence-to-sequence Transformer and training it on a
   machine translation task.
 accelerator: GPU
@@ -174,7 +174,7 @@ using the source sentence and the target words 0 to N.
 As such, the training dataset will yield a tuple `(inputs, targets)`, where:

 - `inputs` is a dictionary with the keys `encoder_inputs` and `decoder_inputs`.
-`encoder_inputs` is the vectorized source sentence and `encoder_inputs` is the target sentence "so far",
+`encoder_inputs` is the vectorized source sentence and `decoder_inputs` is the target sentence "so far",
 that is to say, the words 0 to N used to predict word N+1 (and beyond) in the target sentence.
 - `target` is the target sentence offset by one step:
 it provides the next words in the target sentence -- what the model will try to predict.
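For reference, a minimal sketch of the formatting step that produces the structure described above (not part of this diff) could look like the following, assuming `eng_vectorization` and `spa_vectorization` are the `TextVectorization` layers defined earlier in the example:

```python
def format_dataset(eng, spa):
    # Turn raw English / Spanish strings into integer token ids.
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return (
        {
            "encoder_inputs": eng,  # full source sentence
            "decoder_inputs": spa[:, :-1],  # target words 0 to N ("so far")
        },
        spa[:, 1:],  # target offset by one step: the words to predict
    )
```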
@@ -304,10 +304,7 @@ class PositionalEmbedding(layers.Layer):
         return embedded_tokens + embedded_positions

     def compute_mask(self, inputs, mask=None):
-        if mask is None:
-            return None
-        else:
-            return ops.not_equal(inputs, 0)
+        return ops.not_equal(inputs, 0)

     def get_config(self):
         config = super().get_config()
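In other words, the embedding layer now always reports a padding mask: positions whose token id is 0 are masked out for downstream layers. A standalone illustration of the expression it returns (toy data, independent of the example code):

```python
from keras import ops

# A toy padded batch where token id 0 is the padding token.
token_ids = ops.convert_to_tensor([[5, 12, 7, 0, 0], [3, 9, 0, 0, 0]])

# The same expression used in compute_mask: True for real tokens,
# False for padding positions.
padding_mask = ops.not_equal(token_ids, 0)
print(ops.convert_to_numpy(padding_mask))
# [[ True  True  True False False]
#  [ True  True False False False]]
```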
@@ -344,24 +341,30 @@ class TransformerDecoder(layers.Layer):
         self.layernorm_3 = layers.LayerNormalization()
         self.supports_masking = True

-    def call(self, inputs, encoder_outputs, mask=None):
+    def call(self, inputs, mask=None):
+        inputs, encoder_outputs = inputs
         causal_mask = self.get_causal_attention_mask(inputs)
-        if mask is not None:
-            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
-            padding_mask = ops.minimum(padding_mask, causal_mask)
+
+        if mask is None:
+            inputs_padding_mask, encoder_outputs_padding_mask = None, None
         else:
-            padding_mask = None
+            inputs_padding_mask, encoder_outputs_padding_mask = mask

         attention_output_1 = self.attention_1(
-            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
+            query=inputs,
+            value=inputs,
+            key=inputs,
+            attention_mask=causal_mask,
+            query_mask=inputs_padding_mask,
         )
         out_1 = self.layernorm_1(inputs + attention_output_1)

         attention_output_2 = self.attention_2(
             query=out_1,
             value=encoder_outputs,
             key=encoder_outputs,
-            attention_mask=padding_mask,
+            query_mask=inputs_padding_mask,
+            key_mask=encoder_outputs_padding_mask,
         )
         out_2 = self.layernorm_2(out_1 + attention_output_2)

@@ -408,14 +411,15 @@ encoder = keras.Model(encoder_inputs, encoder_outputs)
 decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
 encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
 x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
-x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
+x = TransformerDecoder(embed_dim, latent_dim, num_heads)([x, encoder_outputs])
 x = layers.Dropout(0.5)(x)
 decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
 decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

-decoder_outputs = decoder([decoder_inputs, encoder_outputs])
 transformer = keras.Model(
-    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
+    {"encoder_inputs": encoder_inputs, "decoder_inputs": decoder_inputs},
+    decoder_outputs,
+    name="transformer",
 )
 ```

@@ -432,7 +436,9 @@ epochs = 1 # This should be at least 30 for convergence
 
 transformer.summary()
 transformer.compile(
-    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
+    "rmsprop",
+    loss=keras.losses.SparseCategoricalCrossentropy(ignore_class=0),
+    metrics=["accuracy"],
 )
 transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)
 ```
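A quick aside on `ignore_class=0`: it tells the loss to skip positions whose target token id is 0, i.e. padding. A small standalone check (toy numbers, not from the example) shows the effect:

```python
import numpy as np
import keras

# One sequence: two real target tokens followed by two padding positions (class 0).
y_true = np.array([[2, 1, 0, 0]])
# Per-position probabilities over a 3-token vocabulary.
y_pred = np.array([[[0.1, 0.2, 0.7],
                    [0.1, 0.7, 0.2],
                    [0.9, 0.05, 0.05],
                    [0.9, 0.05, 0.05]]])

plain = keras.losses.SparseCategoricalCrossentropy()
masked = keras.losses.SparseCategoricalCrossentropy(ignore_class=0)

print(float(plain(y_true, y_pred)))   # padding positions contribute to the loss
print(float(masked(y_true, y_pred)))  # padding positions are ignored
```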
@@ -455,7 +461,12 @@ def decode_sequence(input_sentence):
     decoded_sentence = "[start]"
     for i in range(max_decoded_sentence_length):
         tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
-        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
+        predictions = transformer(
+            {
+                "encoder_inputs": tokenized_input_sentence,
+                "decoder_inputs": tokenized_target_sentence,
+            }
+        )

         # ops.argmax(predictions[0, i, :]) is not a concrete value for jax here
         sampled_token_index = ops.convert_to_numpy(
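The hunk above is truncated, but once the full `decode_sequence` helper is in place, a usage sketch (assuming the `test_pairs` split built earlier in the example) would be:

```python
import random

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print(input_sentence)
    print(decode_sequence(input_sentence))
```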