|
111 | 111 | "import os\n",
|
112 | 112 | "from matplotlib import pyplot as plt\n",
|
113 | 113 | "from jax.experimental import jax2tf\n",
|
114 |
| - "from threading import Lock # only used in the visualization utility\n", |
| 114 | + "from threading import Lock # Only used in the visualization utility.\n", |
115 | 115 | "from functools import partial"
|
116 | 116 | ]
|
117 | 117 | },
|
|
123 | 123 | },
|
124 | 124 | "outputs": [],
|
125 | 125 | "source": [
|
126 |
| - "# Needed for TF and JAX to coexist in GPU memory\n", |
| 126 | + "# Needed for TensorFlow and JAX to coexist in GPU memory.\n", |
127 | 127 | "os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = \"false\"\n",
|
128 | 128 | "gpus = tf.config.list_physical_devices('GPU')\n",
|
129 | 129 | "if gpus:\n",
|
130 | 130 | " try:\n",
|
131 | 131 | " for gpu in gpus:\n",
|
132 | 132 | " tf.config.experimental.set_memory_growth(gpu, True)\n",
|
133 | 133 | " except RuntimeError as e:\n",
|
134 |
| - " # Memory growth must be set before GPUs have been initialized\n", |
| 134 | + " # Memory growth must be set before GPUs have been initialized.\n", |
135 | 135 | " print(e)"
|
136 | 136 | ]
|
137 | 137 | },
|
|
148 | 148 | "\n",
|
149 | 149 | "plt.rcParams[\"figure.figsize\"] = (20,8)\n",
|
150 | 150 | "\n",
|
151 |
| - "# utility to display training and validation curves\n", |
| 151 | + "# The utility for displaying training and validation curves.\n", |
152 | 152 | "def display_train_curves(loss, avg_loss, eval_loss, eval_accuracy, epochs, steps_per_epochs, ignore_first_n=10):\n",
|
153 | 153 | "\n",
|
154 | 154 | " ignore_first_n_epochs = int(ignore_first_n/steps_per_epochs)\n",
|
155 | 155 | "\n",
|
156 |
| - " # Losses\n", |
| 156 | + " # The losses.\n", |
157 | 157 | " ax = plt.subplot(121)\n",
|
158 | 158 | " if loss is not None:\n",
|
159 | 159 | " x = np.arange(len(loss)) / steps_per_epochs #* epochs\n",
|
|
172 | 172 | " ax.set_ylim(ymin-(ymax-ymin)/10, ymax+(ymax-ymin)/10)\n",
|
173 | 173 | " ax.legend(['avg train', 'eval'])\n",
|
174 | 174 | "\n",
|
175 |
| - " #Accuracy\n", |
| 175 | + " # The accuracy.\n", |
176 | 176 | " ax = plt.subplot(122)\n",
|
177 | 177 | " ax.set_title('Eval Accuracy')\n",
|
178 | 178 | " ax.set_ylabel('accuracy')\n",
|
|
197 | 197 | " :param msg: the message displayed in the header of the progress bar\n",
|
198 | 198 | " \"\"\"\n",
|
199 | 199 | " self.maxi = maxi\n",
|
200 |
| - " self.p = self.__start_progress(maxi)() # () to get the iterator from the generator\n", |
| 200 | + " self.p = self.__start_progress(maxi)() # `()`: to get the iterator from the generator.\n", |
201 | 201 | " self.header_printed = False\n",
|
202 | 202 | " self.msg = msg\n",
|
203 | 203 | " self.size = size\n",
|
|
232 | 232 | " d -= dx\n",
|
233 | 233 | " d += dy\n",
|
234 | 234 | " yield k\n",
|
235 |
| - " # keep yielding the last result if too many steps\n", |
| 235 | + " # Keep yielding the last result if there are too many steps.\n", |
236 | 236 | " while True:\n",
|
237 | 237 | " yield k\n",
|
238 | 238 | "\n",
|
|
295 | 295 | },
|
296 | 296 | "outputs": [],
|
297 | 297 | "source": [
|
298 |
| - "# Training hyperparams\n", |
| 298 | + "# Training hyperparameters.\n", |
299 | 299 | "JAX_EPOCHS = 3\n",
|
300 | 300 | "TF_EPOCHS = 7\n",
|
301 | 301 | "STEPS_PER_EPOCH = len(train_labels)//BATCH_SIZE\n",
|
302 | 302 | "LEARNING_RATE = 0.01\n",
|
303 | 303 | "LEARNING_RATE_EXP_DECAY = 0.6\n",
|
304 | 304 | "\n",
|
305 |
| - "# Learning Rate schedule for JAX\n", |
| 305 | + "# The learning rate schedule for JAX (with Optax).\n", |
306 | 306 | "jlr_decay = optax.exponential_decay(LEARNING_RATE, transition_steps=STEPS_PER_EPOCH, decay_rate=LEARNING_RATE_EXP_DECAY, staircase=True)\n",
|
307 | 307 | "\n",
|
308 |
| - "# Learning Rate schedule for TF\n", |
| 308 | + "# THe learning rate schedule for TensorFlow.\n", |
309 | 309 | "tflr_decay = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=LEARNING_RATE, decay_steps=STEPS_PER_EPOCH, decay_rate=LEARNING_RATE_EXP_DECAY, staircase=True)"
|
310 | 310 | ]
|
311 | 311 | },
|
|
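Since the two schedules above are meant to be equivalent, a quick sanity check can confirm it. The following sketch is not part of the notebook; it only reuses the variables defined in the cell above (`STEPS_PER_EPOCH`, `jlr_decay`, `tflr_decay`):

```python
# Hypothetical check: both schedules are staircase exponential decays with the
# same parameters, so they should agree at every step count.
for step in (0, STEPS_PER_EPOCH - 1, STEPS_PER_EPOCH, 3 * STEPS_PER_EPOCH):
    print(step, float(jlr_decay(step)), float(tflr_decay(step)))
```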
341 | 341 | " #x = flax.linen.log_softmax(x)\n",
|
342 | 342 | " return x\n",
|
343 | 343 | "\n",
|
344 |
| - " # JAX differentiation requires a function f(params, other_state, data, labels) -> loss (as a single number)\n", |
345 |
| - " # jax.grad will differentiate it against the fist argument.\n", |
346 |
| - " # The user must split trainable and non-trainable variables into \"params\" and \"other_state\"\n", |
347 |
| - " # Must pass a different RNG Key each time for dropout mask to be different\n", |
| 344 | + " # JAX differentiation requires a function `f(params, other_state, data, labels)` -> `loss` (as a single number).\n", |
| 345 | + " # `jax.grad` will differentiate it against the fist argument.\n", |
| 346 | + " # The user must split trainable and non-trainable variables into `params` and `other_state`.\n", |
| 347 | + " # Must pass a different RNG key each time for the dropout mask to be different.\n", |
348 | 348 | " def loss(self, params, other_state, rng, data, labels, train):\n",
|
349 | 349 | " logits, batch_stats = self.apply({'params': params, **other_state},\n",
|
350 | 350 | " data,\n",
|
351 | 351 | " mutable=['batch_stats'],\n",
|
352 | 352 | " rngs={'dropout': rng},\n",
|
353 | 353 | " train=train)\n",
|
354 |
| - " # loss averaged across batch dimension\n", |
| 354 | + " # The loss averaged across the batch dimension.\n", |
355 | 355 | " loss = optax.softmax_cross_entropy(logits, labels).mean()\n",
|
356 | 356 | " return loss, batch_stats\n",
|
357 | 357 | "\n",
|
|
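As a sketch of how the comments above translate into code (this is not in the diff; it assumes the notebook's `model`, `state`, `rng` and a NumPy batch `data`, `labels`), the loss would typically be differentiated with `jax.value_and_grad`, using `has_aux=True` because the loss also returns the mutated batch statistics:

```python
# Minimal sketch: gradients flow only into the first positional argument (params).
other_state, params = state.pop('params')   # Same trainable / non-trainable split the notebook uses.
grad_fn = jax.value_and_grad(model.loss, has_aux=True)
(loss_value, batch_stats), grads = grad_fn(params, other_state, rng, data, labels, train=True)
```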
374 | 374 | "id": "7Cr0FRNFtHN4"
|
375 | 375 | },
|
376 | 376 | "source": [
|
377 |
| - "## Write the train_step" |
| 377 | + "## Write the training step function" |
378 | 378 | ]
|
379 | 379 | },
|
380 | 380 | {
|
|
385 | 385 | },
|
386 | 386 | "outputs": [],
|
387 | 387 | "source": [
|
388 |
| - "# Training step\n", |
| 388 | + "# The training step.\n", |
389 | 389 | "@partial(jax.jit, static_argnums=[0]) # this forces jax.jit to recompile for every new model\n",
|
390 | 390 | "def train_step(model, state, optimizer_state, rng, data, labels):\n",
|
391 | 391 | "\n",
|
|
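The body of `train_step` lies outside the changed lines shown here. As a rough, hypothetical sketch of the usual Optax pattern such a step follows (the notebook's actual implementation may differ), it extends the gradient computation sketched after the loss function with an optimizer update; `optimizer` is the Optax optimizer created in the next section, and the `jax.jit` decoration is omitted for readability:

```python
def train_step_sketch(model, state, optimizer_state, rng, data, labels):
    rng, dropout_rng = jax.random.split(rng)        # Fresh key so each dropout mask differs.
    other_state, params = state.pop('params')
    grad_fn = jax.value_and_grad(model.loss, has_aux=True)
    (loss, batch_stats), grads = grad_fn(params, other_state, dropout_rng, data, labels, train=True)
    updates, optimizer_state = optimizer.update(grads, optimizer_state, params)
    new_params = optax.apply_updates(params, updates)
    # Re-assemble the Flax state; assumes 'batch_stats' is the only non-trainable collection.
    new_state = {'params': new_params, **batch_stats}
    return new_state, optimizer_state, rng, loss
```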
423 | 423 | " rng = jax.random.PRNGKey(0)\n",
|
424 | 424 | " for epoch in range(epochs):\n",
|
425 | 425 | "\n",
|
426 |
| - " # this is where the learning rate schedule state is stored in the optimizer state\n", |
| 426 | + " # This is where the learning rate schedule state is stored in the optimizer state.\n", |
427 | 427 | " optimizer_step = optimizer_state[1].count\n",
|
428 | 428 | "\n",
|
429 |
| - " # run an epoch of training\n", |
| 429 | + " # Run an epoch of training.\n", |
430 | 430 | " for step, (data, labels) in enumerate(train_data):\n",
|
431 | 431 | " p.step(reset=(step==0))\n",
|
432 | 432 | " state, optimizer_state, rng, loss = train_step(model, state, optimizer_state, rng, data.numpy(), labels.numpy())\n",
|
433 | 433 | " losses.append(loss)\n",
|
434 | 434 | " avg_loss = np.mean(losses[-step:])\n",
|
435 | 435 | " avg_losses.append(avg_loss)\n",
|
436 | 436 | "\n",
|
437 |
| - " # run one epoch of evals (10,000 test images in a single batch)\n", |
| 437 | + " # Run one epoch of evals (10,000 test images in a single batch).\n", |
438 | 438 | " other_state, params = state.pop('params')\n",
|
439 | 439 | " # Gotcha: must discard modified batch_stats here\n",
|
440 | 440 | " eval_loss, _ = model.loss(params, other_state, rng, all_test_data.numpy(), all_test_labels.numpy(), train=False)\n",
|
|
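To make the comment about the optimizer state concrete: the step counter the notebook reads via `optimizer_state[1].count` can also be fed back into the schedule to recover the current learning rate. This sketch only reuses constructs already present in the notebook (`optimizer_state`, `jlr_decay`):

```python
# Sketch: recover the current step and learning rate from the Optax optimizer state.
current_step = int(optimizer_state[1].count)
print("step:", current_step, "learning rate:", float(jlr_decay(current_step)))
```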
453 | 453 | "id": "DGB3W5g0Wt1H"
|
454 | 454 | },
|
455 | 455 | "source": [
|
456 |
| - "## Create the model, and optimizer (with Optax)" |
| 456 | + "## Create the model and the optimizer (with Optax)" |
457 | 457 | ]
|
458 | 458 | },
|
459 | 459 | {
|
|
464 | 464 | },
|
465 | 465 | "outputs": [],
|
466 | 466 | "source": [
|
467 |
| - "# Model\n", |
| 467 | + "# The model.\n", |
468 | 468 | "model = ConvModel()\n",
|
469 | 469 | "state = model.init({'params':jax.random.PRNGKey(0), 'dropout':jax.random.PRNGKey(0)}, one_batch, train=True) # Flax allows a separate RNG for \"dropout\"\n",
|
470 | 470 | "\n",
|
471 |
| - "# Optimizer\n", |
| 471 | + "# The optimizer.\n", |
472 | 472 | "optimizer = optax.adam(learning_rate=jlr_decay) # Gotcha: it does not seem to be possible to pass just a callable as LR, must be an Optax Schedule\n",
|
473 | 473 | "optimizer_state = optimizer.init(state['params'])\n",
|
474 | 474 | "\n",
|
|
531 | 531 | "model = ConvModel()\n",
|
532 | 532 | "state = model.init({'params':jax.random.PRNGKey(0), 'dropout':jax.random.PRNGKey(0)}, one_batch, train=True) # Flax allows a separate RNG for \"dropout\"\n",
|
533 | 533 | "\n",
|
534 |
| - "# Optimizer\n", |
| 534 | + "# The optimizer.\n", |
535 | 535 | "optimizer = optax.adam(learning_rate=jlr_decay) # LR must be an Optax LR Schedule\n",
|
536 | 536 | "optimizer_state = optimizer.init(state['params'])\n",
|
537 | 537 | "\n",
|
|
567 | 567 | },
|
568 | 568 | "source": [
|
569 | 569 | "## Save just enough for inference\n",
|
570 |
| - "If your goal is deploy your JAX model (so you can run inference using `model.predict()`), simply exporting it to [SavedModel](https://www.tensorflow.org/guide/saved_model) is sufficient. This section demonstrates how to accomplish that." |
| 570 | + "\n", |
| 571 | + "If your goal is to deploy your JAX model (so you can run inference using `model.predict()`), simply exporting it to [SavedModel](https://www.tensorflow.org/guide/saved_model) is sufficient. This section demonstrates how to accomplish that." |
571 | 572 | ]
|
572 | 573 | },
|
573 | 574 | {
|
|
578 | 579 | },
|
579 | 580 | "outputs": [],
|
580 | 581 | "source": [
|
581 |
| - "# test data with different batch size to test polymorphic shapes\n", |
582 |
| - "x,y = next(iter(train_data.unbatch().batch(13)))\n", |
| 582 | + "# Test data with a different batch size to test polymorphic shapes.\n", |
| 583 | + "x, y = next(iter(train_data.unbatch().batch(13)))\n", |
583 | 584 | "\n",
|
584 | 585 | "m = tf.Module()\n",
|
585 |
| - "# wrap JAX state in tf.Variable (needed when calling converted JAX function\n", |
| 586 | + "# Wrap the JAX state in `tf.Variable` (needed when calling the converted JAX function.\n", |
586 | 587 | "state_vars = tf.nest.map_structure(tf.Variable, state)\n",
|
587 |
| - "# keep the wrapped state as flat list (needed in TF fine-tuning)\n", |
| 588 | + "# Keep the wrapped state as flat list (needed in TensorFlow fine-tuning).\n", |
588 | 589 | "m.vars = tf.nest.flatten(state_vars)\n",
|
589 |
| - "# convert the desired JAX function (model.predict)\n", |
| 590 | + "# Convert the desired JAX function (`model.predict`).\n", |
590 | 591 | "predict_fn = jax2tf.convert(model.predict, polymorphic_shapes=[\"...\", \"(b, 28, 28, 1)\"])\n",
|
591 |
| - "# wrap converted function in tf.function with correct TensorSpec (necessary for dynamic shapes to work)\n", |
| 592 | + "# Wrap the converted function in `tf.function` with the correct `tf.TensorSpec` (necessary for dynamic shapes to work).\n", |
592 | 593 | "@tf.function(autograph=False, input_signature=[tf.TensorSpec(shape=(None, 28, 28, 1), dtype=tf.float32)])\n",
|
593 | 594 | "def predict(data):\n",
|
594 | 595 | " return predict_fn(state_vars, data)\n",
|
|
604 | 605 | },
|
605 | 606 | "outputs": [],
|
606 | 607 | "source": [
|
607 |
| - "# test the converted function\n", |
608 |
| - "print(\"converted function predictions:\", np.argmax(m.predict(x).numpy(), axis=-1))\n", |
609 |
| - "#reload the model\n", |
| 608 | + "# Test the converted function.\n", |
| 609 | + "print(\"Converted function predictions:\", np.argmax(m.predict(x).numpy(), axis=-1))\n", |
| 610 | + "# Reload the model.\n", |
610 | 611 | "reloaded_model = tf.saved_model.load(\"./\")\n",
|
611 |
| - "# test the reloaded converted function (should be same result)\n", |
612 |
| - "print(\"reloaded function predictions:\", np.argmax(reloaded_model.predict(x).numpy(), axis=-1))" |
| 612 | + "# Test the reloaded converted function (the result should be the same).\n", |
| 613 | + "print(\"Reloaded function predictions:\", np.argmax(reloaded_model.predict(x).numpy(), axis=-1))" |
613 | 614 | ]
|
614 | 615 | },
|
615 | 616 | {
|
|
725 | 726 | },
|
726 | 727 | "outputs": [],
|
727 | 728 | "source": [
|
728 |
| - "# instantiate the model\n", |
| 729 | + "# Instantiate the model.\n", |
729 | 730 | "tf_model = TFModel(state, model)\n",
|
730 | 731 | "\n",
|
731 |
| - "# save\n", |
| 732 | + "# Save the model.\n", |
732 | 733 | "tf.saved_model.save(tf_model, \"./\")"
|
733 | 734 | ]
|
734 | 735 | },
|
|
751 | 752 | "source": [
|
752 | 753 | "reloaded_model = tf.saved_model.load(\"./\")\n",
|
753 | 754 | "\n",
|
754 |
| - "# test if it works and that the batch size is indeed variable\n", |
| 755 | + "# Test if it works and that the batch size is indeed variable.\n", |
755 | 756 | "x,y = next(iter(train_data.unbatch().batch(13)))\n",
|
756 | 757 | "print(np.argmax(reloaded_model.predict(x).numpy(), axis=-1))\n",
|
757 | 758 | "x,y = next(iter(train_data.unbatch().batch(20)))\n",
|
|
780 | 781 | "source": [
|
781 | 782 | "optimizer = tf.keras.optimizers.Adam(learning_rate=tflr_decay)\n",
|
782 | 783 | "\n",
|
783 |
| - "# set the iteration step for the LR to resume from where it left off in JAX\n", |
| 784 | + "# Set the iteration step for the learning rate to resume from where it left off in JAX.\n", |
784 | 785 | "optimizer.iterations.assign(len(eval_losses)*STEPS_PER_EPOCH)\n",
|
785 | 786 | "\n",
|
786 | 787 | "p = Progress(STEPS_PER_EPOCH)\n",
|
787 | 788 | "\n",
|
788 | 789 | "for epoch in range(JAX_EPOCHS, JAX_EPOCHS+TF_EPOCHS):\n",
|
789 | 790 | "\n",
|
790 |
| - " # this is where the learning rate schedule state is stored in the optimizer state\n", |
| 791 | + " # This is where the learning rate schedule state is stored in the optimizer state.\n", |
791 | 792 | " optimizer_step = optimizer.iterations\n",
|
792 | 793 | "\n",
|
793 | 794 | " for step, (data, labels) in enumerate(train_data):\n",
|
|
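As another hypothetical check (using only variables from the cells above), the Keras schedule evaluated at the assigned iteration count should return the same learning rate the Optax schedule had reached when JAX training stopped:

```python
# Sketch: verify the TensorFlow schedule resumes where the JAX schedule left off.
resumed_step = len(eval_losses) * STEPS_PER_EPOCH   # One eval_losses entry per completed JAX epoch.
print("TensorFlow resumes at LR:", float(tflr_decay(resumed_step)))
print("JAX stopped at LR:", float(jlr_decay(resumed_step)))
```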