JAX/FLAX scales up worse than Tensorflow #16473
giorgiofranceschelli asked this question in Q&A (unanswered)
Hello,
I started learning JAX/FLAX with a very simple VAE model on MNIST, and when I saw its better performance compared to TensorFlow I decided to move my current project to JAX/FLAX. However, with bigger and more complex architectures, I got worse performance than the original TF implementation. So I went back to the VAE on MNIST and checked whether it scales up correctly, but that does not seem to be the case.
In particular, with this implementation:
and by varying the image dimension, latent size, batch size, or number of convolutional filters, I obtained the following results (see also the sketch after the list below):
with z_dim = 64, bs = 64, image_dim = (28, 28, 1), conv_filters = 32:
with z_dim = 64, bs = 64, image_dim = (64, 64, 1), conv_filters = 32:
with z_dim = 256, bs = 64, image_dim = (28, 28, 1), conv_filters = 32:
with z_dim = 64, bs = 256, image_dim = (28, 28, 1), conv_filters = 32:
with z_dim = 64, bs = 64, image_dim = (28, 28, 1), conv_filters = 128:
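For reference, a minimal sketch of this kind of Flax VAE setup looks like the following. This is a simplified illustration rather than my exact code; the layer layout and variable names are just examples, with z_dim, conv_filters, and image_dim as the knobs varied in the configurations above:

```python
# Illustrative sketch only, not the original implementation.
import jax
import jax.numpy as jnp
import flax.linen as nn

Z_DIM = 64           # latent size (base experiment value)
CONV_FILTERS = 32    # convolutional filters (base experiment value)
IMAGE_DIM = (28, 28, 1)

class Encoder(nn.Module):
    z_dim: int
    filters: int

    @nn.compact
    def __call__(self, x):
        # Two strided convolutions downsample the image by a factor of 4.
        x = nn.relu(nn.Conv(self.filters, (3, 3), strides=(2, 2))(x))
        x = nn.relu(nn.Conv(self.filters * 2, (3, 3), strides=(2, 2))(x))
        x = x.reshape((x.shape[0], -1))
        mean = nn.Dense(self.z_dim)(x)
        logvar = nn.Dense(self.z_dim)(x)
        return mean, logvar

class Decoder(nn.Module):
    filters: int
    image_dim: tuple

    @nn.compact
    def __call__(self, z):
        h, w, c = self.image_dim
        x = nn.relu(nn.Dense((h // 4) * (w // 4) * self.filters * 2)(z))
        x = x.reshape((z.shape[0], h // 4, w // 4, self.filters * 2))
        # Two transposed convolutions upsample back to the original size.
        x = nn.relu(nn.ConvTranspose(self.filters, (3, 3), strides=(2, 2))(x))
        return nn.ConvTranspose(c, (3, 3), strides=(2, 2))(x)  # logits

class VAE(nn.Module):
    z_dim: int = Z_DIM
    filters: int = CONV_FILTERS
    image_dim: tuple = IMAGE_DIM

    @nn.compact
    def __call__(self, x, rng):
        mean, logvar = Encoder(self.z_dim, self.filters)(x)
        # Reparameterization trick: z = mean + std * eps.
        eps = jax.random.normal(rng, mean.shape)
        z = mean + jnp.exp(0.5 * logvar) * eps
        logits = Decoder(self.filters, self.image_dim)(z)
        return logits, mean, logvar

# Initialise parameters on a dummy batch, just to show the expected shapes.
rng = jax.random.PRNGKey(0)
model = VAE()
params = model.init(rng, jnp.ones((64, *IMAGE_DIM)), rng)
```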
I ran everything on GPU with Google Colab.
As you can see, JAX/FLAX is much faster for the base experiment, but falls behind TensorFlow as z_dim, batch_size, image_dim, or conv_filters increase.
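For context, a fair way to time a jitted JAX step on GPU looks roughly like the generic sketch below (not my actual benchmark code, just an illustration with a dummy function): the first call is kept out of the timed loop because it triggers compilation, and block_until_ready() is called before reading the clock because JAX dispatches work asynchronously.

```python
# Generic timing sketch with a dummy jitted function (assumption, not my real benchmark).
import time
import jax
import jax.numpy as jnp

@jax.jit
def step(x):
    # Stand-in for one training step; any jitted computation behaves the same way.
    return jnp.tanh(x @ x.T).sum()

x = jax.random.normal(jax.random.PRNGKey(0), (256, 256))

# The first call compiles the function, so it is excluded from the timed loop.
step(x).block_until_ready()

start = time.perf_counter()
for _ in range(100):
    out = step(x)
# JAX dispatches asynchronously: wait for the device before reading the clock.
out.block_until_ready()
print("mean step time (s):", (time.perf_counter() - start) / 100)
```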
What am I doing wrong? Any help would be appreciated.