# This script builds and trains a Variational Autoencoder (VAE) with TensorFlow/Keras on the MNIST handwritten-digit dataset, then uses the trained decoder to generate new digit images.
#
# Install and import necessary libraries:
# tensorflow for the model, NumPy for array math, Matplotlib to visualize results
#
!pip install tensorflow
import numpy as np # for general numerical work
import matplotlib.pyplot as plt # for plotting generated images
import tensorflow as tf # for building and training the VAE with Keras
#
# Load MNIST dataset
#
mnist = tf.keras.datasets.mnist # MNIST: 28×28 grayscale handwritten-digit images
(x_train, _), (x_test, _) = mnist.load_data() # labels (_) are discarded because the VAE is trained unsupervised
# Cast to float32 before scaling. Dividing the integer image arrays by 255.0
# directly would yield float64 arrays, doubling the memory footprint for no
# benefit (Keras computes in float32 by default).
x_train = x_train.astype("float32") / 255.0 # pixel values [0, 255] -> [0, 1]
x_test = x_test.astype("float32") / 255.0
# The decoder ends in a sigmoid and the reconstruction term is binary
# cross-entropy, so each pixel is treated as a probability; inputs scaled to
# [0, 1] match that output range.
#
# Set hyperparameters
# (control training speed, stability, and how long training runs)
#
learning_rate = 0.001 # intended optimizer step size; NOTE(review): confirm the compile() call actually passes this value to the optimizer
num_steps = 100 # despite the name, this is an epoch count: it is passed as `epochs` to fit() (full passes over the dataset)
batch_size = 64 # number of examples per gradient update (passed as `batch_size` to fit())
#
# Define model architecture
#
latent_dim = 2 # size of the latent vector z; also read by the decoder input and by generate_images
# 2D makes the latent space easy to visualize (e.g., grid or scatter of digits),
# and is enough for a simple demo.
# Encoder layers: image -> parameters (mean, log-variance) of the latent distribution.
# `x` is a scratch name that is rebound at every layer, here and again in the decoder below.
encoder_inputs = tf.keras.Input(shape=(28, 28)) # one (28, 28) image per example
x = tf.keras.layers.Flatten()(encoder_inputs) # flatten to a 784-dimensional vector (28 * 28)
x = tf.keras.layers.Dense(512, activation='relu')(x) # hidden layer (512 units, ReLU) for feature extraction
z_mean = tf.keras.layers.Dense(latent_dim)(x) # z_mean = μ (mean) of the latent distribution; no activation argument is given
z_log_var = tf.keras.layers.Dense(latent_dim)(x) # z_log_var = log σ² (log-variance); a separate Dense head on the same hidden features
# Decoder layers: latent vector -> reconstructed image.
# The decoder has its own Input, so it can be called on latent vectors without going through the encoder.
latent_inputs = tf.keras.Input(shape=(latent_dim,)) # latent vector z of size latent_dim (here 2)
x = tf.keras.layers.Dense(512, activation='relu')(latent_inputs) # hidden layer (512 units, ReLU)
x = tf.keras.layers.Dense(784, activation='sigmoid')(x) # 784 outputs squashed to (0, 1) by the sigmoid, one per pixel
decoder_outputs = tf.keras.layers.Reshape((28, 28))(x) # reshape the 784 values back to a (28, 28) image
#
# Define the sampling function
#
class Sampling(tf.keras.layers.Layer):
    """Sample a latent vector z from N(z_mean, exp(z_log_var)).

    Implements the reparameterization trick: z = μ + σ · ε with ε ~ N(0, I).
    Sampling directly from the distribution would block backpropagation;
    writing z as a differentiable function of μ, σ and an independent noise
    term ε lets gradients flow back into the encoder.
    """

    def call(self, inputs):
        """Return one sampled z for each row of ``z_mean``.

        Args:
            inputs: A pair ``(z_mean, z_log_var)`` of same-shaped tensors,
                where ``z_log_var`` is log σ².

        Returns:
            A tensor with the shape of ``z_mean``.
        """
        z_mean, z_log_var = inputs
        # Draw the noise with z_mean's own (dynamic) shape and dtype, so the
        # layer does not assume a rank-2 input or a float32 compute dtype.
        # tf.random.normal is the public TensorFlow API (generate_images uses
        # it as well) and replaces the legacy Keras backend helper.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        # exp(0.5 * log σ²) = σ
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
#
# Connect the encoder and decoder
#
# Encoder model
# input: image, output: (z_mean, z_log_var, z)
encoder_outputs = Sampling()([z_mean, z_log_var]) # sampled latent vector z produced by the Sampling layer
encoder = tf.keras.Model(inputs=encoder_inputs, outputs=[z_mean, z_log_var, encoder_outputs])
# Decoder model
# input: latent vector z, output: reconstructed image
decoder = tf.keras.Model(inputs=latent_inputs, outputs=decoder_outputs)
# End-to-end functional model: image -> sampled z (index 2 of the encoder outputs) -> reconstruction.
# NOTE(review): the name `vae` is rebound to a VAE(encoder, decoder) instance further down,
# so this functional model appears to be unused — confirm before removing it.
vae_outputs = decoder(encoder(encoder_inputs)[2])
vae = tf.keras.Model(inputs=encoder_inputs, outputs=vae_outputs)
#
# Define the loss function and compile the model
#
# Define the VAE loss within the VAE model class
class VAE(tf.keras.Model):
    """Variational autoencoder trained on reconstruction loss plus KL divergence.

    Wraps an encoder that returns ``(z_mean, z_log_var, z)`` and a decoder that
    maps ``z`` back to an image, and overrides ``train_step`` so that ``fit``
    minimises the loss computed by ``compute_loss``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the two sub-models; ``kwargs`` are forwarded to ``tf.keras.Model``."""
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def compute_loss(self, x):
        """Return the scalar VAE loss for a batch of images ``x``.

        The loss is the per-image reconstruction error plus the per-image KL
        divergence, each averaged over the batch.
        """
        z_mean, z_log_var, z = self.encoder(x)
        reconstructed = self.decoder(z)
        # Mean binary cross-entropy per pixel over the whole batch ...
        reconstruction_loss = tf.reduce_mean(
            tf.keras.losses.binary_crossentropy(x, reconstructed)
        )
        # ... rescaled by the 28 * 28 pixel count so it is on a per-image
        # (summed over pixels) scale.
        reconstruction_loss *= 28 * 28
        # Closed-form KL divergence between q(z|x) = N(z_mean, exp(z_log_var))
        # and the prior p(z) = N(0, I):
        #   KL = -0.5 * sum_j (1 + log σ_j² - μ_j² - σ_j²)
        # The sum runs over the latent dimensions; only the batch axis is
        # averaged. Averaging over the latent axis as well would shrink the KL
        # term by a factor of latent_dim relative to the pixel-summed
        # reconstruction term.
        kl_per_example = -0.5 * tf.reduce_sum(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
        )
        kl_loss = tf.reduce_mean(kl_per_example)
        return reconstruction_loss + kl_loss

    def train_step(self, data):
        """Run one gradient update and return ``{'loss': loss}``.

        ``data`` may be the image batch itself or a tuple whose first element
        is the image batch; any further elements (e.g. targets) are ignored.
        """
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            loss = self.compute_loss(data)
        # Gradients are taken outside the tape context, after the forward pass
        # has been recorded.
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {'loss': loss}
# Instantiate and compile the VAE
vae = VAE(encoder, decoder)
# Build the optimizer explicitly so the `learning_rate` hyperparameter defined
# above is actually applied; the string shortcut 'adam' ignores it and uses the
# optimizer's built-in default.
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
#
# Train the model
#
# Only the images are passed: VAE.train_step reconstructs its own input and
# discards any target array, so supplying x_train a second time as `y` is redundant.
vae.fit(x_train, epochs=num_steps, batch_size=batch_size)
#
# Generate new digits by decoding random samples drawn from the latent prior
#
def generate_images(model, n_images, n_cols=4):
    """Decode random latent vectors and display the results in a grid.

    Args:
        model: Object exposing a ``decoder`` callable that maps a
            ``(n_images, latent_dim)`` batch of latent vectors to images.
        n_images: Number of images to generate.
        n_cols: Number of grid columns. Defaults to 4, the layout this
            function previously hard-coded.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")
    # sample from the latent space (standard normal, matching the prior)
    random_latent_vectors = tf.random.normal(shape=(n_images, latent_dim))
    # decode them to fake images
    generated_images = model.decoder(random_latent_vectors).numpy()
    # number of rows needed to fit n_images into n_cols columns
    n_rows = int(np.ceil(n_images / n_cols))
    # plot the generated images
    plt.figure(figsize=(10, 10))
    for i, image in enumerate(generated_images):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(image.reshape(28, 28), cmap='gray')
        plt.axis('off')
    # Show once, after every subplot has been drawn, so all images share one figure.
    plt.show()
#
# Generate and display images
#
generate_images(vae, 16) # decode 16 random latent samples with the trained model's decoder; at 4 columns this fills a 4×4 grid