Create a feedforward neural network to classify MNIST digits
A neural network is essentially a function with learnable parameters that maps inputs to outputs. For MNIST digit classification, we need to map 28×28 pixel images to one of 10 digit classes (0-9).
Input Layer: 784 neurons (28×28 flattened pixels)
↓
Hidden Layer 1: 128 neurons + ReLU activation
↓
Hidden Layer 2: 64 neurons + ReLU activation
↓
Output Layer: 10 neurons + Softmax activation (one per digit)
Total Parameters: ~109,000 trainable parameters!
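Where does that number come from? Each Dense layer contributes (inputs × units) weights plus one bias per unit, so we can verify the total with a quick back-of-the-envelope calculation:
# Verify the parameter count: each Dense layer has (inputs * units) weights + units biases
layer_sizes = [784, 128, 64, 10]
total = sum(n_in * n_out + n_out for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]))
print(f"Expected trainable parameters: {total:,}")  # 109,386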
Let's start by importing libraries and loading our preprocessed MNIST data.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from tensorflow.keras.utils import plot_model
# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")
# Load and preprocess MNIST data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Normalize pixel values to [0, 1]
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
# Flatten images for our dense network
x_train_flat = x_train.reshape(-1, 28 * 28)
x_test_flat = x_test.reshape(-1, 28 * 28)
print(f"Training data shape: {x_train_flat.shape}")
print(f"Test data shape: {x_test_flat.shape}")
print(f"Number of classes: {len(np.unique(y_train))}")
Keras provides multiple ways to build models. The Sequential API is perfect for stacking layers one after another.
# Create a Sequential model
model = tf.keras.Sequential([
    # Hidden layer 1 (the 784-dim input layer is defined implicitly via input_shape)
    tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
    # Hidden layer 2
    tf.keras.layers.Dense(64, activation='relu'),
    # Output layer: one probability per digit
    tf.keras.layers.Dense(10, activation='softmax')
])
# Display model architecture
print("Model Architecture:")
model.summary()
# Visualize the model (optional; requires the pydot and graphviz packages)
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)
# Alternative way to build the same model
model_alt = tf.keras.Sequential()
# Add layers one by one
model_alt.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)))
model_alt.add(tf.keras.layers.Dense(64, activation='relu'))
model_alt.add(tf.keras.layers.Dense(10, activation='softmax'))
print("Alternative model summary:")
model_alt.summary()
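Beyond Sequential, Keras also offers the Functional API, which becomes useful once models have multiple inputs, outputs, or branches. Here is a sketch of the same architecture written that way (equivalent to the models above):
# The same architecture using the Functional API
inputs = tf.keras.Input(shape=(784,))
h = tf.keras.layers.Dense(128, activation='relu')(inputs)
h = tf.keras.layers.Dense(64, activation='relu')(h)
outputs = tf.keras.layers.Dense(10, activation='softmax')(h)
model_functional = tf.keras.Model(inputs=inputs, outputs=outputs)
model_functional.summary()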
# Let's examine a single Dense layer
single_layer = tf.keras.layers.Dense(64, activation='relu', input_shape=(784,))
# Create a mini-model to see the layer in action
test_model = tf.keras.Sequential([single_layer])
test_model.build(input_shape=(None, 784))
print(f"Layer weights shape: {single_layer.weights[0].shape}") # (784, 64)
print(f"Layer bias shape: {single_layer.weights[1].shape}") # (64,)
print(f"Total parameters in this layer: {784 * 64 + 64}")
# Test with a single image
test_input = x_train_flat[0:1] # Shape: (1, 784)
test_output = test_model(test_input)
print(f"Input shape: {test_input.shape}")
print(f"Output shape: {test_output.shape}")
print(f"Output sample: {test_output[0][:10].numpy()}") # First 10 values
# Compare different activation functions
x = np.linspace(-5, 5, 100)
# ReLU: max(0, x)
relu = np.maximum(0, x)
# Sigmoid: 1 / (1 + e^(-x))
sigmoid = 1 / (1 + np.exp(-x))
# Tanh: (e^x - e^(-x)) / (e^x + e^(-x))
tanh = np.tanh(x)
# Plot comparison
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.plot(x, relu, 'b-', linewidth=2)
plt.title('ReLU Activation')
plt.xlabel('Input')
plt.ylabel('Output')
plt.grid(True)
plt.subplot(1, 3, 2)
plt.plot(x, sigmoid, 'r-', linewidth=2)
plt.title('Sigmoid Activation')
plt.xlabel('Input')
plt.ylabel('Output')
plt.grid(True)
plt.subplot(1, 3, 3)
plt.plot(x, tanh, 'g-', linewidth=2)
plt.title('Tanh Activation')
plt.xlabel('Input')
plt.ylabel('Output')
plt.grid(True)
plt.tight_layout()
plt.show()
print("Activation function properties:")
print("ReLU: Fast, helps with vanishing gradients, but can 'die'")
print("Sigmoid: Smooth, but suffers from vanishing gradients")
print("Tanh: Zero-centered, better than sigmoid for hidden layers")
Before training, we need to compile the model by specifying the optimizer, loss function, and metrics.
# Compile the model
model.compile(
    optimizer='adam',                        # Adaptive learning rate optimizer
    loss='sparse_categorical_crossentropy',  # For integer labels
    metrics=['accuracy']                     # Track accuracy during training
)
print("Model compiled successfully!")
print("\nCompilation choices explained:")
print("Optimizer: Adam - adaptive learning rate, works well for most problems")
print("Loss: Sparse categorical crossentropy - for multi-class classification with integer labels")
print("Metrics: Accuracy - percentage of correct predictions")
# Alternative compilation with different settings
model_alt.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
print("\nAlternative model compiled with explicit optimizer, loss, and metric objects.")
Let's examine our model more closely before training.
# Detailed model information
print("=== MODEL SUMMARY ===")
model.summary()
print(f"\n=== TRAINABLE PARAMETERS ===")
total_params = model.count_params()
print(f"Total parameters: {total_params:,}")
print(f"\n=== LAYER DETAILS ===")
for i, layer in enumerate(model.layers):
print(f"Layer {i}: {layer.name}")
print(f" Type: {type(layer).__name__}")
print(f" Output shape: {layer.output_shape}")
print(f" Parameters: {layer.count_params():,}")
if hasattr(layer, 'activation'):
print(f" Activation: {layer.activation.__name__}")
print()
# Test forward pass with a single example
print("=== FORWARD PASS TEST ===")
test_image = x_train_flat[0:1] # Single image
predictions = model(test_image)
print(f"Input shape: {test_image.shape}")
print(f"Output shape: {predictions.shape}")
print(f"Raw predictions: {predictions[0].numpy()}")
print(f"Predicted class: {np.argmax(predictions[0])}")
print(f"Actual class: {y_train[0]}")
print(f"Confidence: {np.max(predictions[0]):.4f}")
Let's see what our untrained model predicts - this will be essentially random!
# Test untrained model on several examples
num_samples = 5
test_indices = np.random.choice(len(x_train), num_samples, replace=False)
print("=== UNTRAINED MODEL PREDICTIONS ===")
plt.figure(figsize=(15, 6))
for i, idx in enumerate(test_indices):
    # Get prediction
    image = x_train_flat[idx:idx+1]
    prediction = model(image)
    predicted_class = np.argmax(prediction[0])
    confidence = np.max(prediction[0])
    actual_class = y_train[idx]
    # Plot the image
    plt.subplot(2, num_samples, i+1)
    plt.imshow(x_train[idx], cmap='gray')
    plt.title(f'Actual: {actual_class}')
    plt.axis('off')
    # Plot the prediction probabilities
    plt.subplot(2, num_samples, i+1+num_samples)
    plt.bar(range(10), prediction[0])
    plt.title(f'Pred: {predicted_class} ({confidence:.2f})')
    plt.xlabel('Digit')
    plt.ylabel('Probability')
    plt.xticks(range(10))
plt.tight_layout()
plt.show()
print("Notice how the untrained model's predictions are essentially random!")
print("The probabilities are roughly equal across all classes.")
Now it's your turn to build and experiment with neural networks!
Create a neural network with three hidden layers (your choice of sizes) and a 10-way softmax output layer, then compile it:
# Your solution here
def create_custom_model():
    """Create a neural network with 3 hidden layers"""
    model = tf.keras.Sequential([
        # Your code here
        # Add layers according to the specifications above
    ])
    # Compile the model
    # Your code here
    return model
# Test your model
custom_model = create_custom_model()
custom_model.summary()
# Compare parameter counts
print(f"Original model parameters: {model.count_params():,}")
print(f"Your model parameters: {custom_model.count_params():,}")
Create models with different activation functions and compare their architectures:
# Create models with different activations
def create_model_with_activation(activation):
    """Create a model with the specified activation function"""
    # Your code here
    pass
# Test different activations
activations = ['relu', 'tanh', 'sigmoid']
models_dict = {}
for activation in activations:
    models_dict[activation] = create_model_with_activation(activation)
    print(f"\n{activation.upper()} Model:")
    models_dict[activation].summary()
    # Test prediction on the same image
    test_pred = models_dict[activation](x_train_flat[0:1])
    print(f"Sample prediction range: [{test_pred[0].numpy().min():.4f}, {test_pred[0].numpy().max():.4f}]")
Congratulations! You've built your first neural network: you defined a two-hidden-layer architecture with roughly 109,000 parameters, compared ReLU, sigmoid, and tanh activations, compiled the model with the Adam optimizer and sparse categorical crossentropy loss, and inspected its layers, shapes, and predictions.
Your model is ready but untrained - it's making random predictions! In the next lesson, we'll train this network and watch it learn to recognize digits with high accuracy.