Build a CNN for superior performance and deploy your model
While dense neural networks work well for MNIST, Convolutional Neural Networks (CNNs) are specifically designed for image data. They can capture spatial patterns and achieve higher accuracy with fewer parameters.
Let's understand the key components of a CNN before building one.
Input Image (28×28×1)
↓
Conv2D (32 filters, 3×3) + ReLU → 26×26×32
↓
MaxPooling2D (2×2) → 13×13×32
↓
Conv2D (64 filters, 3×3) + ReLU → 11×11×64
↓
MaxPooling2D (2×2) → 5×5×64
↓
Conv2D (64 filters, 3×3) + ReLU → 3×3×64
↓
Flatten → 576 neurons
↓
Dense (128) + ReLU → 128 neurons
↓
Dense (10) + Softmax → 10 classes
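Each arrow follows two simple rules: a 3×3 convolution with 'valid' padding trims 2 pixels from each spatial dimension, and a 2×2 max-pool halves each dimension (rounding down). Here's a quick sketch, purely for illustration, that recomputes the shapes above and checks the parameter-efficiency claim:

# Sketch: recompute the diagram's shapes and parameter counts
def conv_output(size, kernel=3):
    return size - kernel + 1      # 'valid' padding loses kernel-1 pixels

def pool_output(size, pool=2):
    return size // pool           # 2x2 max-pool halves each dimension

size = 28
size = conv_output(size)   # 26 after Conv2D(32, 3x3)
size = pool_output(size)   # 13 after MaxPooling2D(2x2)
size = conv_output(size)   # 11 after Conv2D(64, 3x3)
size = pool_output(size)   # 5 after MaxPooling2D(2x2)
size = conv_output(size)   # 3 after Conv2D(64, 3x3)
print(size * size * 64)    # 576 flattened features

# A Conv2D layer has (kernel_h * kernel_w * in_channels + 1) * filters parameters:
print((3 * 3 * 1 + 1) * 32)   # 320 for the first conv layer
# versus a dense layer mapping all 784 pixels to just 128 neurons:
print(784 * 128 + 128)        # 100,480 parameters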
Let's build a CNN step by step and compare it to our dense networks.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Load and preprocess data for CNN
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Normalize pixel values
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
# Reshape for CNN (add channel dimension)
x_train_cnn = x_train.reshape(-1, 28, 28, 1)
x_test_cnn = x_test.reshape(-1, 28, 28, 1)
# Create validation split
val_size = int(len(x_train_cnn) * 0.2)
x_val_cnn = x_train_cnn[:val_size]
y_val = y_train[:val_size]
x_train_final_cnn = x_train_cnn[val_size:]
y_train_final = y_train[val_size:]
print(f"Training data shape: {x_train_final_cnn.shape}")
print(f"Validation data shape: {x_val_cnn.shape}")
print(f"Test data shape: {x_test_cnn.shape}")
# Build the CNN model
def create_cnn_model():
    """Create a Convolutional Neural Network for MNIST"""
    model = tf.keras.Sequential([
        # First convolutional block
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        # Second convolutional block
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        # Third convolutional block
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        # Flatten and dense layers
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model
# Create and compile the CNN
cnn_model = create_cnn_model()
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# Display model architecture
print("CNN Model Architecture:")
cnn_model.summary()
# Calculate parameters comparison
total_params = cnn_model.count_params()
print(f"\nTotal parameters: {total_params:,}")
print("This is much fewer parameters than a dense network with similar performance!")
Let's visualize what each layer does to understand how CNNs process images.
# Visualize CNN layer outputs
def visualize_cnn_layers(model, image, layer_names=None):
    """Visualize intermediate layer outputs"""
    if layer_names is None:
        layer_names = ['conv2d', 'conv2d_1', 'conv2d_2']

    # Create models that output each intermediate layer
    layer_outputs = []
    for layer_name in layer_names:
        for layer in model.layers:
            if layer_name in layer.name:
                intermediate_model = tf.keras.Model(
                    inputs=model.input,
                    outputs=layer.output
                )
                layer_outputs.append(intermediate_model(image))
                break

    # Plot original image and layer outputs
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))

    # Original image
    axes[0, 0].imshow(image[0, :, :, 0], cmap='gray')
    axes[0, 0].set_title('Original Image')
    axes[0, 0].axis('off')

    # Feature maps from each layer
    for i, layer_output in enumerate(layer_outputs):
        if i < 3:  # Show first 3 layers
            # Show the first few feature maps
            for j in range(min(3, layer_output.shape[-1])):
                if i * 3 + j + 1 < 8:  # Don't exceed subplot count
                    row = (i * 3 + j + 1) // 4
                    col = (i * 3 + j + 1) % 4
                    axes[row, col].imshow(layer_output[0, :, :, j], cmap='viridis')
                    axes[row, col].set_title(f'Layer {i+1}, Filter {j+1}')
                    axes[row, col].axis('off')

    plt.tight_layout()
    plt.show()
# Select a test image and visualize layers
test_image = x_test_cnn[0:1] # First test image
print(f"Visualizing layers for digit: {y_test[0]}")
# Train model briefly to get meaningful feature maps
print("Training CNN for a few epochs to see feature maps...")
cnn_model.fit(x_train_final_cnn, y_train_final,
              epochs=3, batch_size=32, verbose=1)
# Now visualize the layers
visualize_cnn_layers(cnn_model, test_image)
Let's train our CNN and compare its performance to dense networks.
# Train the CNN model
print("Training CNN model...")
cnn_history = cnn_model.fit(
    x_train_final_cnn, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val_cnn, y_val),
    verbose=1
)
# Evaluate on test set
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(x_test_cnn, y_test, verbose=0)
print(f"\nCNN Test Results:")
print(f"Test Accuracy: {cnn_test_acc:.4f} ({cnn_test_acc*100:.2f}%)")
print(f"Test Loss: {cnn_test_loss:.4f}")
# Plot training history
def plot_training_history(history, title="Training History"):
    """Plot training and validation metrics"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Plot accuracy
    ax1.plot(history.history['accuracy'], 'b-', label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], 'r-', label='Validation Accuracy')
    ax1.set_title(f'{title} - Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    ax1.grid(True)

    # Plot loss
    ax2.plot(history.history['loss'], 'b-', label='Training Loss')
    ax2.plot(history.history['val_loss'], 'r-', label='Validation Loss')
    ax2.set_title(f'{title} - Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()
# Plot CNN training history
plot_training_history(cnn_history, "CNN Model")
# Compare with a dense network
print("\nTraining comparable dense network for comparison...")
dense_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])
dense_model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
dense_history = dense_model.fit(
    x_train_final_cnn, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val_cnn, y_val),
    verbose=0
)
dense_test_loss, dense_test_acc = dense_model.evaluate(x_test_cnn, y_test, verbose=0)
# Compare results
print(f"\n=== MODEL COMPARISON ===")
print(f"CNN Test Accuracy: {cnn_test_acc:.4f} ({cnn_test_acc*100:.2f}%)")
print(f"Dense Test Accuracy: {dense_test_acc:.4f} ({dense_test_acc*100:.2f}%)")
print(f"CNN Parameters: {cnn_model.count_params():,}")
print(f"Dense Parameters: {dense_model.count_params():,}")
print(f"Improvement: {(cnn_test_acc - dense_test_acc)*100:.2f}% better accuracy with fewer parameters!")
Let's implement some advanced techniques to push our CNN performance even higher.
# Advanced CNN with data augmentation and regularization
def create_advanced_cnn():
    """Create an advanced CNN with modern techniques"""
    model = tf.keras.Sequential([
        # Declare the input shape on the first layer of the model
        tf.keras.Input(shape=(28, 28, 1)),
        # Data augmentation (active only during training)
        tf.keras.layers.RandomRotation(0.1),
        tf.keras.layers.RandomZoom(0.1),
        # First convolutional block
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.25),
        # Second convolutional block
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.25),
        # Dense layers
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model
# Create and train advanced CNN
advanced_cnn = create_advanced_cnn()
# Use advanced training techniques
advanced_cnn.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Advanced callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3),
    tf.keras.callbacks.ModelCheckpoint('best_cnn_model.h5', save_best_only=True)
]
print("Training advanced CNN...")
advanced_history = advanced_cnn.fit(
    x_train_final_cnn, y_train_final,
    epochs=30,
    batch_size=32,
    validation_data=(x_val_cnn, y_val),
    callbacks=callbacks,
    verbose=1
)
# Evaluate advanced model
advanced_test_loss, advanced_test_acc = advanced_cnn.evaluate(x_test_cnn, y_test, verbose=0)
print(f"\nAdvanced CNN Test Accuracy: {advanced_test_acc:.4f} ({advanced_test_acc*100:.2f}%)")
# Plot comparison
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(cnn_history.history['val_accuracy'], label='Basic CNN')
plt.plot(advanced_history.history['val_accuracy'], label='Advanced CNN')
plt.title('Validation Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.bar(['Basic CNN', 'Advanced CNN'], [cnn_test_acc, advanced_test_acc])
plt.title('Final Test Accuracy')
plt.ylabel('Accuracy')
plt.ylim(0.98, 1.0)
# Add value labels
for i, v in enumerate([cnn_test_acc, advanced_test_acc]):
    plt.text(i, v + 0.0005, f'{v:.4f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
Now let's learn how to save, load, and deploy your trained models for real-world use.
# Save your best model
print("Saving the best model...")
# Method 1: Save entire model
# (On Keras 3, prefer the native format instead: advanced_cnn.save('model.keras'))
advanced_cnn.save('mnist_cnn_complete_model.h5')
print("Complete model saved as 'mnist_cnn_complete_model.h5'")

# Method 2: Save in SavedModel format (recommended for deployment)
# Note: save_format='tf' applies to Keras 2 / TF 2.x; on Keras 3 use
# advanced_cnn.export('mnist_cnn_savedmodel') instead.
advanced_cnn.save('mnist_cnn_savedmodel', save_format='tf')
print("Model saved in SavedModel format")

# Method 3: Save weights only
advanced_cnn.save_weights('mnist_cnn_weights.h5')
print("Weights saved as 'mnist_cnn_weights.h5'")
# Load and test the saved model
print("\nLoading and testing saved model...")
loaded_model = tf.keras.models.load_model('mnist_cnn_complete_model.h5')
# Test loaded model
loaded_test_loss, loaded_test_acc = loaded_model.evaluate(x_test_cnn, y_test, verbose=0)
print(f"Loaded model test accuracy: {loaded_test_acc:.4f}")
print("✅ Model saved and loaded successfully!")
# Test inference
sample_image = x_test_cnn[0:1]
original_prediction = advanced_cnn.predict(sample_image, verbose=0)
loaded_prediction = loaded_model.predict(sample_image, verbose=0)
print(f"Original prediction: {np.argmax(original_prediction)}")
print(f"Loaded prediction: {np.argmax(loaded_prediction)}")
print(f"Predictions match: {np.argmax(original_prediction) == np.argmax(loaded_prediction)}")
Let's create a simple function to use your model in production.
# Create inference function
def mnist_predictor(model_path, image_array):
    """
    Predict a digit from an MNIST-like image.

    Args:
        model_path: Path to a saved model.
        image_array: 28x28 or 28x28x1 numpy array.

    Returns:
        dict: Prediction results.
    """
    # Load the model (reloading on every call is simple but slow;
    # see the load-once sketch below)
    model = tf.keras.models.load_model(model_path)

    # Add batch and channel dimensions as needed
    if image_array.ndim in (2, 3):
        image_array = image_array.reshape(1, 28, 28, 1)

    # Normalize if the image still uses 0-255 pixel values
    if image_array.max() > 1.0:
        image_array = image_array.astype('float32') / 255.0

    # Make prediction
    predictions = model.predict(image_array, verbose=0)
    predicted_class = np.argmax(predictions[0])
    confidence = np.max(predictions[0])

    # Return results
    return {
        'predicted_digit': int(predicted_class),
        'confidence': float(confidence),
        'all_probabilities': predictions[0].tolist()
    }
# Test the inference function
print("Testing inference function...")
test_image = x_test[0]  # shape (28, 28), already normalized above
result = mnist_predictor('mnist_cnn_complete_model.h5', test_image)
print(f"Predicted digit: {result['predicted_digit']}")
print(f"Confidence: {result['confidence']:.4f}")
print(f"Actual digit: {y_test[0]}")
print(f"Correct prediction: {result['predicted_digit'] == y_test[0]}")
# Test with multiple images
print("\nTesting on multiple images:")
correct_predictions = 0
total_predictions = 10
for i in range(total_predictions):
    result = mnist_predictor('mnist_cnn_complete_model.h5', x_test[i])
    is_correct = result['predicted_digit'] == y_test[i]
    correct_predictions += is_correct
    print(f"Image {i}: Predicted {result['predicted_digit']}, "
          f"Actual {y_test[i]}, "
          f"Confidence {result['confidence']:.3f}, "
          f"{'✅' if is_correct else '❌'}")
accuracy = correct_predictions / total_predictions
print(f"\nSample accuracy: {accuracy:.2f} ({accuracy*100:.0f}%)")
Create your ultimate MNIST classifier and deployment pipeline!
Build a complete end-to-end solution by filling in the skeleton below:
# Your ultimate MNIST solution
class MNISTClassifier:
    """Complete MNIST classification solution"""

    def __init__(self):
        self.model = None
        self.model_loaded = False

    def build_model(self):
        """Build the ultimate CNN architecture"""
        # Your best CNN architecture here
        pass

    def train(self, x_train, y_train, x_val, y_val):
        """Train the model with best practices"""
        # Your training code here
        pass

    def save_model(self, filepath):
        """Save the trained model"""
        # Your saving code here
        pass

    def load_model(self, filepath):
        """Load a saved model"""
        # Your loading code here
        pass

    def predict(self, image):
        """Predict digit from image with validation"""
        # Your prediction code with error handling
        pass

    def evaluate(self, x_test, y_test):
        """Comprehensive model evaluation"""
        # Your evaluation code here
        pass
# Test your complete solution
classifier = MNISTClassifier()
# ... implement and test your solution
# Document your results
print("=== FINAL MODEL PERFORMANCE ===")
# Your performance documentation here
Try these advanced techniques:
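For example, a common deployment step is converting the model for mobile or edge devices with TensorFlow Lite. A minimal sketch (the output filename is just a placeholder, and depending on your TF version you may need to strip the augmentation layers before converting):

# Sketch: convert the trained Keras model to TensorFlow Lite
converter = tf.lite.TFLiteConverter.from_keras_model(advanced_cnn)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # optional quantization
tflite_model = converter.convert()
with open('mnist_cnn.tflite', 'wb') as f:
    f.write(tflite_model)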
You've completed the TensorFlow & Neural Networks course! Here's what you've mastered: loading and preprocessing image data, building dense and convolutional networks, regularizing with dropout and batch normalization, augmenting data, training with callbacks, and saving, loading, and deploying models for inference.
Your Achievement: You can now build, train, and deploy neural networks that achieve >99% accuracy on MNIST! These skills transfer to many real-world computer vision problems.
You've built a solid foundation in neural networks! Here are suggested next steps:
Consider taking the Quantitative Trading with ML course next to see how these techniques apply to financial markets!