Train your neural network and watch it learn to recognize digits
Training a neural network is like teaching it through examples. The network makes predictions, a loss function measures how far those predictions are from the correct answers, and an optimizer adjusts the network's internal parameters (weights and biases) to improve future predictions.
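To make that adjustment step concrete, here is a minimal, self-contained sketch of a single gradient-descent update (a toy one-weight example, not the lesson's model):

import tensorflow as tf

# Toy "network": a single weight w that should map input 2.0 to target 10.0
w = tf.Variable(3.0)
x, y_true = 2.0, 10.0

with tf.GradientTape() as tape:
    y_pred = w * x                 # the network's prediction
    loss = (y_true - y_pred) ** 2  # squared error: how wrong we are

grad = tape.gradient(loss, w)      # direction that increases the loss
w.assign_sub(0.01 * grad)          # step the other way (learning rate 0.01)
print(f"Updated weight: {w.numpy():.4f}")  # nudged from 3.0 toward the ideal 5.0

Real training repeats this loop millions of times over thousands of weights; Keras hides the bookkeeping behind model.fit().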
Let's prepare our model and data for training.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
# Load and preprocess data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Normalize and reshape
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train_flat = x_train.reshape(-1, 784)
x_test_flat = x_test.reshape(-1, 784)
# Create validation split
validation_split = 0.2
val_size = int(len(x_train_flat) * validation_split)
x_val = x_train_flat[:val_size]
y_val = y_train[:val_size]
x_train_final = x_train_flat[val_size:]
y_train_final = y_train[val_size:]
print(f"Training samples: {len(x_train_final)}")
print(f"Validation samples: {len(x_val)}")
print(f"Test samples: {len(x_test_flat)}")
# Build the model
model = tf.keras.Sequential([
tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(10, activation='softmax')
])
# Compile the model
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
print("Model ready for training!")
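Before training, it's worth verifying the architecture. Keras can print a per-layer summary of output shapes and parameter counts:

# Inspect the architecture layer by layer
# For this model: 100,480 + 8,256 + 650 = 109,386 trainable parameters
model.summary()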
Let's train the model and watch it learn! We'll start with a short training run to see the process.
# Train the model for 5 epochs
print("Starting training...")
start_time = datetime.now()
history = model.fit(
x_train_final, y_train_final,
epochs=5,
batch_size=32,
validation_data=(x_val, y_val),
verbose=1 # Show training progress
)
end_time = datetime.now()
training_time = (end_time - start_time).total_seconds()
print(f"\nTraining completed in {training_time:.2f} seconds!")
# Display final metrics
final_loss = history.history['loss'][-1]
final_acc = history.history['accuracy'][-1]
final_val_loss = history.history['val_loss'][-1]
final_val_acc = history.history['val_accuracy'][-1]
print(f"Final training accuracy: {final_acc:.4f}")
print(f"Final validation accuracy: {final_val_acc:.4f}")
Let's decode what happens during each epoch of training. An epoch is one complete pass over the training data, processed in mini-batches; after every batch the optimizer updates the weights, and at the end of the epoch Keras reports metrics on both the training and validation sets.
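As a quick check, you can compute how many weight updates one epoch performs (a small sketch using the variables defined earlier):

import math

# One epoch = one full pass over the training data, in mini-batches
batch_size = 32
steps_per_epoch = math.ceil(len(x_train_final) / batch_size)
# This matches the step counter shown in the Keras progress bar
print(f"Each epoch runs {steps_per_epoch} weight updates "
      f"({len(x_train_final)} samples / batch size {batch_size})")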
Graphs help us understand how well our model is learning.
# Plot training history
def plot_training_history(history):
    """Plot training and validation metrics"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Plot accuracy
    ax1.plot(history.history['accuracy'], 'b-', label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], 'r-', label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    ax1.grid(True)

    # Plot loss
    ax2.plot(history.history['loss'], 'b-', label='Training Loss')
    ax2.plot(history.history['val_loss'], 'r-', label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()
# Visualize our training
plot_training_history(history)
# Print improvement summary
initial_acc = history.history['accuracy'][0]
final_acc = history.history['accuracy'][-1]
improvement = final_acc - initial_acc
print(f"Accuracy improvement: {initial_acc:.4f} → {final_acc:.4f} (+{improvement:.4f})")
print(f"That's a gain of {improvement*100:.2f} percentage points!")
Let's see how our trained model performs on actual digit images.
# Test the trained model
def test_model_predictions(model, x_test, y_test, num_samples=8):
    """Test model predictions on random samples"""
    # Select random test samples
    test_indices = np.random.choice(len(x_test), num_samples, replace=False)
    plt.figure(figsize=(16, 8))
    for i, idx in enumerate(test_indices):
        # Get prediction
        image = x_test[idx:idx+1]
        prediction = model.predict(image, verbose=0)
        predicted_class = np.argmax(prediction[0])
        confidence = np.max(prediction[0])
        actual_class = y_test[idx]
        # Color-code the result: green for correct, red for incorrect
        is_correct = predicted_class == actual_class
        color = 'green' if is_correct else 'red'
        # Plot the image
        plt.subplot(2, num_samples, i+1)
        plt.imshow(x_test[idx].reshape(28, 28), cmap='gray')
        plt.title(f'True: {actual_class}', color='black')
        plt.axis('off')
        # Plot the prediction probabilities, highlighting the predicted digit
        plt.subplot(2, num_samples, i+1+num_samples)
        plt.bar(range(10), prediction[0],
                color=[color if j == predicted_class else 'lightblue' for j in range(10)])
        plt.title(f'Pred: {predicted_class} ({confidence:.2f})', color=color)
        plt.xlabel('Digit')
        plt.ylabel('Probability')
        plt.xticks(range(10))
        plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()
# Test our model
print("Testing trained model predictions:")
test_model_predictions(model, x_test_flat, y_test)
Let's train for more epochs to see further improvement.
# Continue training: run epochs 6 through 15 (10 more epochs)
print("Training for 10 more epochs...")
# Continue training from where we left off
extended_history = model.fit(
x_train_final, y_train_final,
epochs=15,  # target total epoch count, not 15 additional epochs
batch_size=32,
validation_data=(x_val, y_val),
verbose=1,
initial_epoch=5 # Continue from epoch 5
)
# Combine training histories
combined_history = {
'accuracy': history.history['accuracy'] + extended_history.history['accuracy'],
'val_accuracy': history.history['val_accuracy'] + extended_history.history['val_accuracy'],
'loss': history.history['loss'] + extended_history.history['loss'],
'val_loss': history.history['val_loss'] + extended_history.history['val_loss']
}
# Create a mock history object for plotting
class MockHistory:
    """Wrap a plain dict so it looks like a Keras History object for plotting"""
    def __init__(self, history_dict):
        self.history = history_dict
combined_mock_history = MockHistory(combined_history)
# Plot the complete training history
print("Complete training progress:")
plot_training_history(combined_mock_history)
# Evaluate on test set
test_loss, test_accuracy = model.evaluate(x_test_flat, y_test, verbose=0)
print(f"\nFinal Test Results:")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Test Loss: {test_loss:.4f}")
# Calculate improvement
total_improvement = combined_history['accuracy'][-1] - combined_history['accuracy'][0]
print(f"Total accuracy improvement: {total_improvement:.4f} ({total_improvement*100:.2f} percentage points)")
The optimizer determines how the model updates its weights after each batch. Plain SGD follows the raw gradient at a fixed rate, while adaptive methods like Adam and RMSprop scale each parameter's step using running statistics of past gradients; the sketch below shows the basic rule they all build on. Then let's compare different optimizers.
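A minimal illustration of that update rule (standalone toy numbers, not the lesson's model):

# Plain SGD update rule: w_new = w - learning_rate * gradient
w, gradient, learning_rate = 0.50, 0.20, 0.01
w_new = w - learning_rate * gradient
print(f"w: {w} -> {w_new}")  # 0.5 -> 0.498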
# Compare different optimizers
optimizers_to_test = {
'SGD': tf.keras.optimizers.SGD(learning_rate=0.01),
'Adam': tf.keras.optimizers.Adam(learning_rate=0.001),
'RMSprop': tf.keras.optimizers.RMSprop(learning_rate=0.001),
}
optimizer_results = {}
for name, optimizer in optimizers_to_test.items():
    print(f"\nTesting {name} optimizer...")
    # Create a fresh model
    test_model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    # Compile with different optimizer
    test_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    # Train for 5 epochs
    history = test_model.fit(
        x_train_final, y_train_final,
        epochs=5,
        batch_size=32,
        validation_data=(x_val, y_val),
        verbose=0  # Silent training
    )
    # Store results
    final_acc = history.history['val_accuracy'][-1]
    optimizer_results[name] = final_acc
    print(f"{name} final validation accuracy: {final_acc:.4f}")
# Compare results
print("\n=== OPTIMIZER COMPARISON ===")
for name, acc in sorted(optimizer_results.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {acc:.4f} ({acc*100:.2f}%)")
# Visualize comparison
plt.figure(figsize=(10, 6))
names = list(optimizer_results.keys())
accuracies = list(optimizer_results.values())
bars = plt.bar(names, accuracies, color=['blue', 'orange', 'green'])
plt.title('Optimizer Comparison (5 Epochs)')
plt.ylabel('Validation Accuracy')
plt.ylim(0, 1)
# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{acc:.3f}', ha='center', va='bottom')
plt.show()
The learning rate controls how large a step the optimizer takes at each weight update, and it significantly affects training performance. Let's test several values.
# Test different learning rates
learning_rates = [0.1, 0.01, 0.001, 0.0001]
lr_results = {}
print("Testing different learning rates...")
for lr in learning_rates:
    print(f"\nTesting learning rate: {lr}")
    # Create fresh model
    test_model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    # Compile with different learning rate
    test_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    # Train for 3 epochs (quick test)
    try:
        history = test_model.fit(
            x_train_final, y_train_final,
            epochs=3,
            batch_size=32,
            validation_data=(x_val, y_val),
            verbose=0
        )
        final_acc = history.history['val_accuracy'][-1]
        lr_results[lr] = final_acc
        print(f"Learning rate {lr}: {final_acc:.4f}")
    except Exception as e:
        print(f"Learning rate {lr}: Failed - {str(e)}")
        lr_results[lr] = 0.0
# Visualize learning rate effects
plt.figure(figsize=(10, 6))
lrs = list(lr_results.keys())
accs = list(lr_results.values())
plt.semilogx(lrs, accs, 'bo-', linewidth=2, markersize=8)
plt.title('Learning Rate Effect on Performance')
plt.xlabel('Learning Rate')
plt.ylabel('Validation Accuracy (3 epochs)')
plt.grid(True)
# Annotate points
for lr, acc in zip(lrs, accs):
    plt.annotate(f'{acc:.3f}', (lr, acc), textcoords="offset points", xytext=(0, 10), ha='center')
plt.show()
print("\nBest learning rate:", max(lr_results.items(), key=lambda x: x[1]))
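A fixed learning rate is not the only option. Keras also offers schedules that decay the rate as training progresses; here is a minimal sketch (the specific decay values are just an illustration):

# lr = 0.001 * 0.96^(step / 1000): the rate decays smoothly during training
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    decay_rate=0.96
)
scheduled_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Pass scheduled_optimizer to model.compile() just like a fixed-rate optimizer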
Time to experiment with training parameters!
Create your own training setup: choose an architecture, an optimizer, a loss function, and training parameters, then complete the skeleton below.
# Your solution here
def create_and_train_custom_model():
    """Create and train a custom model"""
    # Build model
    model = tf.keras.Sequential([
        # Your architecture here
    ])
    # Compile model
    model.compile(
        # Your compilation settings here
    )
    # Train model
    history = model.fit(
        # Your training parameters here
    )
    # Evaluate model
    test_loss, test_acc = model.evaluate(x_test_flat, y_test, verbose=0)
    return model, history, test_acc
# Test your implementation
custom_model, custom_history, custom_test_acc = create_and_train_custom_model()
print(f"Your model test accuracy: {custom_test_acc:.4f}")
# Plot your results
plot_training_history(custom_history)
Implement early stopping to prevent overfitting:
# Implement early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=3,
restore_best_weights=True
)
# Your model with early stopping
model_with_es = tf.keras.Sequential([
# Your model architecture
])
model_with_es.compile(
# Your compilation settings
)
# Train with early stopping
history_es = model_with_es.fit(
x_train_final, y_train_final,
epochs=50, # Large number, but will stop early
batch_size=32,
validation_data=(x_val, y_val),
callbacks=[early_stopping],
verbose=0
)
print(f"Training stopped at epoch: {len(history_es.history['loss'])}")
print(f"Best validation accuracy: {max(history_es.history['val_accuracy']):.4f}")
# Plot early stopping results
plot_training_history(history_es)
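Early stopping is just one Keras callback. Another commonly paired one is ModelCheckpoint, which saves the best weights to disk as training runs; a small sketch (the filename is only an example):

# Save the model whenever validation accuracy improves (example path)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_mnist_model.keras',
    monitor='val_accuracy',
    save_best_only=True
)
# Use it alongside early stopping: callbacks=[early_stopping, checkpoint]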
Excellent work! You've successfully trained your first neural network. Here's what you've accomplished:
- Built, compiled, and trained a dense network on MNIST
- Visualized training and validation curves to track learning
- Compared SGD, Adam, and RMSprop optimizers
- Explored how the learning rate affects performance
- Used early stopping to guard against overfitting
Your model is now trained and performing well! In the next lesson, we'll dive deeper into evaluating model performance and techniques to improve accuracy even further.