
Model Evaluation & Improvement

Analyze performance and implement techniques to boost accuracy

45–60 minutes · Intermediate Level · Advanced Techniques

Beyond Basic Accuracy

Accuracy is only one number, and it can hide systematic failures on particular classes. To truly understand your model's performance, you need a comprehensive evaluation: multiple metrics, confusion matrices, and error analysis.

Key Evaluation Metrics

  • Accuracy: overall fraction of correct predictions
  • Precision: of the predicted positives, how many were actually positive, i.e. TP / (TP + FP)
  • Recall: of the actual positives, how many were correctly identified, i.e. TP / (TP + FN)
  • F1-Score: harmonic mean of precision and recall, 2PR / (P + R)
  • Confusion Matrix: per-class breakdown of predicted vs. true labels (a small worked example follows this list)
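
To make these concrete before we train anything, here is a tiny sketch on hand-made binary labels (the two arrays are illustrative values, not real model output):

from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, confusion_matrix)

# Hand-made binary labels, purely for illustration
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]   # one false negative, one false positive

print(f"Accuracy:  {accuracy_score(y_true, y_pred):.3f}")   # 6/8 correct
print(f"Precision: {precision_score(y_true, y_pred):.3f}")  # TP / (TP + FP) = 3/4
print(f"Recall:    {recall_score(y_true, y_pred):.3f}")     # TP / (TP + FN) = 3/4
print(f"F1-Score:  {f1_score(y_true, y_pred):.3f}")         # harmonic mean = 0.75
print(confusion_matrix(y_true, y_pred))  # rows = true class, columns = predicted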

Setting Up for Evaluation

Let's start by training a model and then evaluating it comprehensively.

🚀 Model Setup and Training

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train_flat = x_train.reshape(-1, 784)
x_test_flat = x_test.reshape(-1, 784)

# Create validation split
val_size = int(len(x_train_flat) * 0.2)
x_val = x_train_flat[:val_size]
y_val = y_train[:val_size]
x_train_final = x_train_flat[val_size:]
y_train_final = y_train[val_size:]

# Build and train a model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dropout(0.2),  # Add dropout for regularization
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
print("Training model with dropout regularization...")
history = model.fit(
    x_train_final, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=1
)

print("Training completed!")
Expected: Model trains with ~98% validation accuracy

Comprehensive Model Evaluation

Let's evaluate our model using multiple metrics and visualizations.

# Get predictions for test set
print("Evaluating model performance...")
test_loss, test_accuracy = model.evaluate(x_test_flat, y_test, verbose=0)
y_pred_probs = model.predict(x_test_flat, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Test Loss: {test_loss:.4f}")

# Calculate detailed metrics
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)

print("\n=== PER-CLASS METRICS ===")
print("Class | Precision | Recall | F1-Score | Support")
print("------|-----------|--------|----------|--------")
for i in range(10):
    print(f"  {i}   |   {precision[i]:.3f}   | {recall[i]:.3f} |  {f1[i]:.3f}  |  {support[i]:4d}")

# Overall metrics
overall_precision = np.average(precision, weights=support)
overall_recall = np.average(recall, weights=support)
overall_f1 = np.average(f1, weights=support)

print(f"\n=== OVERALL METRICS ===")
print(f"Weighted Precision: {overall_precision:.4f}")
print(f"Weighted Recall: {overall_recall:.4f}")
print(f"Weighted F1-Score: {overall_f1:.4f}")
print(f"Accuracy: {test_accuracy:.4f}")

Confusion Matrix Analysis

The confusion matrix shows exactly which digits your model confuses with each other.

# Create and visualize confusion matrix
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Plot a beautiful confusion matrix"""
    cm = confusion_matrix(y_true, y_pred)
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    
    return cm

# Plot confusion matrix
class_names = [str(i) for i in range(10)]
cm = plot_confusion_matrix(y_test, y_pred, class_names)

# Analyze confusion matrix
print("=== CONFUSION MATRIX ANALYSIS ===")
print("Most confused digit pairs:")

# Find the biggest off-diagonal values (misclassifications)
misclassifications = []
for i in range(10):
    for j in range(10):
        if i != j and cm[i, j] > 0:
            misclassifications.append((cm[i, j], i, j))

# Sort by number of misclassifications
misclassifications.sort(reverse=True)

for count, true_label, pred_label in misclassifications[:5]:
    print(f"  {true_label} predicted as {pred_label}: {count} times")

# Per-class accuracy: diagonal / row sums (equivalent to per-class recall)
per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
print(f"\n=== PER-CLASS ACCURACY ===")
for i, acc in enumerate(per_class_accuracy):
    print(f"Digit {i}: {acc:.3f} ({acc*100:.1f}%)")

Error Analysis

Let's examine the images that our model gets wrong to understand its limitations.

# Find misclassified examples
def analyze_errors(x_test, y_test, y_pred, y_pred_probs, num_examples=12):
    """Analyze and visualize misclassified examples"""
    # Find incorrect predictions
    incorrect_indices = np.where(y_test != y_pred)[0]
    
    # Sort by confidence (most confident wrong predictions first)
    confidences = np.max(y_pred_probs[incorrect_indices], axis=1)
    sorted_indices = incorrect_indices[np.argsort(confidences)[::-1]]
    
    plt.figure(figsize=(16, 8))
    
    for i in range(min(num_examples, len(sorted_indices))):
        idx = sorted_indices[i]
        image = x_test.reshape(-1, 28, 28)[idx]
        true_label = y_test[idx]
        pred_label = y_pred[idx]
        confidence = np.max(y_pred_probs[idx])
        
        plt.subplot(3, 4, i+1)
        plt.imshow(image, cmap='gray')
        plt.title(f'True: {true_label}, Pred: {pred_label}\nConf: {confidence:.3f}', 
                 color='red')
        plt.axis('off')
    
    plt.suptitle('Most Confident Wrong Predictions', fontsize=16)
    plt.tight_layout()
    plt.show()
    
    return sorted_indices[:num_examples]

print("Analyzing model errors...")
error_indices = analyze_errors(x_test, y_test, y_pred, y_pred_probs)

# Analyze error patterns across ALL misclassified examples,
# not just the twelve visualized above
print("\n=== ERROR PATTERNS ===")
all_errors = np.where(y_test != y_pred)[0]
error_true_labels = y_test[all_errors]
error_pred_labels = y_pred[all_errors]

print("Common error patterns:")
for true_label in range(10):
    pred_errors = error_pred_labels[error_true_labels == true_label]
    if len(pred_errors) > 0:
        most_common_error = np.bincount(pred_errors).argmax()
        error_count = np.sum(pred_errors == most_common_error)
        print(f"  Digit {true_label} → most often predicted as {most_common_error} ({error_count} times)")

Model Improvement Techniques

Now let's implement several techniques to improve our model's performance.

1. Dropout Regularization

# Compare models with and without dropout
def create_model_with_dropout(dropout_rate=0.3):
    """Create model with dropout regularization"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model

def create_model_without_dropout():
    """Create model without dropout"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model

# Test different dropout rates
dropout_rates = [0.0, 0.2, 0.3, 0.5]
dropout_results = {}

print("Testing different dropout rates...")
for rate in dropout_rates:
    print(f"\nTesting dropout rate: {rate}")
    
    if rate == 0.0:
        test_model = create_model_without_dropout()
    else:
        test_model = create_model_with_dropout(rate)
    
    test_model.compile(optimizer='adam', 
                      loss='sparse_categorical_crossentropy', 
                      metrics=['accuracy'])
    
    # Train for fewer epochs for comparison; use a local name so we
    # don't overwrite the original model's `history` (reused later)
    hist = test_model.fit(x_train_final, y_train_final,
                          epochs=10, batch_size=32,
                          validation_data=(x_val, y_val),
                          verbose=0)
    
    val_acc = max(hist.history['val_accuracy'])
    dropout_results[rate] = val_acc
    print(f"Best validation accuracy: {val_acc:.4f}")

# Visualize dropout effects
plt.figure(figsize=(10, 6))
rates = list(dropout_results.keys())
accs = list(dropout_results.values())

plt.plot(rates, accs, 'bo-', linewidth=2, markersize=8)
plt.title('Dropout Rate vs Validation Accuracy')
plt.xlabel('Dropout Rate')
plt.ylabel('Best Validation Accuracy')
plt.grid(True)

for rate, acc in zip(rates, accs):
    plt.annotate(f'{acc:.3f}', (rate, acc), textcoords="offset points", 
                xytext=(0,10), ha='center')

plt.show()

print(f"Best dropout rate: {max(dropout_results, key=dropout_results.get)}")

2. Learning Rate Scheduling

# Implement learning rate scheduling
def create_model_with_lr_schedule():
    """Create model with learning rate scheduling"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    
    # Callback that halves the learning rate whenever val_loss
    # stops improving for 3 epochs (adaptive LR reduction)
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=0.0001,
        verbose=1
    )
    
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model, lr_callback

print("Testing learning rate scheduling...")
model_lr, lr_callback = create_model_with_lr_schedule()

history_lr = model_lr.fit(
    x_train_final, y_train_final,
    epochs=20,
    batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=[lr_callback],
    verbose=0
)

print(f"Final validation accuracy with LR scheduling: {max(history_lr.history['val_accuracy']):.4f}")

# Plot learning rate scheduling effects
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history_lr.history['accuracy'], label='Training')
plt.plot(history_lr.history['val_accuracy'], label='Validation')
plt.title('Accuracy with LR Scheduling')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(history_lr.history['loss'], label='Training')
plt.plot(history_lr.history['val_loss'], label='Validation')
plt.title('Loss with LR Scheduling')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()
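
ReduceLROnPlateau adapts the learning rate reactively, based on the validation loss. Keras also supports fixed, predetermined schedules passed straight to the optimizer. Here is a minimal sketch using ExponentialDecay; the decay numbers are illustrative assumptions, not tuned values:

# Fixed exponential decay: lr = 0.001 * 0.9^(step / 1000)
fixed_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,  # Adam's default starting rate
    decay_steps=1000,             # one decay interval = 1000 optimizer steps
    decay_rate=0.9                # shrink the rate by 10% per interval
)

# Pass the schedule object wherever a learning rate would normally go,
# then compile any of the models above with this optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=fixed_schedule)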

3. Batch Normalization

# Add batch normalization
def create_model_with_batch_norm():
    """Create model with batch normalization"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(784,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Dense(64),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model

print("Testing batch normalization...")
model_bn = create_model_with_batch_norm()
model_bn.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

history_bn = model_bn.fit(
    x_train_final, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=0
)

print(f"Final validation accuracy with Batch Norm: {max(history_bn.history['val_accuracy']):.4f}")

# Compare training curves with and without batch normalization
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['val_accuracy'], label='Original Model')
plt.plot(history_bn.history['val_accuracy'], label='With Batch Norm')
plt.title('Validation Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(history.history['val_loss'], label='Original Model')
plt.plot(history_bn.history['val_loss'], label='With Batch Norm')
plt.title('Validation Loss Comparison')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

Hands-On Exercise

Time to build your own improved model!

Exercise 1: Ultimate MNIST Model

Create the best possible model using techniques from this lesson:

  • Use optimal dropout rates from your testing
  • Include batch normalization
  • Add learning rate scheduling
  • Implement early stopping (see the callback sketch after this list)
  • Target >98.5% test accuracy
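
Early stopping hasn't appeared in code yet in this lesson, so here is a minimal sketch of the Keras callback you might include; the patience value is an assumption you should tune:

# Stop training once val_loss has not improved for 5 consecutive epochs,
# then roll the weights back to the best epoch seen
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,                    # epochs to wait before stopping
    restore_best_weights=True      # keep the best checkpoint, not the last
)
# e.g. combine with the LR callback: callbacks = [early_stop, lr_callback]
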
# Your ultimate model
def create_ultimate_mnist_model():
    """Create the best possible MNIST model"""
    model = tf.keras.Sequential([
        # Your architecture here - use what you learned!
    ])
    
    # Compile with optimal settings
    model.compile(
        # Your compilation settings
    )
    
    # Define callbacks
    callbacks = [
        # Your callbacks here
    ]
    
    return model, callbacks

# Train your ultimate model
ultimate_model, callbacks = create_ultimate_mnist_model()

ultimate_history = ultimate_model.fit(
    x_train_final, y_train_final,
    epochs=30,
    batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
    verbose=1
)

# Evaluate your ultimate model
test_loss, test_acc = ultimate_model.evaluate(x_test_flat, y_test, verbose=0)
print(f"\nYour Ultimate Model Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")

# Did you beat 98.5%?
if test_acc > 0.985:
    print("🎉 Congratulations! You achieved >98.5% accuracy!")
else:
    print("Keep experimenting - you can do better!")

Exercise 2: Error Analysis Deep Dive

Analyze your best model's errors in detail:

# Detailed error analysis
def detailed_error_analysis(model, x_test, y_test):
    """Perform comprehensive error analysis"""
    
    # Get predictions (x_test here should be the flattened test inputs)
    y_pred_probs = model.predict(x_test, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)
    
    # Find errors
    errors = y_test != y_pred
    error_indices = np.where(errors)[0]
    
    print(f"Total errors: {len(error_indices)} out of {len(y_test)}")
    print(f"Error rate: {len(error_indices)/len(y_test)*100:.2f}%")
    
    # Analyze by digit
    print("\nErrors by digit:")
    for digit in range(10):
        digit_mask = y_test == digit
        digit_errors = np.sum(errors & digit_mask)
        digit_total = np.sum(digit_mask)
        error_rate = digit_errors / digit_total * 100
        print(f"  Digit {digit}: {digit_errors}/{digit_total} errors ({error_rate:.1f}%)")
    
    # Your analysis code here
    return error_indices

# Analyze your ultimate model (pass the flattened test images)
error_indices = detailed_error_analysis(ultimate_model, x_test_flat, y_test)

Improvement Strategies

  • Regularization: Dropout and L1/L2 weight penalties to prevent overfitting (see the L2 sketch after this list)
  • Normalization: Batch normalization for faster, more stable training
  • Learning Rate: Scheduling and adaptive learning rates
  • Architecture: Experiment with different layer sizes and depths
  • Data Augmentation: Generate more training examples artificially
  • Ensemble Methods: Combine multiple models for better performance
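
L1/L2 weight penalties are the one item above that hasn't appeared in code yet. A minimal L2 sketch, where the 0.001 penalty strength is an illustrative assumption rather than a tuned value:

from tensorflow.keras import regularizers

# Dense layer whose loss gains a 0.001 * sum(w^2) penalty term,
# discouraging large weights and thus overfitting
l2_dense = tf.keras.layers.Dense(
    128, activation='relu',
    kernel_regularizer=regularizers.l2(0.001)
)
# Swap this in for any plain Dense layer in the models above.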

What's Next?

Outstanding work! You've mastered model evaluation and improvement. Here's what you've accomplished:

  • Comprehensive Evaluation: Used multiple metrics beyond just accuracy
  • Confusion Matrix Analysis: Understood exactly where your model fails
  • Error Analysis: Examined misclassified examples to understand limitations
  • Regularization Techniques: Implemented dropout and batch normalization
  • Advanced Training: Used learning rate scheduling and early stopping
  • Model Optimization: Built a high-performance MNIST classifier

In our final lesson, we'll explore Convolutional Neural Networks (CNNs) and learn how to deploy your trained models for real-world use!