Analyze performance and implement techniques to boost accuracy
While accuracy is important, it's just one metric. To truly understand your model's performance, you need a comprehensive evaluation using multiple metrics, confusion matrices, and error analysis.
Let's start by training a model and then evaluating it comprehensively.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
# Load and preprocess data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train_flat = x_train.reshape(-1, 784)
x_test_flat = x_test.reshape(-1, 784)
# Create validation split
val_size = int(len(x_train_flat) * 0.2)
x_val = x_train_flat[:val_size]
y_val = y_train[:val_size]
x_train_final = x_train_flat[val_size:]
y_train_final = y_train[val_size:]
# Build and train a model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dropout(0.2),  # Add dropout for regularization
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# Train the model
print("Training model with dropout regularization...")
history = model.fit(
    x_train_final, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=1
)
print("Training completed!")
Let's evaluate our model using multiple metrics and visualizations.
# Get predictions for test set
print("Evaluating model performance...")
test_loss, test_accuracy = model.evaluate(x_test_flat, y_test, verbose=0)
y_pred_probs = model.predict(x_test_flat, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Test Loss: {test_loss:.4f}")
# Calculate detailed metrics
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)
print("\n=== PER-CLASS METRICS ===")
print("Class | Precision | Recall | F1-Score | Support")
print("------|-----------|--------|----------|--------")
for i in range(10):
    print(f"{i:^5} | {precision[i]:^9.3f} | {recall[i]:^6.3f} | {f1[i]:^8.3f} | {support[i]:^7d}")
# Overall metrics
overall_precision = np.average(precision, weights=support)
overall_recall = np.average(recall, weights=support)
overall_f1 = np.average(f1, weights=support)
print(f"\n=== OVERALL METRICS ===")
print(f"Weighted Precision: {overall_precision:.4f}")
print(f"Weighted Recall: {overall_recall:.4f}")
print(f"Weighted F1-Score: {overall_f1:.4f}")
print(f"Accuracy: {test_accuracy:.4f}")
The confusion matrix shows exactly which digits your model confuses with each other.
# Create and visualize confusion matrix
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Plot a confusion matrix as a labeled heatmap"""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    return cm
# Plot confusion matrix
class_names = [str(i) for i in range(10)]
cm = plot_confusion_matrix(y_test, y_pred, class_names)
# Analyze confusion matrix
print("=== CONFUSION MATRIX ANALYSIS ===")
print("Most confused digit pairs:")
# Find the biggest off-diagonal values (misclassifications)
misclassifications = []
for i in range(10):
    for j in range(10):
        if i != j and cm[i, j] > 0:
            misclassifications.append((cm[i, j], i, j))
# Sort by number of misclassifications
misclassifications.sort(reverse=True)
for count, true_label, pred_label in misclassifications[:5]:
    print(f"  {true_label} predicted as {pred_label}: {count} times")
# Calculate per-class accuracy
per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
print(f"\n=== PER-CLASS ACCURACY ===")
for i, acc in enumerate(per_class_accuracy):
    print(f"Digit {i}: {acc:.3f} ({acc*100:.1f}%)")
Let's examine the images that our model gets wrong to understand its limitations.
# Find misclassified examples
def analyze_errors(x_test, y_test, y_pred, y_pred_probs, num_examples=12):
    """Analyze and visualize misclassified examples"""
    # Find incorrect predictions
    incorrect_indices = np.where(y_test != y_pred)[0]
    # Sort by confidence (most confident wrong predictions first)
    confidences = np.max(y_pred_probs[incorrect_indices], axis=1)
    sorted_indices = incorrect_indices[np.argsort(confidences)[::-1]]
    plt.figure(figsize=(16, 8))
    for i in range(min(num_examples, len(sorted_indices))):
        idx = sorted_indices[i]
        image = x_test.reshape(-1, 28, 28)[idx]
        true_label = y_test[idx]
        pred_label = y_pred[idx]
        confidence = np.max(y_pred_probs[idx])
        plt.subplot(3, 4, i+1)
        plt.imshow(image, cmap='gray')
        plt.title(f'True: {true_label}, Pred: {pred_label}\nConf: {confidence:.3f}',
                  color='red')
        plt.axis('off')
    plt.suptitle('Most Confident Wrong Predictions', fontsize=16)
    plt.tight_layout()
    plt.show()
    return sorted_indices[:num_examples]
print("Analyzing model errors...")
error_indices = analyze_errors(x_test, y_test, y_pred, y_pred_probs)
# Analyze error patterns across ALL misclassified examples,
# not just the few visualized above
all_error_indices = np.where(y_test != y_pred)[0]
error_true_labels = y_test[all_error_indices]
error_pred_labels = y_pred[all_error_indices]
print("\n=== ERROR PATTERNS ===")
print("Common error patterns:")
for true_label in range(10):
    pred_errors = error_pred_labels[error_true_labels == true_label]
    if len(pred_errors) > 0:
        most_common_error = np.bincount(pred_errors).argmax()
        error_count = np.sum(pred_errors == most_common_error)
        print(f"  Digit {true_label} → most often predicted as {most_common_error} ({error_count} times)")
Now let's implement several techniques to improve our model's performance.
# Compare models with and without dropout
def create_model_with_dropout(dropout_rate=0.3):
    """Create model with dropout regularization"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model

def create_model_without_dropout():
    """Create model without dropout"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model
# Test different dropout rates
dropout_rates = [0.0, 0.2, 0.3, 0.5]
dropout_results = {}
print("Testing different dropout rates...")
for rate in dropout_rates:
    print(f"\nTesting dropout rate: {rate}")
    if rate == 0.0:
        test_model = create_model_without_dropout()
    else:
        test_model = create_model_with_dropout(rate)
    test_model.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])
    # Train for fewer epochs for comparison; use a separate name so we
    # don't overwrite the original model's `history` (it's compared later)
    history_drop = test_model.fit(x_train_final, y_train_final,
                                  epochs=10, batch_size=32,
                                  validation_data=(x_val, y_val),
                                  verbose=0)
    val_acc = max(history_drop.history['val_accuracy'])
    dropout_results[rate] = val_acc
    print(f"Best validation accuracy: {val_acc:.4f}")
# Visualize dropout effects
plt.figure(figsize=(10, 6))
rates = list(dropout_results.keys())
accs = list(dropout_results.values())
plt.plot(rates, accs, 'bo-', linewidth=2, markersize=8)
plt.title('Dropout Rate vs Validation Accuracy')
plt.xlabel('Dropout Rate')
plt.ylabel('Best Validation Accuracy')
plt.grid(True)
for rate, acc in zip(rates, accs):
    plt.annotate(f'{acc:.3f}', (rate, acc), textcoords="offset points",
                 xytext=(0, 10), ha='center')
plt.show()
print(f"Best dropout rate: {max(dropout_results, key=dropout_results.get)}")
# Implement learning rate scheduling
def create_model_with_lr_schedule():
    """Create model with learning rate scheduling"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    # Callback that halves the learning rate whenever validation loss plateaus
    lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=0.0001,
        verbose=1
    )
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model, lr_schedule
print("Testing learning rate scheduling...")
model_lr, lr_callback = create_model_with_lr_schedule()
history_lr = model_lr.fit(
    x_train_final, y_train_final,
    epochs=20,
    batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=[lr_callback],
    verbose=0
)
print(f"Final validation accuracy with LR scheduling: {max(history_lr.history['val_accuracy']):.4f}")
# Plot learning rate scheduling effects
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history_lr.history['accuracy'], label='Training')
plt.plot(history_lr.history['val_accuracy'], label='Validation')
plt.title('Accuracy with LR Scheduling')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(history_lr.history['loss'], label='Training')
plt.plot(history_lr.history['val_loss'], label='Validation')
plt.title('Loss with LR Scheduling')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
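ReduceLROnPlateau reacts to the validation loss as training runs; Keras also supports schedules fixed up front and passed straight to the optimizer. A minimal sketch using ExponentialDecay (the decay numbers are illustrative, not tuned):
# Alternative: a fixed exponential decay schedule attached to the optimizer
lr_decay = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,  # Adam's default starting LR
    decay_steps=1500,             # about one epoch at batch_size=32 here
    decay_rate=0.9                # multiply the LR by 0.9 every decay_steps
)
optimizer_with_decay = tf.keras.optimizers.Adam(learning_rate=lr_decay)
# Pass it to compile: model.compile(optimizer=optimizer_with_decay, ...)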
# Add batch normalization
def create_model_with_batch_norm():
    """Create model with batch normalization"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(784,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    return model
print("Testing batch normalization...")
model_bn = create_model_with_batch_norm()
model_bn.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
history_bn = model_bn.fit(
    x_train_final, y_train_final,
    epochs=15,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=0
)
print(f"Final validation accuracy with Batch Norm: {max(history_bn.history['val_accuracy']):.4f}")
# Compare training curves (batch norm typically speeds up convergence)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['val_accuracy'], label='Original Model')
plt.plot(history_bn.history['val_accuracy'], label='With Batch Norm')
plt.title('Validation Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(history.history['val_loss'], label='Original Model')
plt.plot(history_bn.history['val_loss'], label='With Batch Norm')
plt.title('Validation Loss Comparison')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
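The exercise below asks you to choose callbacks; alongside ReduceLROnPlateau, EarlyStopping is worth knowing. It halts training once the monitored metric stops improving and can restore the best weights seen. A minimal sketch (the patience value is a starting point, not a tuned choice):
# Stop when val_loss hasn't improved for `patience` epochs and keep the best weights
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)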
Time to build your own improved model!
Create the best possible model using techniques from this lesson (a runnable baseline is sketched below; replace any piece with your own ideas):
# Your ultimate model
def create_ultimate_mnist_model():
    """Create the best possible MNIST model"""
    # Baseline architecture: Dense -> BatchNorm -> ReLU -> Dropout blocks.
    # This is one reasonable starting point, not the only answer!
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(784,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    # Compile with sensible settings
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    # Define callbacks: adaptive learning rate plus early stopping
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                             patience=3, min_lr=1e-4),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6,
                                         restore_best_weights=True)
    ]
    return model, callbacks
# Train your ultimate model
ultimate_model, callbacks = create_ultimate_mnist_model()
ultimate_history = ultimate_model.fit(
    x_train_final, y_train_final,
    epochs=30,
    batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
    verbose=1
)
# Evaluate your ultimate model
test_loss, test_acc = ultimate_model.evaluate(x_test_flat, y_test, verbose=0)
print(f"\nYour Ultimate Model Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
# Did you beat 98.5%?
if test_acc > 0.985:
    print("🎉 Congratulations! You achieved >98.5% accuracy!")
else:
    print("Keep experimenting - you can do better!")
Analyze your best model's errors in detail:
# Detailed error analysis
def detailed_error_analysis(model, x_test_flat, y_test):
    """Perform comprehensive error analysis"""
    # Get predictions from the flattened test set passed in
    # (not from a global variable)
    y_pred_probs = model.predict(x_test_flat, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)
    # Find errors
    errors = y_test != y_pred
    error_indices = np.where(errors)[0]
    print(f"Total errors: {len(error_indices)} out of {len(y_test)}")
    print(f"Error rate: {len(error_indices)/len(y_test)*100:.2f}%")
    # Analyze by digit
    print("\nErrors by digit:")
    for digit in range(10):
        digit_mask = y_test == digit
        digit_errors = np.sum(errors & digit_mask)
        digit_total = np.sum(digit_mask)
        error_rate = digit_errors / digit_total * 100
        print(f"  Digit {digit}: {digit_errors}/{digit_total} errors ({error_rate:.1f}%)")
    # Your analysis code here
    return error_indices
# Analyze your ultimate model (pass the flattened test set to match the signature)
error_indices = detailed_error_analysis(ultimate_model, x_test_flat, y_test)
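Before moving on, consider saving your best model so the deployment work in the next lesson can load it; a minimal sketch (the filename is just an example):
# Persist the trained model; reload it later with tf.keras.models.load_model
ultimate_model.save('ultimate_mnist_model.keras')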
Outstanding work! You've mastered model evaluation and improvement. Here's what you've accomplished: you evaluated a classifier with precision, recall, and F1-score instead of accuracy alone; read a confusion matrix to see exactly which digit pairs get confused; inspected the model's most confident wrong predictions; and compared dropout, learning rate scheduling, and batch normalization as techniques for boosting accuracy.
In our final lesson, we'll explore Convolutional Neural Networks (CNNs) and learn how to deploy your trained models for real-world use!