CIFAR-10 Image Classification with Custom Neural Network Architecture
Author: Rashad Malik
This notebook contains the code for my implementation of a CIFAR-10 image classification task. It is divided into 4 sections:
- Dataset
- Basic architecture
- Training and testing
- Improving the results
1 Dataset
In this section, we load and preprocess the CIFAR-10 dataset using PyTorch's built-in torchvision module. The dataset consists of 60,000 32x32 colour images across 10 different classes.
We apply standard transformations to convert images to tensors and normalise their pixel values to the range [-1, 1]. We then create DataLoader objects for both the training and test sets, enabling efficient mini-batch processing during training and evaluation.
# Libraries used for section 1
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
# Checking that we are utilising a GPU in the current environment
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
cuda
Below, we define a reusable function load_data_cifar10, which downloads the CIFAR-10 dataset and prepares it for training and testing:
- It applies a series of transformations, converting the images to tensors and normalising them to the range [-1, 1] per channel. The function also supports resizing the images, although this is not required for this project.
- It returns two DataLoader objects (train_iter and test_iter), which provide batches of data during training and evaluation. These loaders are optimised for GPU usage by enabling pinned memory and using multiple worker threads for faster data loading.
# Downloading the CIFAR-10 dataset and loading it into memory
def load_data_cifar10(batch_size, resize=None):
trans = [transforms.ToTensor()]
if resize:
trans.insert(0, transforms.Resize(resize))
trans.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)))
trans = transforms.Compose(trans)
cifar10_train = torchvision.datasets.CIFAR10(
root="./data", train=True, transform=trans, download=True)
cifar10_test = torchvision.datasets.CIFAR10(
root="./data", train=False, transform=trans, download=True)
return (torch.utils.data.DataLoader(cifar10_train, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True),
torch.utils.data.DataLoader(cifar10_test, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True))
batch_size = 128 # Defines the batch size.
train_iter, test_iter = load_data_cifar10(batch_size) # Loads the CIFAR-10 dataset. `train_iter` and `test_iter` are `DataLoader` objects.
Files already downloaded and verified
Files already downloaded and verified
# Checking the shape of our tensor outputs from the dataset
X, y = next(iter(train_iter)) # Requests the first training batch
print(X.size()) # 128 images per batch. Each image is represented by a 3 x 32 x 32 tensor (number of channels x height x width). The images are RGB, so there are three channels.
print(y.size()) # 128 targets. Each target is a number between 0 and 9. The classification problem has 10 classes.
print(y[:10]) # Viewing the elements within a target tensor, which should have 10 elements between 0-9.
torch.Size([128, 3, 32, 32])
torch.Size([128])
tensor([9, 5, 2, 7, 1, 1, 0, 7, 6, 1])
To check that the dataset has been loaded and transformed correctly, we can retrieve a single batch of images from the training set and display the first five images along with their corresponding class labels.
Each image tensor is converted from PyTorch's channel-first format [C, H, W] to the standard image format [H, W, C] for display. The pixel values are then un-normalised back to the [0, 1] range expected by matplotlib for floating-point images. Finally, the images are shown using matplotlib, with their class names taken from a predefined list of CIFAR-10 labels.
# CIFAR-10 class labels
class_labels = ['airplane', 'automobile', 'bird', 'cat', 'deer',
'dog', 'frog', 'horse', 'ship', 'truck']
# Setting up the figure size
plt.figure(figsize=(12, 4))
# Looping over the first 5 images in the batch
for i in range(5):
img = X[i].numpy().transpose(1, 2, 0) # Convert from [C, H, W] to [H, W, C]
# Un-normalise the image
img = img * 0.5 + 0.5
# Displaying images using matplotlib
plt.subplot(1, 5, i + 1)
plt.imshow(img)
plt.title(f"{class_labels[int(y[i])]}")
plt.axis('off')
plt.show()
The dataset has been successfully loaded, and we can now proceed to the next section.
2 Basic architecture
In this section, we implement a custom neural network architecture for CIFAR-10 classification.
- The model is composed of multiple intermediate blocks, where each block applies several parallel convolutional layers to the same input and combines their outputs using a learned set of weights.
- These weights are dynamically generated by a fully connected layer based on the channel-wise mean of the input.
- After passing through the sequence of intermediate blocks, the output is fed into a final output block, which performs global average pooling followed by fully connected layers to produce class logits for CIFAR-10 classification.
Each component is implemented as a separate PyTorch module for clarity and modularity.
# Libraries used for section 2
import torch
import torch.nn as nn
import torch.nn.functional as F
2.1 Intermediate block
This class defines an intermediate block used in the custom network architecture.
- Each block applies L independent convolutional layers in parallel to the same input tensor. The input is also globally averaged across spatial dimensions to produce a channel-wise mean vector, which is passed through a fully connected layer to generate L × C_out weights.
- These weights are reshaped and normalised using softmax, and then used to compute a weighted sum of the convolutional outputs.
The result is a single feature map that adaptively combines the responses of all convolutional paths based on the input, enabling dynamic feature selection within each block.
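To make this concrete, the block's computation can be written out as follows (our notation, matching the code: $C_l$ denotes the $l$-th convolution, $m$ the channel-wise mean of the input, and $W_{fc}$ the fully connected layer):
$$
m = \frac{1}{HW}\sum_{h=1}^{H}\sum_{w=1}^{W} x_{:,\,h,\,w}, \qquad
a = \operatorname{softmax}_{l}\!\left(\operatorname{reshape}\big(W_{fc}\, m\big)\right), \qquad
\text{out} = \sum_{l=1}^{L} a_{l} \odot C_{l}(x)
$$
Here the softmax is taken over the $L$ parallel paths independently for each output channel, and each per-channel weight $a_l$ is broadcast over the spatial dimensions.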
class Block(nn.Module):
def __init__(self, in_channels, out_channels, L=4):
super().__init__()
# L represents the number of independent convolutional layers
self.L = L
self.out_channels = out_channels
# Defining L independent convolutional layers
self.convs = nn.ModuleList()
for i in range(L):
self.convs.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
# Fully connected layer that takes channel-wise averages of x as input and outputs L * C_out weights
self.fc = nn.Linear(in_channels, L * out_channels)
def forward(self, x):
# Batch size, channels, height, width
B, C_in, H, W = x.shape
# Applying each convolutional layer independently to x
conv_outputs = []
for i in range(self.L):
Ci_x = self.convs[i](x)
conv_outputs.append(Ci_x)
# Computing the channel-wise mean of x
m = x.mean(dim=(2, 3))
# Computing weights from m using the fully connected layer
a = self.fc(m)
a = a.view(B, self.L, self.out_channels)
# Normalising weights with softmax over the L dimension
a = F.softmax(a, dim=1)
# Stacking convolutional outputs and applying weights
conv_stack = torch.stack(conv_outputs, dim=1)
# Un-squeezing weights to match output dimensions
a = a.unsqueeze(-1).unsqueeze(-1)
# Weighted sum
out = (a * conv_stack).sum(dim=1)
return out
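As a quick sanity check (not part of the original notebook), a single Block can be probed with a dummy batch to confirm that it preserves the spatial dimensions and produces the requested number of output channels:
# Hypothetical sanity check for a single Block
block = Block(in_channels=3, out_channels=32, L=4)
dummy = torch.randn(8, 3, 32, 32)  # batch of 8 fake RGB images, CIFAR-10 sized
print(block(dummy).shape)  # expected: torch.Size([8, 32, 32, 32])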
2.2 Output block
The OutputBlock is responsible for producing the classification output.
- It first performs global average pooling on the input feature map by computing the mean across the spatial dimensions (height and width) for each channel, resulting in a vector m of shape [Batch size (B), Channels (C_in)].
- This vector captures a summary of the learned features. It is then passed through a fully connected layer with ReLU activation to introduce non-linearity, followed by a second fully connected layer that maps the hidden features to a logits vector of size 10, corresponding to the 10 CIFAR-10 classes.
This structure uses channel-wise averages to produce the final prediction.
class OutputBlock(nn.Module):
def __init__(self, in_channels, hidden_dim=256):
super().__init__()
# Hidden fully connected layer
self.fc1 = nn.Linear(in_channels, hidden_dim)
self.relu = nn.ReLU()
# Final classification layer
self.fc2 = nn.Linear(hidden_dim, 10)
def forward(self, x):
# Global average pooling over height and width = vector m
m = x.mean(dim=(2, 3))
# Additional hidden layer with ReLU activation, to introduce non-linearity
h = self.relu(self.fc1(m))
# Outputting logits for 10 classes
logits = self.fc2(h)
return logits
2.3 Full architecture
The RashadNet class defines the full neural network architecture used for CIFAR-10 image classification.
- It consists of a sequence of intermediate blocks, where each block applies L independent convolutional layers in parallel and combines their outputs using learned weights generated from the input's channel-wise mean.
- The number of blocks (num_blocks) and the base number of output channels (base_channels) can be configured to control the network's depth and width.
- After processing through all intermediate blocks, the resulting feature map is passed into the OutputBlock, which performs global average pooling followed by fully connected layers to produce the final 10-class logits.
The design emphasises modularity and clarity.
class RashadNet(nn.Module):
def __init__(self, num_blocks=3, L=4, base_channels=32):
super().__init__()
# Initialising an empty list to hold the intermediate blocks
block_list = []
# CIFAR-10 RGB input
in_channels = 3
# Defining and adding each intermediate block
for i in range(num_blocks):
out_channels = base_channels * (2 ** i)
block = Block(in_channels, out_channels, L)
block_list.append(block)
in_channels = out_channels
# Stacking all intermediate blocks into a sequential module
self.blocks = nn.Sequential(*block_list)
# Final output block for classification
self.output_block = OutputBlock(in_channels)
def forward(self, x):
# Passing input through all intermediate blocks
x = self.blocks(x)
# Passing the result through the output block
logits = self.output_block(x)
return logits
# Testing and checking the output shape for a dummy input
model = RashadNet(num_blocks=3, L=4)
dummy_input = torch.randn(8, 3, 32, 32)
print("Output shape:", model(dummy_input).shape)
Output shape: torch.Size([8, 10])
This test verifies that the RashadNet model produces outputs of the correct shape when given a dummy input tensor simulating a batch of CIFAR-10 images. We create a random input of shape [8, 3, 32, 32], representing a batch of 8 RGB images, and pass it through the model. The output shape [8, 10] confirms that the network correctly processes the input and returns a logits vector of length 10 for each image, matching the number of classes in CIFAR-10. This ensures that the model is structurally sound and ready for training.
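As an optional extra check (not in the original notebook), the model's capacity can also be inspected by counting its trainable parameters; the exact figure depends on num_blocks, L, and base_channels:
# Counting trainable parameters of the model defined above (illustrative only)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params:,}")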
3 Training and testing
In this section, I train the custom neural network on the CIFAR-10 dataset. The model is trained using the cross-entropy loss function, with freedom to experiment with other hyperparameter settings.
Throughout training, both training and test accuracy are tracked and reported after each epoch. The goal is to evaluate the model's performance over time and ensure it learns to classify images correctly. This section also sets the foundation for later analysis and comparison in section 4, where further improvements will be introduced.
# Libraries used for section 3
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
3.1 Accuracy function
This function, compute_accuracy, calculates the classification accuracy of the model by comparing predicted class labels to the true labels.
- It takes the model's output logits and determines the predicted class for each input using argmax, which selects the index of the highest logit value.
- It then compares these predictions to the ground-truth labels, computes the proportion of correct predictions, and returns the result as a float.
# Accuracy function
def compute_accuracy(logits, labels):
preds = logits.argmax(dim=1)
return (preds == labels).float().mean().item()
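A tiny illustrative call (with made-up logits, not from the notebook) shows the expected behaviour:
# Hypothetical example: 2 of 3 predictions are correct, so accuracy is ~0.667
example_logits = torch.tensor([[2.0, 0.1, 0.0],
                               [0.2, 1.5, 0.0],
                               [0.0, 0.3, 0.9]])
example_labels = torch.tensor([0, 1, 1])
print(compute_accuracy(example_logits, example_labels))  # 0.666...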
3.2 Main training and evaluation loop
This function handles the full training and evaluation loop for the neural network model.
- It uses the Adam optimiser (with the recommended default learning rate of 0.001 as specified by the authors of the Adam optimiser), and cross-entropy loss to train the model over a specified number of epochs.
- During each epoch, the model is trained on batches from the training set (train_iter), and both loss and accuracy are tracked.
- After each training epoch, the model is evaluated on the test set (test_iter) using torch.no_grad() to disable gradient computations.
- The function logs training loss, training accuracy, and test accuracy at each epoch, prints them to the console, and returns these metrics for later visualisation.
# Training and evaluation loop
def train_model(model, train_iter, test_iter, num_epochs=10, learning_rate=0.001, device='cuda' if torch.cuda.is_available() else 'cpu'):
model = model.to(device)
# Loss and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Logging variables
train_losses = []
train_accuracies = []
test_accuracies = []
for epoch in range(num_epochs):
model.train()
epoch_loss = 0
correct = 0
total = 0
for inputs, labels in train_iter:
inputs, labels = inputs.to(device), labels.to(device)
# Forward
outputs = model(inputs)
loss = criterion(outputs, labels)
# Backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Tracking loss and accuracy
epoch_loss += loss.item()
correct += (outputs.argmax(1) == labels).sum().item()
total += labels.size(0)
train_losses.append(loss.item())
train_acc = correct / total
train_accuracies.append(train_acc)
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_iter:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
correct += (outputs.argmax(1) == labels).sum().item()
total += labels.size(0)
test_acc = correct / total
test_accuracies.append(test_acc)
print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} | Training accuracy: {train_acc:.4f} | Test accuracy: {test_acc:.4f}")
# Return logs for plotting
return train_losses, train_accuracies, test_accuracies
3.3 Plotting function
This function visualises key training metrics to help monitor model performance over time. It displays two subplots:
- The training loss recorded at each batch.
- Training and test accuracy compared across epochs.
This visual summary provides a clear overview of how well the model is learning and generalising throughout training.
# Plotting function
def plot_training(train_losses, train_accuracies, test_accuracies):
plt.figure(figsize=(12, 4))
# Loss per batch
plt.subplot(1, 2, 1)
plt.plot(train_losses)
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.title("Training Loss")
# Training and test accuracy per epoch
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label="Train Accuracy")
plt.plot(test_accuracies, label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Train vs Test Accuracy")
plt.legend()
plt.show()
3.4 Testing the model with the training and plotting functions
The cell below runs a test training session using our RashadNet model with 3 intermediate blocks and 4 parallel convolutional layers per block.
- The model is trained for 20 epochs using the previously defined train_model function.
- After training, the best test accuracy achieved during the run is printed as a percentage, giving a quick summary of performance.
- Finally, the training loss and both training and test accuracies are plotted using the plot_training function to visualise how the model learned over time.
This step validates that the model trains correctly and provides insight into its learning behaviour. Additionally, it gives us a starting benchmark before we add further improvements and enhancements in section 4.
# Test run
model = RashadNet(num_blocks=3, L=4)
train_losses, train_accs, test_accs = train_model(model, train_iter, test_iter, num_epochs=20)
print(f"Best test accuracy: {max(test_accs)*100:.2f}%")
plot_training(train_losses, train_accs, test_accs)
Epoch 1/20 - Loss: 796.2382 | Training accuracy: 0.2356 | Test accuracy: 0.2643
Epoch 2/20 - Loss: 770.4613 | Training accuracy: 0.2703 | Test accuracy: 0.2899
Epoch 3/20 - Loss: 759.3736 | Training accuracy: 0.2860 | Test accuracy: 0.3113
Epoch 4/20 - Loss: 742.3297 | Training accuracy: 0.3060 | Test accuracy: 0.3180
Epoch 5/20 - Loss: 728.3378 | Training accuracy: 0.3215 | Test accuracy: 0.3258
Epoch 6/20 - Loss: 718.1654 | Training accuracy: 0.3353 | Test accuracy: 0.3466
Epoch 7/20 - Loss: 704.6519 | Training accuracy: 0.3519 | Test accuracy: 0.3588
Epoch 8/20 - Loss: 691.3199 | Training accuracy: 0.3683 | Test accuracy: 0.3729
Epoch 9/20 - Loss: 680.2092 | Training accuracy: 0.3785 | Test accuracy: 0.3788
Epoch 10/20 - Loss: 672.3048 | Training accuracy: 0.3844 | Test accuracy: 0.3874
Epoch 11/20 - Loss: 666.2301 | Training accuracy: 0.3916 | Test accuracy: 0.3930
Epoch 12/20 - Loss: 659.9940 | Training accuracy: 0.3973 | Test accuracy: 0.3889
Epoch 13/20 - Loss: 655.1748 | Training accuracy: 0.4023 | Test accuracy: 0.4076
Epoch 14/20 - Loss: 650.4176 | Training accuracy: 0.4071 | Test accuracy: 0.4058
Epoch 15/20 - Loss: 646.3508 | Training accuracy: 0.4083 | Test accuracy: 0.4055
Epoch 16/20 - Loss: 639.5911 | Training accuracy: 0.4162 | Test accuracy: 0.4115
Epoch 17/20 - Loss: 635.6083 | Training accuracy: 0.4204 | Test accuracy: 0.4095
Epoch 18/20 - Loss: 630.2677 | Training accuracy: 0.4245 | Test accuracy: 0.4178
Epoch 19/20 - Loss: 626.6025 | Training accuracy: 0.4283 | Test accuracy: 0.4210
Epoch 20/20 - Loss: 621.7701 | Training accuracy: 0.4334 | Test accuracy: 0.4167
Best test accuracy: 42.10%
In this initial baseline test run using RashadNet with 3 blocks and 4 convolutional paths per block, the model achieved a best test accuracy of 42.10% after 20 epochs. The training and test accuracy curves show a steady upward trend, indicating that the model is learning and generalising to some extent. The training loss also decreases consistently across batches, which suggests effective gradient updates. However, the relatively modest final accuracy and the small gap between training and test accuracy imply that the model may be underfitting and could benefit from further improvements such as deeper architecture, data augmentation, or more training time. This run establishes a performance baseline for evaluating enhancements in later tests.
4 Improving the results
In this section, we explore various strategies to improve the performance of the baseline model. While I conducted a wide range of experiments involving architectural changes, regularisation techniques, and training optimisations, I have chosen to highlight two key milestones (which I grouped into waves) that resulted in the most significant improvements.
These examples demonstrate a combination of impactful ideas and diverse approaches, and serve as representative case studies of what worked best. A summary of all other experiments is provided at the end of this section for completeness.
4.1 First wave updates
The "first wave" of updates introduces a set of foundational changes aimed at improving the baseline model's training dynamics, generalisation, and representational power. These updates include data augmentation (random horizontal flips and cropping) to enrich the training data, batch normalisation after each convolution to stabilise learning, and dropout in the output block to reduce overfitting.
Additionally, the network architecture was made deeper by increasing the number of intermediate blocks, and spatial downsampling (AvgPool2d) was added between blocks to manage feature map size and encourage hierarchical feature learning.
Together, these enhancements provide a more robust starting point for further optimisation in subsequent tests.
Data augmentation
We apply data augmentation to the training images using RandomHorizontalFlip and RandomCrop. These transformations are applied before converting the images to tensors and normalising them. The goal is to artificially increase the diversity of the training data by creating slightly varied versions of each image. This helps prevent overfitting, as the model is less likely to memorise specific features of the original images and instead learns more general, robust patterns.
Notably, the test set remains untouched by augmentation to ensure evaluation is performed on unaltered data, maintaining consistency and fairness during testing.
# Training transform with data augmentation
transform_train = transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, padding=4),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# Test transform remains the same (no augmentation)
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# Datasets with new transforms
train_set = torchvision.datasets.CIFAR10(
root="./data", train=True, download=True, transform=transform_train)
test_set = torchvision.datasets.CIFAR10(
root="./data", train=False, download=True, transform=transform_test)
# DataLoaders
train_iter = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)
Files already downloaded and verified
Files already downloaded and verified
Batch normalisation
This update enhances the Block class by adding a BatchNorm2d layer after each convolution.
- Batch normalisation helps stabilise and accelerate training by normalising the activations of each layer, reducing internal covariate shift.
- By ensuring that the inputs to each subsequent layer maintain a more consistent distribution, it allows for higher learning rates and helps prevent vanishing or exploding gradients.
- This can lead to faster convergence and improved generalisation.
In this case, each of the L parallel convolutional paths in the block benefits from normalisation, promoting more stable feature learning and helping the model adapt better across training iterations.
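For reference, BatchNorm2d applies the standard per-channel normalisation described in the PyTorch documentation, where $\mu_{\mathcal{B}}$ and $\sigma^{2}_{\mathcal{B}}$ are the mean and variance computed over the current batch (and spatial positions) and $\gamma$, $\beta$ are learnable parameters:
$$
\hat{x} = \frac{x - \mu_{\mathcal{B}}}{\sqrt{\sigma^{2}_{\mathcal{B}} + \epsilon}}, \qquad y = \gamma \hat{x} + \beta
$$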
# Adding batch normalisation to our block class
class Block(nn.Module):
def __init__(self, in_channels, out_channels, L=4):
super().__init__()
# L represents the number of independent convolutional layers
self.L = L
self.out_channels = out_channels
# Defining L independent convolutional layers followed by BatchNorm
self.convs = nn.ModuleList()
for i in range(self.L):
self.convs.append(nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
nn.BatchNorm2d(out_channels) # BatchNorm added
))
# Fully connected layer that takes channel-wise averages of x as input and outputs L * C_out weights
self.fc = nn.Linear(in_channels, L * out_channels)
def forward(self, x):
# Batch size, channels, height, width
B, C_in, H, W = x.shape
# Applying each convolutional layer + batch norm independently to x
conv_outputs = []
for i in range(self.L):
Ci_x = self.convs[i](x)
conv_outputs.append(Ci_x)
# Computing the channel-wise mean of x
m = x.mean(dim=(2, 3))
# Computing weights from m using the fully connected layer
a = self.fc(m)
a = a.view(B, self.L, self.out_channels)
# Normalising weights with softmax over the L dimension
a = F.softmax(a, dim=1)
# Stacking convolutional outputs and applying weights
conv_stack = torch.stack(conv_outputs, dim=1)
# Unsqueezing weights to match output dimensions
a = a.unsqueeze(-1).unsqueeze(-1)
# Weighted sum
out = (a * conv_stack).sum(dim=1)
return out
Dropout
We introduce a Dropout layer to the output block of the network, applied after the hidden fully connected layer and its ReLU activation.
- Dropout randomly deactivates a proportion of neurons (in this case, 50%) during each training pass, which helps prevent the model from relying too heavily on specific features.
- This encourages the network to learn more robust, generalisable representations, reducing the risk of overfitting, especially when training on relatively small datasets like CIFAR-10.
The addition of dropout is particularly helpful in deeper networks and when using dense layers, where overfitting is more likely to occur.
# Updating the output block with dropout
class OutputBlock(nn.Module):
def __init__(self, in_channels, hidden_dim=256, dropout_prob=0.5):
super().__init__()
# Hidden fully connected layer
self.fc1 = nn.Linear(in_channels, hidden_dim)
self.relu = nn.ReLU()
# Dropout layer to reduce overfitting
self.dropout = nn.Dropout(p=dropout_prob)
# Final classification layer
self.fc2 = nn.Linear(hidden_dim, 10)
def forward(self, x):
# Global average pooling over height and width = vector m
m = x.mean(dim=(2, 3)) # Shape: [B, C_in]
# Hidden layer with ReLU activation
h = self.relu(self.fc1(m))
# Apply dropout
h = self.dropout(h)
# Output logits for 10 classes
logits = self.fc2(h)
return logits
Downsampling to deepen and optimise the network
This change updates our RashadNet architecture by introducing downsampling between intermediate blocks using AvgPool2d. After each block (except the last), the spatial dimensions of the feature maps are halved, effectively reducing the height and width by a factor of two.
This technique serves two main purposes:
- It helps control the computational cost and memory usage as the network depth increases.
- It encourages the network to learn increasingly abstract and high-level features in deeper layers.
# Updating the network with downsampling
class RashadNet(nn.Module):
def __init__(self, num_blocks=3, L=4, base_channels=32):
super().__init__()
# Initialising an empty list to hold the intermediate blocks
block_list = []
# CIFAR-10 RGB input
in_channels = 3
# Defining and adding each intermediate block
for i in range(num_blocks):
out_channels = base_channels * (2 ** min(i, 3)) # Cap channel size at 256
block = Block(in_channels, out_channels, L)
block_list.append(block)
# Add downsampling *after* block if not the last one
if i < num_blocks - 1:
block_list.append(nn.AvgPool2d(kernel_size=2)) # Downsample: H/2, W/2
in_channels = out_channels # Update for the next block
# Stacking all intermediate blocks (and downsampling layers) into a sequential module
self.blocks = nn.Sequential(*block_list)
# Final output block for classification
self.output_block = OutputBlock(in_channels)
def forward(self, x):
# Passing input through all intermediate blocks
x = self.blocks(x)
# Passing the result through the output block
logits = self.output_block(x)
return logits
We also increase the model's capacity by setting num_blocks=6, which results in a deeper network.
By deepening the architecture, the network can learn more complex and hierarchical representations of the input data. This change aims to enhance the model's expressiveness and improve its performance on more challenging visual patterns in CIFAR-10.
However, deeper models also risk overfitting, which we aim to mitigate with the techniques introduced above, such as dropout and batch normalisation.
# Increasing the network blocks
model = RashadNet(num_blocks=6, L=4)
First wave results
train_losses, train_accs, test_accs = train_model(model, train_iter, test_iter, num_epochs=40)
print(f"Best test accuracy: {max(test_accs)*100:.2f}%")
plot_training(train_losses, train_accs, test_accs)
Epoch 1/40 - Loss: 754.5791 | Training accuracy: 0.2912 | Test accuracy: 0.3475
Epoch 2/40 - Loss: 699.1657 | Training accuracy: 0.3521 | Test accuracy: 0.3948
Epoch 3/40 - Loss: 666.3104 | Training accuracy: 0.3877 | Test accuracy: 0.4331
Epoch 4/40 - Loss: 635.3993 | Training accuracy: 0.4181 | Test accuracy: 0.4527
Epoch 5/40 - Loss: 609.7160 | Training accuracy: 0.4404 | Test accuracy: 0.4724
Epoch 6/40 - Loss: 591.3145 | Training accuracy: 0.4610 | Test accuracy: 0.4896
Epoch 7/40 - Loss: 578.8761 | Training accuracy: 0.4722 | Test accuracy: 0.4944
Epoch 8/40 - Loss: 565.9846 | Training accuracy: 0.4859 | Test accuracy: 0.5085
Epoch 9/40 - Loss: 551.1728 | Training accuracy: 0.4977 | Test accuracy: 0.5198
Epoch 10/40 - Loss: 542.3439 | Training accuracy: 0.5073 | Test accuracy: 0.5103
Epoch 11/40 - Loss: 533.1482 | Training accuracy: 0.5147 | Test accuracy: 0.5240
Epoch 12/40 - Loss: 521.6773 | Training accuracy: 0.5270 | Test accuracy: 0.5229
Epoch 13/40 - Loss: 513.0874 | Training accuracy: 0.5330 | Test accuracy: 0.5314
Epoch 14/40 - Loss: 503.2199 | Training accuracy: 0.5417 | Test accuracy: 0.5364
Epoch 15/40 - Loss: 492.1076 | Training accuracy: 0.5505 | Test accuracy: 0.5370
Epoch 16/40 - Loss: 484.3859 | Training accuracy: 0.5581 | Test accuracy: 0.5472
Epoch 17/40 - Loss: 475.1830 | Training accuracy: 0.5668 | Test accuracy: 0.5442
Epoch 18/40 - Loss: 464.7450 | Training accuracy: 0.5764 | Test accuracy: 0.5469
Epoch 19/40 - Loss: 456.2599 | Training accuracy: 0.5826 | Test accuracy: 0.5470
Epoch 20/40 - Loss: 446.9204 | Training accuracy: 0.5923 | Test accuracy: 0.5505
Epoch 21/40 - Loss: 437.2596 | Training accuracy: 0.5993 | Test accuracy: 0.5514
Epoch 22/40 - Loss: 425.4121 | Training accuracy: 0.6080 | Test accuracy: 0.5519
Epoch 23/40 - Loss: 419.5061 | Training accuracy: 0.6147 | Test accuracy: 0.5499
Epoch 24/40 - Loss: 407.8815 | Training accuracy: 0.6246 | Test accuracy: 0.5614
Epoch 25/40 - Loss: 397.1849 | Training accuracy: 0.6330 | Test accuracy: 0.5440
Epoch 26/40 - Loss: 388.5386 | Training accuracy: 0.6403 | Test accuracy: 0.5485
Epoch 27/40 - Loss: 378.2871 | Training accuracy: 0.6505 | Test accuracy: 0.5558
Epoch 28/40 - Loss: 369.4858 | Training accuracy: 0.6575 | Test accuracy: 0.5513
Epoch 29/40 - Loss: 357.8976 | Training accuracy: 0.6670 | Test accuracy: 0.5476
Epoch 30/40 - Loss: 351.3728 | Training accuracy: 0.6750 | Test accuracy: 0.5480
Epoch 31/40 - Loss: 343.3703 | Training accuracy: 0.6793 | Test accuracy: 0.5539
Epoch 32/40 - Loss: 331.2982 | Training accuracy: 0.6899 | Test accuracy: 0.5581
Epoch 33/40 - Loss: 325.5694 | Training accuracy: 0.6965 | Test accuracy: 0.5547
Epoch 34/40 - Loss: 315.7147 | Training accuracy: 0.7057 | Test accuracy: 0.5519
Epoch 35/40 - Loss: 307.7898 | Training accuracy: 0.7098 | Test accuracy: 0.5541
Epoch 36/40 - Loss: 296.8432 | Training accuracy: 0.7202 | Test accuracy: 0.5459
Epoch 37/40 - Loss: 291.0907 | Training accuracy: 0.7289 | Test accuracy: 0.5465
Epoch 38/40 - Loss: 286.1637 | Training accuracy: 0.7337 | Test accuracy: 0.5503
Epoch 39/40 - Loss: 275.0402 | Training accuracy: 0.7431 | Test accuracy: 0.5519
Epoch 40/40 - Loss: 268.0697 | Training accuracy: 0.7501 | Test accuracy: 0.5460
Best test accuracy: 56.14%
The results of the first wave updates show a clear and meaningful improvement over the baseline model. The best test accuracy reached 56.14%, a significant increase from the baseline's 42.10%.
The training loss steadily decreased across batches, and both training and test accuracy improved consistently over the 40 epochs. However, while training accuracy continued to rise, test accuracy plateaued slightly towards the end, suggesting the onset of overfitting. This trend indicates that the model is learning effectively but may now benefit from additional regularisation or training strategies.
Overall, the combined enhancements (data augmentation, batch normalisation, dropout, downsampling, and increased depth) successfully boosted the model's learning capacity and generalisation.
4.2 Second wave updates
In the second wave of updates, we focused on targeted refinements to further enhance the network's performance and generalisation. Firstly, we fine-tuned the dropout probability within the output block, reducing it to 0.3 to strike a balance between effective regularisation and preserving valuable model capacity. Secondly, we introduced label smoothing and a cosine annealing learning rate scheduler to our training process. Label smoothing helps reduce overconfidence in model predictions, while the cosine annealing scheduler dynamically adjusts the learning rate, initially allowing rapid learning followed by fine-grained optimisation. Lastly, we replaced average pooling with MaxPool, enabling the network to better preserve salient features by focusing on the most significant activations.
Collectively, these updates boosted the model's accuracy and robustness, yielding noticeably improved generalisation to unseen data.
Further tuning dropout probability
In this update, the dropout probability was adjusted from 0.5 to 0.3. Lowering the dropout rate can help the model retain more learned features during training, potentially improving performance by balancing regularisation to prevent overfitting while allowing the network to capture more meaningful patterns in the data. This careful tuning aims to achieve a better generalisation performance.
# Updating the output block with new dropout value
class OutputBlock(nn.Module):
def __init__(self, in_channels, hidden_dim=256, dropout_prob=0.3):
super().__init__()
# Hidden fully connected layer
self.fc1 = nn.Linear(in_channels, hidden_dim)
self.relu = nn.ReLU()
# Dropout layer to reduce overfitting
self.dropout = nn.Dropout(p=dropout_prob)
# Final classification layer
self.fc2 = nn.Linear(hidden_dim, 10)
def forward(self, x):
# Global average pooling over height and width = vector m
m = x.mean(dim=(2, 3)) # Shape: [B, C_in]
# Hidden layer with ReLU activation
h = self.relu(self.fc1(m))
# Apply dropout
h = self.dropout(h)
# Output logits for 10 classes
logits = self.fc2(h)
return logits
Label smoothing and Cosine Annealing LR scheduler
In the updated training function, two key enhancements have been introduced to improve the model's performance and generalisation: label smoothing and a cosine annealing learning rate scheduler.
- Label smoothing is applied by setting label_smoothing=0.1 in the cross-entropy loss function. This technique prevents the model from becoming overly confident in its predictions by slightly softening the target labels, which helps reduce overfitting and improves the model's ability to generalise, especially when the training data contains noisy or ambiguous labels.
- The cosine annealing learning rate scheduler gradually reduces the learning rate following a cosine curve, starting high to allow rapid learning in early epochs and tapering off for fine-grained optimisation in later epochs. This dynamic scheduling helps the model converge more effectively by avoiding sharp drops in the learning rate, improving both stability and final accuracy. Additionally, keeping the learning rate relatively high early on lets the optimiser explore more of the loss landscape before the schedule gradually settles it into a good minimum, potentially achieving better generalisation performance.
Together, these changes make the training process more robust and adaptive, often resulting in better overall performance.
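For reference, both mechanisms have standard formulations (matching the PyTorch documentation), where $\varepsilon$ is the smoothing factor, $K$ the number of classes, $c$ the true class, and $T_{\max}$ the scheduler period:
$$
\tilde{y}_k = (1-\varepsilon)\,\mathbb{1}[k=c] + \frac{\varepsilon}{K}, \qquad
\eta_t = \eta_{\min} + \tfrac{1}{2}\big(\eta_{\max} - \eta_{\min}\big)\left(1 + \cos\frac{\pi t}{T_{\max}}\right)
$$
In our setup, $\varepsilon = 0.1$, $K = 10$, $\eta_{\max} = 0.001$, and $\eta_{\min} = 10^{-5}$.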
# Updating the training function with label smoothing and the cosine annealing LR scheduler
def train_model(model, train_iter, test_iter, num_epochs=10, learning_rate=0.001, device='cuda' if torch.cuda.is_available() else 'cpu'):
model = model.to(device)
# Loss function with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
# Optimiser
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Cosine Annealing learning rate scheduler
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-5)
# Logging variables
train_losses = []
train_accuracies = []
test_accuracies = []
for epoch in range(num_epochs):
model.train()
epoch_loss = 0
correct = 0
total = 0
for inputs, labels in train_iter:
inputs, labels = inputs.to(device), labels.to(device)
# Forward
outputs = model(inputs)
loss = criterion(outputs, labels)
# Backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Track loss and accuracy
epoch_loss += loss.item()
correct += (outputs.argmax(1) == labels).sum().item()
total += labels.size(0)
train_losses.append(loss.item())
train_acc = correct / total
train_accuracies.append(train_acc)
# Step the LR scheduler
scheduler.step()
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_iter:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
correct += (outputs.argmax(1) == labels).sum().item()
total += labels.size(0)
test_acc = correct / total
test_accuracies.append(test_acc)
print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} | Training accuracy: {train_acc:.4f} | Test accuracy: {test_acc:.4f}")
return train_losses, train_accuracies, test_accuracies
MaxPool
In the below update to the RashadNet architecture, the key change is the replacement of average pooling with max pooling for spatial downsampling, applied only after the first five intermediate blocks. This adjustment helps control the spatial dimensions of the feature maps, preventing them from shrinking to invalid sizes during deep forward passes, especially in deeper networks.
Max pooling is particularly effective in highlighting the most salient features in an image, as it retains only the highest activations within local regions. This makes the network more robust to small translations and distortions in the input images, and helps focus on the most informative parts of the image for classification. Limiting pooling to the first few blocks also preserves enough spatial resolution in later layers, which is important for fine-grained feature extraction.
# Updating the network with max pooling
class RashadNet(nn.Module):
def __init__(self, num_blocks=3, L=4, base_channels=32):
super().__init__()
# Initialising an empty list to hold the intermediate blocks
block_list = []
# CIFAR-10 RGB input
in_channels = 3
# Defining and adding each intermediate block
for i in range(num_blocks):
out_channels = base_channels * (2 ** min(i, 3)) # Cap channel size at 256
block = Block(in_channels, out_channels, L)
block_list.append(block)
# Pool only on first 5 blocks
if i < 5:
# Changed to MaxPool2d
block_list.append(nn.MaxPool2d(kernel_size=2))
in_channels = out_channels
# Stacking all intermediate blocks (and downsampling layers) into a sequential module
self.blocks = nn.Sequential(*block_list)
# Final output block for classification
self.output_block = OutputBlock(in_channels)
def forward(self, x):
# Passing input through all intermediate blocks
x = self.blocks(x)
# Passing the result through the output block
logits = self.output_block(x)
return logits
We have increased the number of intermediate blocks from 6 to 10, deepening the network, allowing it to learn more complex and hierarchical features from the input images. Additionally, we increased the base_channels parameter from 32 to 64, effectively widening the network by providing each convolutional layer with more feature maps. This combination of increased depth and width helps the network capture richer, more detailed representations, improving the overall classification accuracy.
# Further increasing the network blocks (from 6 to 10) and base channels from 32 to 64
model = RashadNet(num_blocks=10, L=4, base_channels=64)
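As with the baseline in section 2, a quick dummy-input check (not part of the original run) confirms that the deeper, wider model still maps a CIFAR-10 batch to 10-class logits; with max pooling after the first five blocks, the feature maps shrink from 32x32 down to 1x1 before the output block.
# Hypothetical sanity check mirroring the test in section 2
dummy_input = torch.randn(8, 3, 32, 32)
print("Output shape:", model(dummy_input).shape)  # expected: torch.Size([8, 10])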
Second wave results
train_losses, train_accs, test_accs = train_model(model, train_iter, test_iter, num_epochs=40)
print(f"Best test accuracy: {max(test_accs)*100:.2f}%")
plot_training(train_losses, train_accs, test_accs)
Epoch 1/40 - Loss: 675.2219 | Training accuracy: 0.4208 | Test accuracy: 0.5303
Epoch 2/40 - Loss: 557.3429 | Training accuracy: 0.5864 | Test accuracy: 0.6404
Epoch 3/40 - Loss: 503.2689 | Training accuracy: 0.6502 | Test accuracy: 0.6826
Epoch 4/40 - Loss: 471.1875 | Training accuracy: 0.6918 | Test accuracy: 0.6627
Epoch 5/40 - Loss: 446.2939 | Training accuracy: 0.7218 | Test accuracy: 0.7265
Epoch 6/40 - Loss: 425.8395 | Training accuracy: 0.7478 | Test accuracy: 0.7444
Epoch 7/40 - Loss: 411.5328 | Training accuracy: 0.7630 | Test accuracy: 0.7646
Epoch 8/40 - Loss: 395.5109 | Training accuracy: 0.7817 | Test accuracy: 0.7689
Epoch 9/40 - Loss: 384.6158 | Training accuracy: 0.7984 | Test accuracy: 0.7932
Epoch 10/40 - Loss: 372.4356 | Training accuracy: 0.8092 | Test accuracy: 0.7890
Epoch 11/40 - Loss: 362.0540 | Training accuracy: 0.8225 | Test accuracy: 0.8028
Epoch 12/40 - Loss: 353.4043 | Training accuracy: 0.8312 | Test accuracy: 0.8088
Epoch 13/40 - Loss: 344.4180 | Training accuracy: 0.8410 | Test accuracy: 0.8188
Epoch 14/40 - Loss: 336.5886 | Training accuracy: 0.8497 | Test accuracy: 0.8202
Epoch 15/40 - Loss: 328.3080 | Training accuracy: 0.8577 | Test accuracy: 0.8235
Epoch 16/40 - Loss: 320.4020 | Training accuracy: 0.8678 | Test accuracy: 0.8210
Epoch 17/40 - Loss: 313.9059 | Training accuracy: 0.8757 | Test accuracy: 0.8306
Epoch 18/40 - Loss: 306.4146 | Training accuracy: 0.8830 | Test accuracy: 0.8365
Epoch 19/40 - Loss: 299.5489 | Training accuracy: 0.8896 | Test accuracy: 0.8388
Epoch 20/40 - Loss: 293.6745 | Training accuracy: 0.8965 | Test accuracy: 0.8394
Epoch 21/40 - Loss: 286.7109 | Training accuracy: 0.9058 | Test accuracy: 0.8490
Epoch 22/40 - Loss: 281.2311 | Training accuracy: 0.9106 | Test accuracy: 0.8385
Epoch 23/40 - Loss: 275.5952 | Training accuracy: 0.9170 | Test accuracy: 0.8497
Epoch 24/40 - Loss: 270.4363 | Training accuracy: 0.9226 | Test accuracy: 0.8460
Epoch 25/40 - Loss: 264.7259 | Training accuracy: 0.9285 | Test accuracy: 0.8535
Epoch 26/40 - Loss: 260.1237 | Training accuracy: 0.9324 | Test accuracy: 0.8565
Epoch 27/40 - Loss: 255.7247 | Training accuracy: 0.9387 | Test accuracy: 0.8575
Epoch 28/40 - Loss: 251.0470 | Training accuracy: 0.9432 | Test accuracy: 0.8593
Epoch 29/40 - Loss: 247.9823 | Training accuracy: 0.9481 | Test accuracy: 0.8582
Epoch 30/40 - Loss: 244.7038 | Training accuracy: 0.9499 | Test accuracy: 0.8588
Epoch 31/40 - Loss: 240.9654 | Training accuracy: 0.9541 | Test accuracy: 0.8620
Epoch 32/40 - Loss: 237.5728 | Training accuracy: 0.9589 | Test accuracy: 0.8600
Epoch 33/40 - Loss: 234.3296 | Training accuracy: 0.9617 | Test accuracy: 0.8619
Epoch 34/40 - Loss: 233.1574 | Training accuracy: 0.9636 | Test accuracy: 0.8654
Epoch 35/40 - Loss: 230.5739 | Training accuracy: 0.9670 | Test accuracy: 0.8622
Epoch 36/40 - Loss: 228.9546 | Training accuracy: 0.9681 | Test accuracy: 0.8640
Epoch 37/40 - Loss: 227.8908 | Training accuracy: 0.9691 | Test accuracy: 0.8651
Epoch 38/40 - Loss: 227.2022 | Training accuracy: 0.9697 | Test accuracy: 0.8669
Epoch 39/40 - Loss: 227.7611 | Training accuracy: 0.9692 | Test accuracy: 0.8684
Epoch 40/40 - Loss: 226.3471 | Training accuracy: 0.9710 | Test accuracy: 0.8665
Best test accuracy: 86.84%
The results from the second wave of updates show significant improvements in both training and test accuracy. The model achieved a best test accuracy of 86.84%, a clear improvement over the first wave's 56.14%. The plots indicate healthy convergence, with training loss consistently decreasing and training accuracy progressively improving, eventually exceeding 97%. There is a noticeable gap between training and test accuracy (roughly 97% versus 87%), suggesting some degree of overfitting; however, the gap is not excessive, and the model still generalises well to unseen data.
Further fine-tuning, such as adjusting regularisation techniques or slightly increasing dropout, could potentially narrow this gap and enhance performance even further.
4.3 Summary of updates (including additional test results)
Below is a summary table of the key tests, showing the main changes made to the model and how each affected the best test accuracy.
| Test | Description | Key Parameter Changes | Best Test Accuracy |
|---|---|---|---|
| Baseline | Initial model | num_blocks=3, base_channels=32, epochs=20 | 42.10% |
| 1 | First wave updates | Data augmentation, batch normalisation, dropout (0.5), deeper network (num_blocks=6) with downsampling (AvgPool2d), epochs=40 | 56.14% |
| 2 | Second wave updates | Label smoothing (0.1), cosine annealing LR, dropout update (0.3), MaxPool2d, deeper network (num_blocks=10), wider network (base_channels=64), epochs=40 | 86.84% |
Additional tests were run with further optimisation techniques; however, none surpassed the score achieved with the second wave updates. Their results are documented below for reference:
| Test | Description | Best Test Accuracy |
|---|---|---|
| 3 | Random erasing ("Cutout") augmentation | 84.74% |
| 4 | MixUp augmentation with label smoothing (0.1) | 84.70% |
| 5 | MixUp augmentation without label smoothing | 85.03% |