# Daily Dose of Data Science

Code accompanying the newsletter issue: [Label Smoothing: The Overlooked and Lesser-Talked Regularization Technique](https://www.blog.dailydoseofds.com/p/label-smoothing-the-overlooked-and)

Author: Avi Chawla

## Imports

In [None]:
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import numpy as np
import pandas as pd

from skorch import NeuralNetClassifier
from time import time
from tqdm import tqdm
from torch.utils.data import DataLoader

## Dataset

In [None]:
# Preprocessing pipeline: convert each image to a tensor, then normalize it
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Fashion MNIST train and test splits; both are stored under ./data and share
# the same preprocessing pipeline.
train_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
test_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True,
)

# Mini-batch size used by both loaders
batch_size = 64

# Only the training loader shuffles; the test loader keeps the dataset order.
trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

## Set seeds

In [None]:
# Seed NumPy's and PyTorch's global random number generators with one value.
seed = 20
np.random.seed(seed)
torch.manual_seed(seed)

## Define Neural Network

In [None]:
# Fully connected classifier: 28*28 inputs -> 512 -> 256 -> 128 -> 10 outputs
class SimpleNet(nn.Module):
    """Four-layer fully connected network.

    ``forward`` returns the activations of every layer, not only the final
    one; callers that need the class scores take the last element.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 10)

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Input tensor. Everything after the first dimension is
                flattened, so each item must hold 28 * 28 values.

        Returns:
            A 4-tuple ``(h1, h2, h3, logits)``: the ReLU outputs of the three
            hidden layers followed by the raw (un-normalized) output of the
            last layer.
        """
        flat = x.view(x.shape[0], -1)
        h1 = torch.relu(self.fc1(flat))
        h2 = torch.relu(self.fc2(h1))
        h3 = torch.relu(self.fc3(h2))
        logits = self.fc4(h3)
        return h1, h2, h3, logits

## Test evaluation

In [None]:
def evaluate(model):
    """Return the accuracy of ``model`` on the module-level ``testloader``.

    The model is switched to evaluation mode (and left in it), and gradient
    tracking is disabled while the batches are processed.

    Args:
        model: Callable module whose output is indexable; its last element
            holds the per-class scores for each sample in the batch.

    Returns:
        Fraction of samples whose highest-scoring class equals the label,
        as a float between 0 and 1.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for images, targets in testloader:
            scores = model(images)[-1]  # class scores are the last output
            predictions = scores.max(dim=1).indices
            n_seen += targets.size(0)
            n_correct += (predictions == targets).sum().item()
    return n_correct / n_seen

## Without Label Smoothing

In [None]:
# Baseline run: plain cross-entropy loss (no label smoothing), Adam optimizer.
net = SimpleNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(20):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    net.train()
    running_loss = 0.0

    for inputs, labels in trainloader:
        optimizer.zero_grad()
        logits = net(inputs)[-1]  # the last output holds the class scores
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Test accuracy after this epoch; the printed loss is the mean over batches.
    accuracy = evaluate(net)

    print(f"Epoch {epoch + 1}, Loss: {round(running_loss / len(trainloader), 2)}, Accuracy: {accuracy * 100:.2f}%")

### Output probability on a sample

In [None]:
net.eval()

with torch.no_grad():
    # Exhaust the loader so that `inputs`/`labels` hold its final batch; the
    # sample shown below is the first item of that batch.
    for inputs, labels in testloader:
        pass

    # The forward pass runs inside no_grad so autograd does not track it.
    # inputs[0] is a single sample whose leading dimension the network treats
    # as the batch dimension (presumably a size-1 channel axis, so the scores
    # are 2-D), hence the class axis is passed explicitly as dim=1 instead of
    # relying on the deprecated implicit choice.
    sample_probs = F.softmax(net(inputs[0])[-1], dim=1)

sample_probs

## With Label Smoothing

Restart the kernel and re-run the cells above before executing the cell below, but this time skip the training cell in the section without label smoothing.

In [None]:
# Same setup as the baseline run, except that the cross-entropy loss uses
# label smoothing with a factor of 0.2.
net = SimpleNet()
criterion = nn.CrossEntropyLoss(label_smoothing=0.2)
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(20):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    net.train()
    running_loss = 0.0

    for inputs, labels in trainloader:
        optimizer.zero_grad()
        logits = net(inputs)[-1]  # the last output holds the class scores
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Test accuracy after this epoch; the printed loss is the mean over batches.
    accuracy = evaluate(net)

    print(f"Epoch {epoch + 1}, Loss: {round(running_loss / len(trainloader), 2)}, Accuracy: {accuracy * 100:.2f}%")

### Output probability on a sample

In [None]:
net.eval()

with torch.no_grad():
    # Exhaust the loader so that `inputs`/`labels` hold its final batch; the
    # sample shown below is the first item of that batch.
    for inputs, labels in testloader:
        pass

    # The forward pass runs inside no_grad so autograd does not track it.
    # inputs[0] is a single sample whose leading dimension the network treats
    # as the batch dimension (presumably a size-1 channel axis, so the scores
    # are 2-D), hence the class axis is passed explicitly as dim=1 instead of
    # relying on the deprecated implicit choice.
    sample_probs = F.softmax(net(inputs[0])[-1], dim=1)

sample_probs