# Daily Dose of Data Science

This notebook contains the code accompanying the following article on optimizing neural network training under memory constraints:

[Gradient Accumulation: Increase Batch Size Without Explicitly Increasing Batch Size](https://www.blog.dailydoseofds.com/p/gradient-accumulation-increase-batch)

Author: Avi Chawla

## Imports

In [1]:
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import numpy as np
import pandas as pd

from time import time
from tqdm import tqdm
from torch.utils.data import DataLoader

## Data

In [2]:
# Preprocessing applied to every image: tensor conversion, then normalization
# with mean 0.5 and std 0.5 on the single channel.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

# Arguments shared by the train and test splits.
mnist_kwargs = dict(root='./data', download=True, transform=transform)

# Training split is reshuffled on every pass over the loader.
trainset = torchvision.datasets.MNIST(train=True, **mnist_kwargs)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)

# Test split is iterated in a fixed order.
testset = torchvision.datasets.MNIST(train=False, **mnist_kwargs)
testloader = DataLoader(testset, batch_size=32, shuffle=False)

## Network

In [3]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 512 -> 256 -> 128 -> 10.

    ReLU is applied after each of the first three linear layers; the final
    layer returns raw, unnormalized class scores (logits).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 10)

    def forward(self, x):
        """Compute class logits for a batch of inputs.

        Args:
            x: Tensor whose first dimension is the batch and whose remaining
                dimensions hold 28 * 28 = 784 values per sample.

        Returns:
            Tensor of shape (batch, 10) with one logit per class.
        """
        # Keep the batch dimension and collapse the rest. Unlike `view`,
        # `flatten` also works when the input is not contiguous in memory.
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        return self.fc4(x)

## Evaluate model 

In [4]:
def evaluate(model: nn.Module, loader=None) -> float:
    """Return the classification accuracy of `model` over a data loader.

    The model is switched to evaluation mode and is left in that mode when
    the function returns.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            (batch, num_classes).
        loader: Iterable yielding `(inputs, labels)` batches. Defaults to the
            notebook-level `testloader` when omitted.

    Returns:
        Fraction of correctly classified samples in [0, 1], or 0.0 when the
        loader yields no samples.
    """
    if loader is None:
        loader = testloader

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            # The predicted class is the index of the highest score per row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    if total == 0:
        return 0.0
    return correct / total

## Gradient Accumulation

In [5]:
# Number of consecutive mini-batches whose gradients are accumulated before
# each optimizer step in the gradient-accumulation training loop.
accumulation_steps = 4

In [6]:
# Training with gradient accumulation: gradients from `accumulation_steps`
# consecutive mini-batches are accumulated before a single optimizer step.
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

num_batches = len(trainloader)
# Size of the final accumulation window, which is shorter than
# `accumulation_steps` when the number of batches is not a multiple of it.
last_window = num_batches % accumulation_steps or accumulation_steps

for epoch in range(2):
    net.train()
    running_loss = 0.0
    # Start every epoch from clean gradients.
    optimizer.zero_grad()

    for idx, (inputs, labels) in enumerate(trainloader):
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # `backward()` adds to the existing gradients, so divide each loss by
        # the number of mini-batches in its window. The accumulated gradient
        # is then the mean over the window (as one large batch would give for
        # equally sized mini-batches) instead of the sum.
        in_last_window = idx >= num_batches - last_window
        window = last_window if in_last_window else accumulation_steps
        (loss / window).backward()

        # Update at the end of every full window, and on the last batch so
        # that a trailing partial window is not dropped.
        if ((idx + 1) % accumulation_steps == 0) or ((idx + 1) == num_batches):
            optimizer.step()
            optimizer.zero_grad()

        # Track the unscaled loss so the reported value stays a per-batch mean.
        running_loss += loss.item()

    accuracy = evaluate(net)

    print(f"Epoch {epoch + 1}, Loss: {round(running_loss / len(trainloader), 2)}, Accuracy: {accuracy * 100:.2f}%")

Epoch 1, Loss: 0.38, Accuracy: 94.20%
Epoch 2, Loss: 0.15, Accuracy: 95.68%


## Typical Training

In [7]:
# Baseline for comparison: a standard loop that updates the weights after
# every mini-batch.
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(2):
    net.train()
    running_loss = 0.0

    for inputs, labels in trainloader:
        # Gradients are cleared on every iteration, so each update is based
        # on a single mini-batch.
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Accuracy on the test loader after this epoch's updates.
    accuracy = evaluate(net)

    print(f"Epoch {epoch + 1}, Loss: {round(running_loss / len(trainloader), 2)}, Accuracy: {accuracy * 100:.2f}%")

Epoch 1, Loss: 0.31, Accuracy: 94.87%
Epoch 2, Loss: 0.15, Accuracy: 95.14%
