Things covered in today's lecture:
- Training vs. validation vs. testing sets.
- The PyTorch Dataset class
- The PyTorch DataLoader class
- Rewriting past models using torch.nn
Experimental Setup for Machine Learning Problems¶
The purpose of any machine learning model is to apply the trained model to new, unseen data. In many cases, a machine learning model may perform nearly perfectly, i.e. close to 100% classification accuracy, on the data it is trained on. However, we need some way to evaluate the model's ability to generalize to new data. The most common approach to training and testing a model is to partition a dataset into a training set, a validation set, and a test set.
For a dataset $\mathcal{D}=\{(x_i, y_i)\}_{i=1}^{N}$, each of these sets are defined as:
- Training set: $\mathcal{D}_{\textrm{train}}=\{(x_i, y_i)\}_{i=1}^{N_\textrm{train}}$ is the collection of data on which we train the model.
- Validation set: $\mathcal{D}_{\textrm{val}}=\{(x_i, y_i)\}_{i=1}^{N_\textrm{val}}$ is the collection of data which the model does not train on, but we use to evaluate how the model generalizes to new data. We then use the validation set to tune any hyperparameters of the model or learning algorithm, e.g. learning rate, how long we train, choice of weight decay, etc.
- Test set: $\mathcal{D}_{\textrm{test}}=\{(x_i, y_i)\}_{i=1}^{N_\textrm{test}}$ is the collection of data which the model does not train on and which we do not use to modify hyperparameters. Thus, the test set is the final evaluation of model generalization and should be the primary method for comparing model performance.
Note that these sets are disjoint and thus share no data points. The size of each partition is a choice that may depend on the application, but in general we usually reserve at least half of the data for training and roughly equal portions of the remainder for validation and testing.
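As a preview of the tooling we will use below, here is a minimal sketch of one way to produce such a partition in PyTorch with the torch.utils.data.random_split helper (the 1000-point placeholder dataset and the 80%/10%/10% ratios are just illustrative assumptions; later in this lecture we will instead build the splits by hand with index samplers).
import torch
from torch.utils.data import TensorDataset, random_split
# placeholder dataset: 1000 random 2-D points with random binary labels
full_dataset = TensorDataset(torch.randn(1000, 2), torch.randint(0, 2, (1000,)))
# 80% / 10% / 10% partition; random_split shuffles the indices for us
n_train = int(0.8 * len(full_dataset))
n_val = int(0.1 * len(full_dataset))
n_test = len(full_dataset) - n_train - n_val
train_set, val_set, test_set = random_split(full_dataset, [n_train, n_val, n_test])
print(len(train_set), len(val_set), len(test_set))  # 800 100 100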
PyTorch Datasets¶
PyTorch offers an abstract base class for creating datasets that simplifies the process of building, manipulating, and sampling datasets, e.g. into training, validation, and testing sets. The torch.utils.data.Dataset class requires any new class that inherits from this base class to implement three methods:
- __init__: The __init__ method is the constructor for the new dataset. Unlike the nn.Module class, the base class constructor does not need to be called, i.e. we do not need to call super().__init__(). The constructor is most commonly used to establish the data for the dataset or to assign attributes that will assist the data retrieval process in the __getitem__ method.
- __len__: The __len__ method overrides the len() function in Python to determine the length of the dataset. In other words, for a dataset named my_dataset, the implemented __len__ method allows len(my_dataset) to return the length of the dataset.
- __getitem__: The __getitem__ method overloads the use of brackets to index items in a dataset. For example, a dataset named my_dataset will call the __getitem__ method when we use my_dataset[i], and the index i is passed as input to the __getitem__ method.
Let's take a look at an example dataset by implementing the toy dataset from the previous lecture.
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
class TwoClassDataset(Dataset):
# don't forget the self identifier!
def __init__(self, N, sigma):
self.N = N # number of data points per class
self.sigma = sigma # standard deviation of each class cluster
self.plus_class = self.sigma*torch.randn(N, 2) + torch.tensor([-1, 1])
self.negative_class = self.sigma*torch.randn(N, 2) + torch.tensor([1, -1])
self.data = torch.cat((self.plus_class, self.negative_class), dim=0)
self.labels = torch.cat((torch.ones(self.N), torch.zeros(self.N)))
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
return x, y # return input and output pair
N = 50
sigma = 1.5
dataset = TwoClassDataset(N, sigma)
plus_data = dataset.plus_class
negative_data = dataset.negative_class
print('Dataset has {} points'.format(len(dataset)))
idx = 2
x, y = dataset[idx]
print('Dataset point with index {} is at x={} and label y={}'.format(idx, x, y))
plt.figure(figsize=(8, 6))
plt.scatter(plus_data[:, 0].numpy(), plus_data[:, 1].numpy(), color='tomato', s=50, edgecolor='black')
plt.scatter(negative_data[:, 0].numpy(), negative_data[:, 1].numpy(), color='cornflowerblue', s=50, edgecolor='black')
plt.tight_layout()
Dataset has 100 points
Dataset point with index 2 is at x=tensor([-2.6753, -0.9532]) and label y=1.0
Aside from making custom datasets, PyTorch and torchvision have many pre-loaded datasets implemented within the same Dataset interface.
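For example, here is a minimal sketch of loading one of these pre-loaded datasets, MNIST from torchvision (the ./data download directory is an arbitrary choice on our part). The resulting object supports len() and indexing exactly like the custom dataset above.
import torchvision
from torchvision import transforms
# download the MNIST training images to ./data and convert each image to a tensor
mnist_train = torchvision.datasets.MNIST(root='./data', train=True, download=True,
                                         transform=transforms.ToTensor())
print(len(mnist_train))        # 60000 training examples
image, label = mnist_train[0]  # __getitem__ returns an (image, label) pair
print(image.shape, label)      # torch.Size([1, 28, 28]) and an integer class label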
PyTorch Dataloaders¶
With a PyTorch Dataset class in hand, we may take advantage of the torch.utils.data.DataLoader interface, which simplifies the process of sampling batches of data; shuffling the dataset; partitioning into training, validation, and testing sets; and more! A DataLoader does not need to be implemented like a Dataset or nn.Module class. Instead, we only need to provide a Dataset object as input alongside several optional inputs:
- batch_size: number of examples in each batch or call to the dataloader
- shuffle: Boolean option to shuffle the dataset on each pass (epoch) through the dataset
- sampler: Sampler object that specifies how data will be extracted from the dataset. For example, the SubsetRandomSampler allows us to specify indices within the larger dataset to sample at random. This is an easy way to create training, validation, and testing sets!
- Plenty of other options that may be explored in the DataLoader documentation
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import SubsetRandomSampler
# create indices for each split of dataset
N_train = 60
N_val = 20
N_test = 20
indices = np.arange(len(dataset))
np.random.shuffle(indices)
train_indices = indices[:N_train]
val_indices = indices[N_train:N_train+N_val]
test_indices = indices[N_train+N_val:]
# create dataloader for each split
batch_size = 8
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_indices))
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_indices))
test_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(test_indices))
# data loaders are iterable
for x_batch, y_batch in val_loader:
print(x_batch, y_batch)
tensor([[-1.8366, 0.4123], [-0.0869, 4.4045], [-0.8618, -1.8265], [-3.2508, 3.2150], [ 2.1824, -3.7648], [-0.8552, -0.2620], [-0.1092, 2.2915], [ 1.0447, 0.4327]]) tensor([1., 1., 0., 1., 0., 1., 1., 0.]) tensor([[ 1.9356e+00, -1.0094e+00], [-5.8469e-01, -2.3641e-01], [ 5.6159e+00, -1.2584e+00], [ 1.7330e-01, -1.4470e+00], [-6.5362e-04, 1.4384e+00], [-1.9069e+00, -1.6281e+00], [-4.7954e+00, 7.6931e-01], [-9.0095e-01, 4.6437e+00]]) tensor([0., 1., 0., 0., 1., 1., 1., 1.]) tensor([[ 0.3596, 3.3488], [ 2.6648, -1.9620], [ 1.6563, -2.3351], [ 1.0746, -0.0341]]) tensor([1., 0., 0., 0.])
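Notice that the final batch printed above contains only 4 examples since the 20 validation points do not divide evenly into batches of 8. As an aside (not something we rely on below), the DataLoader's drop_last option discards any incomplete final batch:
# drop_last=True throws away the last, smaller batch so every batch has exactly batch_size examples
val_loader_full = DataLoader(dataset, batch_size=batch_size,
                             sampler=SubsetRandomSampler(val_indices), drop_last=True)
for x_batch, y_batch in val_loader_full:
    print(x_batch.shape)  # each batch is [8, 2]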
The New and Improved Training Loop¶
Now, let's combine these datasets and dataloaders to further simplify the training loop we used to perform the toy logistic regression problem in the previous lecture.
# code from previous lecture
import torch.nn as nn
class LogisticRegression(nn.Module):
def __init__(self, N):
super().__init__()
self.w = nn.Parameter(torch.ones(N))
self.b = nn.Parameter(torch.zeros(1))
def forward(self, x):
return 1/(1+torch.exp(-(self.w@x+self.b)))
# count the number of correct classifications (we divide by the dataset size later to get accuracy)
def model_accuracy(model, input_data, labels):
predictions = model(input_data.unsqueeze(-1)).squeeze(-1)
positive_preds = predictions >= 0.5
negative_preds = predictions < 0.5
n_correct = torch.sum(positive_preds*labels)+torch.sum(negative_preds*(1-labels))
return n_correct
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import SubsetRandomSampler
# create indices for each split of dataset
N_train = 60
N_val = 20
N_test = 20
indices = np.arange(len(dataset))
np.random.shuffle(indices)
train_indices = indices[:N_train]
val_indices = indices[N_train:N_train+N_val]
test_indices = indices[N_train+N_val:]
# create dataloader for each split
batch_size = 8
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_indices))
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_indices))
test_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(test_indices))
# training setup
criterion = nn.BCELoss(reduction='mean') # binary cross-entropy loss, use mean loss
lr = 1e-2 # learning rate
logreg_model = LogisticRegression(2) # initialize model
optimizer = torch.optim.SGD(logreg_model.parameters(), lr=lr, momentum=0.99, weight_decay=1e-3) # initialize optimizer
n_epoch = 50 # number of passes through the training dataset
loss_values, train_accuracies, val_accuracies = [], [], []
for n in range(n_epoch):
epoch_loss, epoch_acc = 0, 0
for x_batch, y_batch in train_loader:
# zero out gradients
optimizer.zero_grad()
# pass batch to model
predictions = logreg_model(x_batch.unsqueeze(-1)).squeeze(-1) # make dimensions match for loss function
# calculate loss
loss = criterion(predictions, y_batch)
# backpropagate and update
loss.backward()
optimizer.step()
# logging
epoch_loss += loss.item()
epoch_acc += model_accuracy(logreg_model, x_batch, y_batch)
loss_values.append(epoch_loss/len(train_loader))
train_accuracies.append(epoch_acc/N_train)
# validation performance
val_acc = 0
for x_batch, y_batch in val_loader:
# don't compute gradients since we are only evaluating the model
with torch.no_grad():
val_acc += model_accuracy(logreg_model, x_batch, y_batch)
val_accuracies.append(val_acc/N_val)
plt.figure(figsize=(12,6))
plt.subplot(131)
plt.semilogy(loss_values)
plt.grid(True)
plt.title('Loss values')
plt.xlabel('Epoch')
plt.subplot(132)
plt.plot(train_accuracies)
plt.grid(True)
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.subplot(133)
plt.plot(val_accuracies)
plt.grid(True)
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
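We created a test_loader above but never used it; as a sketch of the final evaluation step, we could compute the trained model's accuracy on the held-out test set in the same way we computed the validation accuracy:
# evaluate the trained model on the held-out test set (no gradients needed)
test_acc = 0
for x_batch, y_batch in test_loader:
    with torch.no_grad():
        test_acc += model_accuracy(logreg_model, x_batch, y_batch)
print('Test accuracy: {:.3f}'.format(float(test_acc)/N_test))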
Multi-class Logistic Regression¶
Recall from our lecture on multi-class logistic regression that we may perform multi-class classification using logistic regression where each class has its own weight vector and bias term. More formally, for class $k$, we have weight vector $w_k\in\mathbb{R}^n$ and bias $b_k\in\mathbb{R}$. Thus, an input $x\in\mathbb{R}^n$ receives a "score" $z_k$ for class $k$ via $$ z_k = w_k^\top x + b_k. $$
Larger scores should correspond to larger probabilities for a particular class while smaller (possibly negative) scores give smaller probabilities. For a collection of scores $z=\{z_1, z_2, \ldots, z_M\}$ across $M$ classes, we can use the softmax function to normalize these scores to a probability distribution.
$$ \textrm{softmax}(z)_k=\mathbf{Pr}\{\textrm{Class }y=k|x\} = \frac{e^{z_k}}{\sum_{j=1}^{M}e^{z_j}}. $$

Instead of computing each score one-by-one, we can put all of our parameters into a weight matrix $A$ with a bias vector $b$. Thus,

$$ \begin{align} z &= Ax+b\\ &= \begin{bmatrix} \rule[.6ex]{4ex}{0.75pt} & w_1^\top & \rule[.6ex]{4ex}{0.75pt}\\ \rule[.6ex]{4ex}{0.75pt} & w_2^\top & \rule[.6ex]{4ex}{0.75pt}\\ & \vdots & \\ \rule[.6ex]{4ex}{0.75pt} & w_M^\top & \rule[.6ex]{4ex}{0.75pt}\\ \end{bmatrix}\begin{bmatrix} \rule[-1ex]{0.5pt}{4ex}\\ x\\ \rule[1ex]{0.5pt}{4ex}\\ \end{bmatrix} +\begin{bmatrix} b_1\\ b_2\\ \vdots\\ b_M \end{bmatrix}\\ &= \begin{bmatrix} z_1\\ z_2\\ \vdots\\ z_M \end{bmatrix} \end{align} $$

In PyTorch, we can efficiently implement the multi-class logistic regression model using the nn.Linear class, which implements the parameter matrix along with the bias terms. For the implementation below, we will also not apply the softmax function ourselves since the nn.CrossEntropyLoss class expects logits or scores instead of the final probabilities.
class MulticlassLogisticRegression(nn.Module):
def __init__(self, N, M):
super().__init__()
self.N = N # input dimension
self.M = M # number of classes
self.weight_matrix = nn.Linear(N, M, bias=True) # N input dimensions, M output dimensions
def forward(self, x):
return self.weight_matrix(x)
And that's it! Again, we could compute the softmax of these logits/scores but the PyTorch implementation of cross-entropy loss asks for logits instead of probabilities. Finally, recall for model $f_\theta(x)=z$ that cross-entropy loss is given by $$ \ell_{ce}(f_\theta(x), y) = -\log\left(\textrm{softmax}(f_\theta(x))_y\right)= -\log\left(\frac{e^{z_y}}{\sum_{j=1}^{M}e^{z_j}}\right) $$ for input $x$ with label $y$ (assume class number $y$ also identifies the appropriate index in $z$, for simplicity).
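As a quick sanity check (not part of the lecture code), we can verify on a toy example that nn.CrossEntropyLoss applied to raw logits matches the negative log of the softmax probability assigned to the correct class:
# toy check that nn.CrossEntropyLoss(logits, y) equals -log(softmax(logits)_y)
logits = torch.tensor([[2.0, -1.0, 0.5, 0.0]])  # scores z for one example with M=4 classes
label = torch.tensor([0])                       # correct class index y
loss = nn.CrossEntropyLoss()(logits, label)
manual = -torch.log(torch.softmax(logits, dim=1)[0, label])
print(loss.item(), manual.item())  # the two values agree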
Example: Toy Multi-class Logistic Regression¶
Below, we provide a Dataset class for generating a toy 4-class dataset with $N=50$ samples per class and $\sigma=0.6$.
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
class FourClassDataset(Dataset):
def __init__(self, N, sigma):
self.N = N # number of data points per class
self.sigma = sigma # standard deviation of each class cluster
self.class_zero = self.sigma*torch.randn(N, 2) + torch.tensor([1, 1])
self.class_one = self.sigma*torch.randn(N, 2) + torch.tensor([-1, 1])
self.class_two = self.sigma*torch.randn(N, 2) + torch.tensor([-1, -1])
self.class_three = self.sigma*torch.randn(N, 2) + torch.tensor([1, -1])
self.data = torch.cat((self.class_zero, self.class_one, self.class_two, self.class_three), dim=0)
self.labels = torch.cat((torch.zeros(self.N), torch.ones(self.N),
2*torch.ones(self.N), 3*torch.ones(self.N))).long()
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
return x, y # return input and output pair
# visualize dataset
N = 50
sigma = 0.6
dataset = FourClassDataset(N, sigma)
class_zero = dataset.class_zero
class_one = dataset.class_one
class_two = dataset.class_two
class_three = dataset.class_three
plt.figure(figsize=(8, 6))
plt.scatter(class_zero[:, 0].numpy(), class_zero[:, 1].numpy(), color='tomato', s=50, edgecolor='black', label='Class 0')
plt.scatter(class_one[:, 0].numpy(), class_one[:, 1].numpy(), color='cornflowerblue', s=50, edgecolor='black', label='Class 1')
plt.scatter(class_two[:, 0].numpy(), class_two[:, 1].numpy(), color='seagreen', s=50, edgecolor='black', label='Class 2')
plt.scatter(class_three[:, 0].numpy(), class_three[:, 1].numpy(), color='violet', s=50, edgecolor='black', label='Class 3')
plt.grid(True)
plt.legend()
plt.tight_layout()
a) Create training, validation, and testing dataloaders with a 60%:20%:20% training:validation:testing split and batch size 16.
b) Fill in the training loop for training the MulticlassLogisticRegression model using the nn.CrossEntropyLoss function. We have provided the helper function for tracking model accuracy and comments below to help. Note: we do not need to worry about the squeezing and unsqueezing of the last dimension when passing data to the model now that we are using the nn.Linear class for our parameters.
c) Experiment with training parameters, e.g. learning rate, number of epochs, batch size, momentum, weight decay, and plot the training loss, training accuracy, and validation accuracy.
d) Observe the performance of this multi-class logistic regression model with noisier clusters, i.e. bigger values of $\sigma$.
def multiclass_model_accuracy(model, input_data, labels):
predictions = model(input_data) # no need to squeeze/unsqueeze dimensions now!
predicted_classes = torch.argmax(predictions, dim=1) # find highest scoring class along the columns
n_correct = torch.sum(torch.eq(predicted_classes, labels))
return n_correct
# Part a) create DataLoaders
N_train = 120
N_val = 40
N_test = 40
batch_size = 16
indices = np.arange(len(dataset))
np.random.shuffle(indices)
train_indices = indices[:N_train]
val_indices = indices[N_train:N_train+N_val]
test_indices = indices[N_train+N_val:]
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_indices))
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_indices))
test_loader = DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(test_indices))
# Part b) training loop
# initialize MulticlassLogisticRegression model
N = 2
M = 4
model = MulticlassLogisticRegression(N, M)
# initialize loss function and optimizer
criterion = nn.CrossEntropyLoss()
lr = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
# logging info
loss_values, train_accuracies, val_accuracies = [], [], []
n_epoch = 300 # set this value
for n in range(n_epoch):
epoch_loss, epoch_acc = 0, 0
for x_batch, y_batch in train_loader:
# zero out gradients
optimizer.zero_grad()
# pass batch to model, no need to worry about using squeeze/unsqueeze now
predictions = model(x_batch)
# calculate loss
loss = criterion(predictions, y_batch)
# backpropagate and update
loss.backward() # backprop
optimizer.step()
# logging to update epoch_loss (add loss value) and epoch_acc (add current batch accuracy)
epoch_loss += loss.item()
epoch_acc += multiclass_model_accuracy(model, x_batch, y_batch)
loss_values.append(epoch_loss/len(train_loader))
train_accuracies.append(epoch_acc/N_train)
# validation performance
val_acc = 0
for x_batch, y_batch in val_loader:
# don't compute gradients since we are only evaluating the model
with torch.no_grad():
# validation batch accuracy
val_acc += multiclass_model_accuracy(model, x_batch, y_batch)
val_accuracies.append(val_acc/N_val)
plt.figure(figsize=(12,6))
plt.subplot(131)
plt.semilogy(loss_values)
plt.grid(True)
plt.title('Loss values')
plt.xlabel('Epoch')
plt.subplot(132)
plt.plot(train_accuracies)
plt.grid(True)
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.subplot(133)
plt.plot(val_accuracies)
plt.grid(True)
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
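To see what the trained classifier has learned, one option (a sketch, assuming a reasonably recent PyTorch version for the indexing argument of torch.meshgrid) is to color the plane by the highest-scoring class over a grid of points:
# color the plane by the model's predicted class and overlay the data
grid_pts = torch.linspace(-3, 3, 200)
xx, yy = torch.meshgrid(grid_pts, grid_pts, indexing='ij')
grid = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1)
with torch.no_grad():
    regions = torch.argmax(model(grid), dim=1).reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx.numpy(), yy.numpy(), regions.numpy(), levels=[-0.5, 0.5, 1.5, 2.5, 3.5], alpha=0.3)
plt.scatter(dataset.data[:, 0].numpy(), dataset.data[:, 1].numpy(), c=dataset.labels.numpy(), s=30, edgecolor='black')
plt.tight_layout()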
Note: Why do we have to reset the gradient?¶
Remember that in our previous code, we had to constantly reset the gradient:
w_gd.grad = None  # or w_gd.grad.zero_()
optimizer.zero_grad()
Why doesn't the gradient get zeroed every time we calculate it?
It is because we might have to calculate the gradient in batches. Datasets can be huge, and most of the time we cannot store the entire dataset in memory. Accumulating the gradient so that we only update the parameters every couple of batches can also help stabilize training.
This is something you get to just play around with. There are lots of strategies and lots of published results either way; there are no firm rules on how you should train a model, and it all depends on the dataset.
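As a rough sketch of what gradient accumulation can look like (this exact pattern is our own illustration, not code from the lecture), we call backward() on several batches and only take an optimizer step once the gradients from all of them have accumulated:
accumulation_steps = 4  # update the parameters once every 4 batches
optimizer.zero_grad()
for i, (x_batch, y_batch) in enumerate(train_loader):
    predictions = model(x_batch)
    loss = criterion(predictions, y_batch) / accumulation_steps  # scale so the accumulated gradient is an average
    loss.backward()   # gradients accumulate in the parameters' .grad attributes
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()       # apply the accumulated gradient
        optimizer.zero_grad()  # reset before the next group of batches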
That's it for today¶
- Next time we'll do a review.