# CS 462 - Lecture 16

## Embeddings and Language Models

Bernhard Firner

2026-03-26

---

## Review

* Forgot to post the code for CIFAR10 training:

```python
import argparse
import os
import numpy as np
import random
from PIL import Image
import torch
import torchvision
import tarfile
from torchvision.ops.stochastic_depth import StochasticDepth

import pickle

def unpickle(file_obj):
    """
    Load a single pickled object (e.g. one CIFAR batch) from disk.

    Arguments:
        file_obj (str or path-like): Path of the pickle file to open.
    Returns:
        The unpickled object. Byte strings are left undecoded (encoding='bytes').
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on trusted files.
    with open(file_obj, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')

def read_cifar(file_path):
    """
    Read the CIFAR-10 python archive.

    The tar.gz file should contain the following files:
        readme.html
        batches.meta
        data_batch_1
        data_batch_2
        data_batch_3
        data_batch_4
        data_batch_5
        test_batch
    Arguments:
        file_path (str): Path to the gzipped tar archive.
    Returns:
        A tuple of training data, training labels, testing data, testing labels, and label names.
    """
    prefix = 'cifar-10-batches-py/'
    with tarfile.open(file_path, 'r:gz') as f:

        def load(member):
            # NOTE: pickle can execute arbitrary code; only open archives from a trusted source.
            return pickle.load(f.extractfile(prefix + member), encoding='bytes')

        # If you have limited RAM, then extracting these one at a time would be better
        train_dicts = [load(f'data_batch_{number}') for number in range(1, 6)]
        print(f"keys are {list(train_dicts[0].keys())}")
        test_dict = load('test_batch')
        metadata = load('batches.meta')

    # The five training batches are stacked in order along the first axis.
    training_data = np.concatenate([batch[b'data'] for batch in train_dicts])
    training_labels = np.concatenate([batch[b'labels'] for batch in train_dicts])
    # This should already be a numpy array of type uint8
    test_data = test_dict[b'data']
    test_labels = np.array(test_dict[b'labels'])

    label_names = metadata[b'label_names']

    return training_data, training_labels, test_data, test_labels, label_names

class Linear(torch.nn.Module):
    """A fully connected network: flatten, then four Linear layers (3*32*32 -> 512 -> 120 -> 84 -> classes)."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(Linear, self).__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Flatten(),
            torch.nn.Linear(3*32*32, 512),
            nonlinearity(),
            torch.nn.Linear(512, 120),
            nonlinearity(),
            torch.nn.Linear(120, 84),
            torch.nn.Linear(84, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the Linear layers at indices 1, 3 and 5;
        # the final Linear (index 6) keeps the PyTorch default initialization.
        for layer in [1, 3, 5]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class LeNet5(torch.nn.Module):
    """A mostly faithful recreation of LeNet 5."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(LeNet5, self).__init__()
        self.net = torch.nn.Sequential(
            # 5x5 convolution with 6 output feature maps
            torch.nn.Conv2d(3, 6, 5, padding=2),
            # 2x2 subsampling learned bias and weight, called S2 in the paper.
            # We'll use an average pool and then a 1x1 conv with 6 groups to emulate that.
            torch.nn.AvgPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(6, 6, kernel_size=1, groups=6),
            nonlinearity(),
            # 5x5 convolution with 16 output feature maps
            torch.nn.Conv2d(6, 16, kernel_size=5),
            # This again, emulating layer S4 from the paper.
            torch.nn.AvgPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(16, 16, kernel_size=1, groups=16),
            nonlinearity(),
            # The final convolution reduces features to 1x1 if this is 28x28, 2x2 if 32x32
            torch.nn.Conv2d(16, 120, kernel_size=5, stride=1),
            torch.nn.Flatten(),
            # 480 = 120 channels * 2 * 2, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(480, 84),
            # Training with cross entropy, not the exemplar method in the original LeNet
            torch.nn.Linear(84, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for every layer that has weights (convolutions and linears).
        for layer in [0, 2, 4, 6, 8, 10, 11]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class ConvNet(torch.nn.Module):
    """A small ConvNet: three stride-2 3x3 convolutions (15 channels each), then two linear layers."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(ConvNet, self).__init__()
        self.net = torch.nn.Sequential(
            # Basic ConvNet as in the 20xx years
            torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Flatten(),
            # 240 = 15 channels * 4 * 4, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(240, 60),
            torch.nn.Linear(60, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the convolutions and the first linear layer.
        for layer in [0, 2, 4, 7]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class WideConvNet(torch.nn.Module):
    """A ConvNet whose three stride-2 convolutions widen from 15 to 30 to 60 channels."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(WideConvNet, self).__init__()
        self.net = torch.nn.Sequential(
            # Basic ConvNet as in the 20xx years
            torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=30, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=30, out_channels=60, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Flatten(),
            # 4*240 = 60 channels * 4 * 4, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(4*240, 60),
            torch.nn.Linear(60, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the convolutions and the first linear layer.
        for layer in [0, 2, 4, 7]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class DeepConvNet(torch.nn.Module):
    """A ConvNet with five 3x3 convolutions (strides 1, 2, 1, 2, 2), then two linear layers."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(DeepConvNet, self).__init__()
        self.net = torch.nn.Sequential(
            # Basic ConvNet as in the 20xx years
            torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Flatten(),
            # 240 = 15 channels * 4 * 4, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(240, 60),
            torch.nn.Linear(60, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the five convolutions.
        for layer in [0, 2, 4, 6, 8]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class DeeperConvNet(torch.nn.Module):
    """A ConvNet with seven 3x3 convolutions (four of them stride 2), then two linear layers."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(DeeperConvNet, self).__init__()
        self.net = torch.nn.Sequential(
            # Basic ConvNet as in the 20xx years
            torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Flatten(),
            # 60 = 15 channels * 2 * 2, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(60, 60),
            torch.nn.Linear(60, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the seven convolutions.
        for layer in [0, 2, 4, 6, 8, 10, 12]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class DeeperBNConvNet(torch.nn.Module):
    """The seven-convolution ConvNet with batch normalization after the first three stride-2 convolutions."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
        """
        super(DeeperBNConvNet, self).__init__()
        self.net = torch.nn.Sequential(
            # Basic ConvNet as in the 20xx years
            torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            torch.nn.BatchNorm2d(15),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            torch.nn.BatchNorm2d(15),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=1),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            torch.nn.BatchNorm2d(15),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3, 3), padding=1, stride=2),
            nonlinearity(),
            torch.nn.Flatten(),
            # 60 = 15 channels * 2 * 2, i.e. this is sized for 32x32 inputs
            torch.nn.Linear(60, 60),
            torch.nn.Linear(60, classes),
        )
        self.decision = torch.nn.Softmax(dim=1)

        # Kaiming initialization for the seven convolutions; the indices skip the BatchNorm layers.
        for layer in [0, 2, 5, 7, 10, 12, 15]:
            torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu")

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class VariableBNConvNet(torch.nn.Module):
    """ConvNet with batch normal and a variable width and depth."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10, repeats=1, width=30):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
            repeats      (int): Number of convolutions per feature map size.
            width        (int): Number of channels in every convolution.
        """
        super(VariableBNConvNet, self).__init__()
        layers = []

        def add_conv(in_channels, stride, batch_norm=True):
            # One 3x3 convolution with Kaiming init, optional batch norm, then the activation.
            conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=width,
                                   kernel_size=(3, 3), padding=1, stride=stride)
            torch.nn.init.kaiming_normal_(conv.weight.data, nonlinearity="relu")
            layers.append(conv)
            if batch_norm:
                layers.append(torch.nn.BatchNorm2d(width))
            layers.append(nonlinearity())

        add_conv(3, stride=1, batch_norm=False)
        # The last conv of this stage will have stride 2, and we've already done one with stride 1
        for _ in range(repeats - 2):
            add_conv(width, stride=1)
        # Stride 2 convolution
        add_conv(width, stride=2)
        for _ in range(2):
            for _ in range(repeats - 1):
                add_conv(width, stride=1)
            # Stride 2 convolution
            add_conv(width, stride=2)

        # Final conv
        add_conv(width, stride=2, batch_norm=False)
        layers.append(torch.nn.Flatten())
        # Four stride 2 convolutions leave a 2x2 feature map for 32x32 inputs, hence 4*width
        layers.append(torch.nn.Linear(4*width, 60))
        layers.append(torch.nn.Linear(60, classes))

        self.net = torch.nn.Sequential(*layers)

        self.decision = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

class ResBlock(torch.nn.Module):
    """Simplifies using a residual block."""

    def __init__(self, nonlinearity, in_channels, out_channels, kernel_size, padding, stride, depth=None):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            in_channels  (int): Channels of the input feature map.
            out_channels (int): Channels of the output feature map.
            kernel_size  (int): Kernel size of both convolutions.
            padding      (int): Padding of both convolutions.
            stride       (int): Stride of the second convolution (the first always uses stride 1).
            depth        (float or None): Probability passed to StochasticDepth in "row" mode,
                                          or None to disable stochastic depth.
        """
        super(ResBlock, self).__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                            kernel_size=kernel_size, padding=padding, stride=1),
            torch.nn.BatchNorm2d(in_channels),
            nonlinearity(),
            torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                            kernel_size=kernel_size, padding=padding, stride=stride),
            torch.nn.BatchNorm2d(out_channels),
        )
        # Only the second convolution gets Kaiming init; the first keeps the PyTorch default.
        torch.nn.init.kaiming_normal_(self.net[3].weight.data, nonlinearity="relu")
        self.a = nonlinearity()
        # Either preserve the original input or use a 1x1 convolution to
        # increase channels or decrease dimensions. This is consistent with the original paper.
        if in_channels == out_channels and stride == 1:
            # A module rather than a lambda, so the block can still be pickled and printed.
            self.identity = torch.nn.Identity()
        else:
            self.identity = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                            kernel_size=1, padding=0, stride=stride)
        if depth is None:
            self.depth = None
        else:
            self.depth = StochasticDepth(depth, "row")

    def forward(self, x):
        """Return activation(residual branch + skip path) for the input x."""
        y = self.net(x)
        if self.depth is not None:
            # Stochastic depth is applied to the residual branch only, never to the skip path.
            y = self.depth(y)
        x_prime = self.identity(x)
        return self.a(y + x_prime)

class VariableResNet(torch.nn.Module):
    """Resnet type thing: a stem convolution, three stages of ResBlocks, and a linear classifier."""

    def __init__(self, nonlinearity=torch.nn.ReLU, classes=10, blocks=30, width=30, depth_prob=None):
        """
        Arguments:
            nonlinearity (callable): Called with no arguments to create each activation module.
            classes      (int): Number of output classes.
            blocks       (int): Controls the number of residual blocks per stage.
            width        (int): Number of channels in every convolution.
            depth_prob   (float or None): Maximum stochastic depth probability. It ramps linearly
                                          from 0 up to this value over the blocks of the last two
                                          stages. None disables stochastic depth.
        """
        super(VariableResNet, self).__init__()
        layers = []
        layers.append(torch.nn.Conv2d(in_channels=3, out_channels=width, kernel_size=(3, 3), padding=1, stride=1))
        torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu")
        layers.append(nonlinearity())
        # The last block of this stage will have stride 2, and we've already done one conv with stride 1
        for _ in range(blocks - 2):
            layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1))
        # Stride 2 convolution
        layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2))
        # Need to downscale twice more, so divide the blocks evenly between the two feature map sizes
        blocks_1 = (blocks - 2)//2
        blocks_2 = blocks - 2 - blocks_1
        # Guard the ramp's denominator so that a single stochastic block does not divide by zero.
        ramp_steps = max(blocks_1 + blocks_2 - 1.0, 1.0)

        def drop_probability(position):
            # Linear ramp from 0 at the first block to depth_prob at the last one.
            if depth_prob is None:
                return None
            return depth_prob * position / ramp_steps

        for block_idx in range(blocks_1):
            layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1, drop_probability(block_idx)))
        # Stride 2 convolution
        layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2))
        for block_idx in range(blocks_2):
            layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1, drop_probability(blocks_1 + block_idx)))
        # Stride 2 convolution
        layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2))

        # Final conv
        layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width,
                                      kernel_size=(3, 3), padding=1, stride=2))
        torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu")
        layers.append(nonlinearity())
        layers.append(torch.nn.Flatten())
        # Four stride 2 stages leave a 2x2 feature map for 32x32 inputs, hence 4*width
        layers.append(torch.nn.Linear(4*width, 60))
        layers.append(torch.nn.Linear(60, classes))

        self.net = torch.nn.Sequential(*layers)

        self.decision = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Forward through the network, returning softmax probabilities per class."""
        return self.decision(self.net(x))

def preprocess(images, order, device):
    """
    Select, normalize, and reshape flattened images for the networks.

    Arguments:
        images (numpy array): One flattened 3x32x32 image per row.
        order  (numpy array): Row indices to select, in the desired output order.
        device (torch.device): Device that the result is moved to.
    Returns:
        A float tensor of shape (len(order), 3, 32, 32) with zero mean and unit variance,
        computed over all selected values together.
    """
    preprocessed = torch.tensor(images[order]).float()
    # Convert to 0 mean and unit variance. Dividing by the variance, rather than the
    # standard deviation, would leave a variance of 1/var instead of 1.
    std, mean = torch.std_mean(preprocessed)
    # Avoid dividing by zero if every selected value is identical.
    preprocessed = (preprocessed - mean) / std.clamp_min(1e-12)

    # Split each flat row into channel, height, and width dimensions
    return preprocessed.reshape((-1, 3, 32, 32)).to(device)

def _build_model(name, total_classes, depth_prob):
    """Create the network selected by --model, or exit with a message if the name is unknown."""
    named_models = {
        "mlp": Linear,
        "conv": ConvNet,
        "wide": WideConvNet,
        "deep": DeepConvNet,
        "deeper": DeeperConvNet,
        "deeperbn": DeeperBNConvNet,
        "lenet": LeNet5,
    }
    if name in named_models:
        return named_models[name](classes=total_classes)
    if name == "res":
        return VariableResNet(classes=total_classes, depth_prob=depth_prob)
    # Any positive integer selects a VariableBNConvNet with that many repeats.
    try:
        repeats = int(name)
    except (TypeError, ValueError):
        repeats = 0
    if repeats > 0:
        return VariableBNConvNet(classes=total_classes, repeats=repeats)
    raise SystemExit(f"Unknown model type requested: {name}")


def _mean_loss(model, criterion, inputs, labels, batch_size):
    """Return the per-sample average of criterion over inputs, evaluated in mini-batches without gradients."""
    total = 0.0
    with torch.no_grad():
        for begin in range(0, inputs.size(0), batch_size):
            batch = inputs[begin:begin + batch_size]
            loss = criterion(model(batch), labels[begin:begin + batch_size])
            total += loss.item() * batch.size(0)
    return total / inputs.size(0)


def _count_matches(model, inputs, labels, batch_size, collect_failures=False):
    """
    Classify inputs in mini-batches without gradients.

    Returns:
        A tuple of the number of correct predictions and the list of misclassified
        indices (empty unless collect_failures is True).
    """
    matches = 0
    failed_indices = []
    with torch.no_grad():
        # We can't just run over everything, that takes too much memory. Chop it up.
        for begin in range(0, inputs.size(0), batch_size):
            predicted = torch.argmax(model(inputs[begin:begin + batch_size]), dim=1)
            wrong = predicted != labels[begin:begin + batch_size]
            matches += int((~wrong).sum().item())
            if collect_failures:
                # torch.where gives positions within the batch, so shift them by the batch start.
                failed_indices.extend((torch.where(wrong)[0] + begin).tolist())
    return matches, failed_indices


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cifar",
        required=True,
        help="gzipped tar archive, as from https://www.cs.toronto.edu/~kriz/cifar.html.")
    parser.add_argument(
        "--epochs",
        required=False,
        type=int,
        default=10,
        help="Number of epochs to train.")
    parser.add_argument(
        "--train_samples",
        required=False,
        type=int,
        default=50000,
        help="Number of samples to use for training.")
    # TODO: --save_mismatch is parsed but nothing below reads it yet.
    parser.add_argument(
        "--save_mismatch",
        required=False,
        type=int,
        default=0,
        help="The number of mismatches to save.")
    parser.add_argument(
        "--batch_size",
        required=False,
        type=int,
        default=8,
        help="The batch size.")
    parser.add_argument(
        "--random_seed",
        required=False,
        type=int,
        default=112,
        help="The random seed.")
    parser.add_argument(
        "--device",
        required=False,
        type=str,
        default=None,
        help="Override the automatically determined device (cuda or cpu).")
    parser.add_argument(
        "--model",
        required=False,
        type=str,
        default=None,
        help="One of mlp, conv, wide, deep, deeper, deeperbn, lenet, res, or a positive "
             "integer N for a VariableBNConvNet with N repeats.")
    parser.add_argument(
        "--error_rate",
        required=False,
        type=float,
        default=0.0,
        help="The training label error rate.")
    parser.add_argument(
        "--weight_decay",
        required=False,
        type=float,
        default=0.0,
        help="Weight decay.")
    parser.add_argument(
        "--stochastic_depth",
        required=False,
        type=float,
        default=None,
        help="Max stochastic depth.")
    parser.add_argument(
        "--flips",
        required=False,
        default=False,
        action="store_true",
        help="Enable flip preprocessing.")

    args = parser.parse_args()

    # Seed all of the random number generators for repeatability.
    # Keep in mind though, that some algorithms are nondeterministic, so this
    # doesn't guarantee fully repeatable results.
    # np.random.shuffle (used below) reads numpy's global state, so seed that rather
    # than creating a separate default_rng generator that nothing uses.
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    random.seed(args.random_seed)

    if args.device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(args.device)
    print(f"Using device: {device}")

    print("Loading data")
    X_train, Y_train, X_test, Y_test, label_names = read_cifar(args.cifar)
    unique_classes = np.unique(Y_test)
    total_classes = len(unique_classes)
    # Make sure the first class is 0. The offset applies to both label arrays, never to the images.
    class_offset = unique_classes.min()
    if class_offset != 0:
        Y_train = Y_train - class_offset
        Y_test = Y_test - class_offset

    ## If you want to save some examples
    #for i in range (100):
    #    # PIL Images expect the channel dimension last
    #    Image.fromarray(np.moveaxis(X_test[i].reshape((3, 32, 32)), 0, -1)).save(f"example_test_cifar_{i}.png")
    #print(f"Saved images have classes {[str(pos) + ':' + str(label_names[idx]) for pos, idx in enumerate(Y_test[:100])]}")

    # Create the model
    model = _build_model(args.model, total_classes, args.stochastic_depth)

    # Don't shuffle the test data, but otherwise treat it the same as the training data.
    X_test = preprocess(X_test, np.arange(X_test.shape[0]), device)
    Y_test = torch.tensor(Y_test).long().to(device)
    test_batch_size = 1000

    if args.error_rate > 0.0:
        # Insert errors into the training data at the given error rate.
        # Sample without replacement so that exactly total_errors distinct labels change.
        total_errors = min(int(args.error_rate * len(Y_train)), len(Y_train))
        to_change = random.sample(range(len(Y_train)), k=total_errors)
        # Work on a copy so the labels returned by read_cifar are not modified in place
        Y_train = Y_train.copy()
        for idx in to_change:
            # The possible wrong labels are every value but the correct one
            wrong_labels = [label for label in range(total_classes) if label != Y_train[idx]]
            Y_train[idx] = random.choice(wrong_labels)

    # Shuffle and preprocess the training data, keeping at most --train_samples examples
    order = np.arange(X_train.shape[0])
    np.random.shuffle(order)
    order = order[:args.train_samples]

    X_train = preprocess(X_train, order, device)
    Y_train = torch.tensor(Y_train[order]).long().to(device)

    model.to(device)

    # NOTE(review): the models above end in Softmax, while CrossEntropyLoss is documented
    # to take unnormalized logits, so softmax is effectively applied twice - confirm that
    # this is intended.
    if args.model == "res":
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.0002, weight_decay=args.weight_decay)
        criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0002, weight_decay=args.weight_decay)
        criterion = torch.nn.CrossEntropyLoss()

    if args.flips:
        # NOTE(review): applied to a whole batch tensor, this presumably flips every
        # image in the batch together rather than one at a time - confirm.
        tx = torchvision.transforms.Compose([
            torchvision.transforms.RandomHorizontalFlip(p=0.5)
            ])
    else:
        tx = None

    # See how many batches we'll use per epoch
    batches = int(np.ceil(X_train.shape[0]/float(args.batch_size)))

    print(f"Training on {X_train.shape[0]} examples over {batches} batches")

    print(model)
    num_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
    print(f"Model has {num_params} parameters.")

    for epoch in range(args.epochs):

        total_loss = 0.0
        model.train()
        for begin in range(0, X_train.size(0), args.batch_size):
            end = begin + args.batch_size

            X_batch = X_train[begin:end] if tx is None else tx(X_train[begin:end])
            Y_batch = Y_train[begin:end]

            # Zero gradients before gradient calculation
            optimizer.zero_grad()

            y_hat = model(X_batch)
            loss = criterion(y_hat, Y_batch)
            # Weight by the batch size since the last batch may be smaller
            total_loss += loss.item() * X_batch.size(0)

            # Gradient calculation
            loss.backward()
            # Update weights
            optimizer.step()

        epoch_loss = total_loss / X_train.size(0)
        print(f"Epoch {epoch} training loss {epoch_loss}")

        # Evaluation
        # The helpers don't calculate gradients during these steps
        model.eval()
        epoch_loss = _mean_loss(model, criterion, X_test, Y_test, test_batch_size)
        print(f"Epoch {epoch} testing loss {epoch_loss}")
        ## Accuracy values
        train_matches, _ = _count_matches(model, X_train, Y_train, test_batch_size)
        train_accuracy = train_matches / X_train.size(0)
        test_matches, _ = _count_matches(model, X_test, Y_test, test_batch_size)
        test_accuracy = test_matches / X_test.size(0)
        print(f"Epoch {epoch} accuracies are {train_accuracy} {test_accuracy}")

    # Final evaluation
    model.eval()
    # DNN classification
    sum_matches, failed_indices = _count_matches(
        model, X_test, Y_test, test_batch_size, collect_failures=True)
    test_accuracy = sum_matches / X_test.size(0)

    print(f"Final accuracy {sum_matches}/{X_test.size(0)} ({test_accuracy})")
    # Report the failures from every test batch, not only the first one
    print(f"Final failures at indices {failed_indices}")
```

---

## Review: Hidden States

* Let's say we want to predict the next observation in a sequence
* Obviously we need to understand something about what is generating the observation
* In Markov Model terminology, we need to understand the "hidden state" of the system

---

## Learning Hidden States

* We begin not knowing anything about the hidden states, and must derive them from observations
  * So let's think of that hidden state as a function of our past observations
  * Formally, a function of $O_{0:t-1}$
* Now, we likely cannot look all the way back to time 0, so we'll use a context window

---

## Here Comes ML

* That already sounds like we can make this an ML problem
  * given $f(O_{t-c:t})$ predict $O_{t+1}$
* But wait! What are the observations? Can we shove them into a neural network?

---

## Word Problems

* In natural language processing, our observations are words and punctuation
  * This is incredibly sparse, and has strange nonlinearities
  * For example, the words "Punchy" and "Judy" are related, but share only a couple of letters
  * "Sleepy" and "Creepy" share more letters, but their meanings are far apart

---

## Review: Embeddings

* So we need to convert our words into something different
* Let's choose a vector representation, called an embedding
* Notice that we can make an embedding for any "token"
  * Token here could be a word, phrase, a song in your playlist, etc

---

## Training Embeddings

* Step one of accomplishing anything is training an embedding
  * Required before learning hidden states and making predictions
* But how?
* Two methods suggested in the [paper that led to word2vec](https://arxiv.org/abs/1301.3781) are CBOW and Skip-grams

---

## CBOW Example

* *Continuous Bag of Words*
* Let's take this sentence from *Anna Karenina* and a context window of 5:

> Happy families are all alike; every unhappy family is unhappy in its own way.

* A bag of words might get words "happy families all alike"
  * It would predict "are" since that is the word most associated with the others
* If given "unhappy in own way" we would predict "its"

---

## Skip-Gram Example

> Happy families are all alike; every unhappy family is unhappy in its own way.

* Now we take a single word and predict the others
  * Given "are" we would predict, with equal probabilities, every other word
* Obviously the exact prediction will depend upon the training corpus
  * And common words, such as "is", will be associated with many other words
  * And being equally associated with many things will end up being a weak association

---

## Training

* This won't require a sophisticated model, just lots of data
  * Although these are just for illustration

```python
class SkipEmbedder(torch.nn.Module):
    """
    Skip-gram style embedder: given one token, predict a probability for every token.

    Each token index is looked up in the embedding table, projected to one
    score per vocabulary entry by a single linear layer, and normalised with a
    softmax over dimension 1.
    """

    def __init__(self, num_tokens, embedding_size):
        """
        Initialize the embedder.
        Arguments:
            num_tokens     (int): The number of unique tokens in the vocabulary.
            embedding_size (int): The number of features in each token's embedding vector.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(num_tokens, embedding_size)
        # Project a token's embedding onto one score per vocabulary entry.
        # Kept inside a Sequential so the parameter names (predictor.0.*) stay stable.
        self.predictor = torch.nn.Sequential(
                torch.nn.Linear(embedding_size, num_tokens))
        self.decision = torch.nn.Softmax(dim=1)
        # Start from small uniform weights whose range shrinks as the embedding gets wider
        torch.nn.init.uniform_(self.embedding.weight, -0.5/embedding_size, 0.5/embedding_size)

    def forward(self, observation):
        """
        Return token probabilities for each observed token.
        Arguments:
            observation (tensor): Integer token indices. The softmax runs over
                                  dimension 1, so a 1-D batch of indices yields
                                  an output of shape (batch, num_tokens).
        Returns:
            Tensor of probabilities that sum to 1 along dimension 1.
        """
        return self.decision(self.predictor(self.embedding(observation)))

class CBOWEmbedder(torch.nn.Module):
    """
    Continuous-bag-of-words embedder: predict a token from the mean of its context.

    The context tokens' embeddings are averaged, so the prediction does not
    depend on the order (or the number) of context tokens.
    """

    def __init__(self, num_tokens, embedding_size):
        """
        Initialize the embedder.
        Arguments:
            num_tokens     (int): The number of unique tokens in the vocabulary.
            embedding_size (int): The number of features in each token's embedding vector.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(num_tokens, embedding_size)
        # Project the averaged context embedding onto one score per vocabulary entry.
        # Kept inside a Sequential so the parameter names (predictor.0.*) stay stable.
        self.predictor = torch.nn.Sequential(
                torch.nn.Linear(embedding_size, num_tokens))
        self.decision = torch.nn.Softmax(dim=1)
        # Start from small uniform weights whose range shrinks as the embedding gets wider
        torch.nn.init.uniform_(self.embedding.weight, -0.5/embedding_size, 0.5/embedding_size)

    def forward(self, observation):
        """
        Return token probabilities for each context window.
        Arguments:
            observation (tensor): Integer token indices; the mean is taken over
                                  dimension 1, so the expected layout is
                                  (batch, context tokens).
        Returns:
            Tensor of shape (batch, num_tokens) whose rows sum to 1.
        """
        embeds = self.embedding(observation)
        # By taking the mean we make sentences easier to learn from embeddings
        # since they no longer have any ordering.
        bag = torch.mean(embeds, dim=1)
        return self.decision(self.predictor(bag))
```

---

## Learning Examples

* Let's learn from something familiar
  * Going to use [Romeo and Juliet, from project Gutenberg](https://www.gutenberg.org/ebooks/1513)
  * It turns out that Anna Karenina is long and uses difficult words
  * And this code is too basic for anything serious
* Note that we'll get the copyright notice at the top, just like any good model

---

## Learning N-Grams

* Let's make our target task word prediction
  * Given previous n tokens, predict the next one
  * We will learn the embedding first, and then the task
* Remember, we are just doing something to learn and test out an embedding
  * Then we'll talk about bigger concepts

---

## N-Gram Model

* Notice that increasing the context eats up memory
  * Linear layer grows by embedding_size * lin_size with each additional token

```python
class NGramPredictor(torch.nn.Module):
    """
    Next-token predictor over a fixed-length window of token embeddings.

    The embeddings of all context tokens are flattened into one vector and
    passed through a two-layer network, so the first linear layer has
    context_length * embedding_size inputs.
    """

    def __init__(self, num_tokens, embedding_size, context_length):
        """
        Initialize a predictor.
        Arguments:
            num_tokens     (int): The number of unique tokens in the vocabulary.
            embedding_size (int): The number of features in each token's embedding vector.
            context_length (int): The number of tokens of context.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(num_tokens, embedding_size)
        # Width of the hidden layer: grows with the square root of the vocabulary size
        lin_size = 6*math.ceil(math.sqrt(num_tokens))
        # Use the flattened context embeddings to predict the next token
        self.predictor = torch.nn.Sequential(
                torch.nn.Flatten(),
                torch.nn.Linear(context_length * embedding_size, lin_size),
                torch.nn.LayerNorm(lin_size),
                torch.nn.GELU(),
                torch.nn.Linear(lin_size, num_tokens))
        self.decision = torch.nn.Softmax(dim=1)
        torch.nn.init.uniform_(self.embedding.weight, -0.1, 0.1)

    def forward(self, observation):
        """
        Return next-token probabilities for each context window.
        Arguments:
            observation (tensor): Integer token indices laid out as
                                  (batch, context_length); the flattened width
                                  must match the first linear layer.
        Returns:
            Tensor of shape (batch, num_tokens) whose rows sum to 1.
        """
        embeds = self.embedding(observation)
        return self.decision(self.predictor(embeds))
```

---

## Preprocessing

* Going to use the nltk (natural language toolkit) for preprocessing
* Will also remove stage directions
  * It is also common to replace rare words with a stand-in, such as "UNK"
  * We'll keep things simple

```python
    # The corpus contains typographic apostrophes, so read it explicitly as UTF-8
    with open(args.corpus, 'r', encoding='utf-8') as file:
        text = file.read()

    # Remove stage directions from Shakespeare's corpus
    # Lines made up only of capital letters followed by a colon
    text = re.sub('^[A-Z]+:$', '', text, flags=re.MULTILINE)
    # Lines of capital letters followed by any one character.
    # NOTE(review): the '.' is unescaped, so this matches more than a trailing
    # period -- confirm whether '\.' was intended.
    text = re.sub('^[A-Z]+.$', '', text, flags=re.MULTILINE)
    # Lines that begin with " Enter"
    text = re.sub('^ Enter.*$', '', text, flags=re.MULTILINE)
    # Anything in square brackets on a single line
    text = re.sub(r'\[.*?\]', '', text, flags=re.MULTILINE)
    # Anything wrapped in triple asterisks on a single line
    text = re.sub(r'\*\*\*.*\*\*\*', '', text)

    # Tokenize the cleaned, lower-cased text (not the not-yet-defined `words`)
    words = nltk.word_tokenize(text.lower())
    flat_words = np.array(words)

    # First, count the number of unique words
    unique_tokens = np.unique(flat_words)
    num_tokens = len(unique_tokens)

    print("Converting all words to numbers.")
    # Each token's index is its position in the sorted array of unique tokens
    word_to_index = {token: idx for idx, token in enumerate(unique_tokens)}
    # Convert the words to numbers. These will look up a vector embedding in our model's embedding.
    sentence_indices = [word_to_index[word] for word in flat_words]
```

---

## Preprocessing

* Preprocessing reduces Romeo and Juliet to 4119 unique tokens
* The length of the training corpus is 35,098
  * So words are used on average less than 9 times
  * We know that a few prepositions are going to grab most of that
* This will make things challenging

---

## Pretraining

```python
    # Make a continuous bag of words model. CBOWEmbedder takes only the
    # vocabulary size and the embedding size: it averages over the context
    # axis, so it does not need to be told the window length.
    cbow_model = CBOWEmbedder(num_tokens, args.embedding_size)
    print("Building model.")
    print(cbow_model)

    optimizer = torch.optim.AdamW(cbow_model.parameters(), lr=0.001, weight_decay=args.weight_decay)
    # NOTE(review): CBOWEmbedder.forward already returns softmax probabilities,
    # while CrossEntropyLoss applies its own log-softmax and expects raw scores.
    # Confirm whether this double normalisation is intended.
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.001)

    # Pretrain the embedding with CBOW
    if args.load is None:
        cbow_model.to(device)
        cbow_model.train()

        rng = np.random.default_rng()

        # Exponential moving average of the loss, for reporting every 1000 steps
        running_loss = 0.0
        alpha = 0.01
        # Each sample is context_length tokens, the target token, then context_length more tokens
        window = 2*args.context_length + 1
        for step in range(args.steps):
            # Make a batch
            begins = rng.integers(low=0, high=len(sentence_indices) - 2*args.context_length - 1, size=args.batch_size)
            # For continuous bag of words CBOW
            # This is generally used for pretraining
            groups = [sentence_indices[begin:begin+window] for begin in begins]
            left = torch.tensor([gr[:args.context_length] for gr in groups], dtype=torch.long)
            # The label is the token in the middle of the window
            labels = torch.tensor([gr[args.context_length] for gr in groups], dtype=torch.long).to(device)
            right = torch.tensor([gr[args.context_length+1:] for gr in groups], dtype=torch.long)
            batch = torch.concatenate((left, right), dim=1).to(device)

            # Get ready to learn
            cbow_model.zero_grad()

            # Predict the middle token from the surrounding context
            y_hat = cbow_model(batch)
            loss = criterion(y_hat, labels)
            running_loss += alpha*(loss.item() - running_loss)

            # Gradient calculation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(cbow_model.parameters(), 1.0)
            # Update weights
            optimizer.step()
```

---

## Embedding Training

```python
    # Now train the NGRAM predictor
    model = NGramPredictor(num_tokens, args.embedding_size, args.context_length)
    # Use the pretrained result from the CBOW
    with torch.no_grad():
        model.embedding.weight.copy_(cbow_model.embedding.weight)
        # If we trusted the embedding, we could turn off learning
        #model.embedding.weight.requires_grad = False
    print("Building model.")
    print(model)
    if args.load is not None:
        # Restore previously trained weights instead of training
        model.load_state_dict(torch.load(args.load, weights_only=True))
        model.to(device)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=args.weight_decay/10)
        # NOTE(review): NGramPredictor.forward already returns softmax
        # probabilities, while CrossEntropyLoss applies its own log-softmax and
        # expects raw scores. Confirm whether this double normalisation is intended.
        criterion = torch.nn.CrossEntropyLoss()

        model.to(device)
        model.train()

        rng = np.random.default_rng()

        # Exponential moving average of the loss, for reporting every 1000 steps
        running_loss = 0.0
        alpha = 0.01
        for step in range(args.steps):
            # Make a batch
            begins = rng.integers(low=0, high=len(sentence_indices) - args.context_length - 1, size=args.batch_size)
            # For next word prediction
            batch = torch.tensor([sentence_indices[begin:begin+args.context_length] for begin in begins], dtype=torch.long).to(device)
            # The labels are the following token
            labels = torch.tensor([sentence_indices[begin+args.context_length] for begin in begins], dtype=torch.long).to(device)

            # Get ready to learn
            model.zero_grad()

            # Predict the next token from the context window
            y_hat = model(batch)
            loss = criterion(y_hat, labels)
            running_loss += alpha*(loss.item() - running_loss)

            # Gradient calculation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update weights
            optimizer.step()
```

---

## Copying the Embedding

* Notice that we can simply copy over the pretrained embedding
  * Remember that question about what we can do with initialization?
  * This should start us off nearer to a good minimum

```python
    # Use the pretrained result from the CBOW
    # copy_ overwrites the weights in place; no_grad keeps autograd from
    # tracking this in-place write to a trainable parameter.
    with torch.no_grad():
        model.embedding.weight.copy_(cbow_model.embedding.weight)
```

---

## Prediction

```python
    # Now try to predict something.
    # Make 5 predictions, just to see how much variety we can get.
    print(f"Initial prompt is '{args.prompt}'")

    # The prompt is the same for every sample, so tokenize and encode it once.
    initial_words = nltk.word_tokenize(args.prompt.lower())
    unknown_words = [word for word in initial_words if word not in word_to_index]
    if unknown_words:
        # Only tokens seen in the training corpus have an index
        raise KeyError(f"Prompt words missing from the training vocabulary: {unknown_words}")
    # Convert prompt words to token values
    initial_tokens = [word_to_index[word] for word in initial_words]

    if len(initial_tokens) < args.context_length:
        # Left-pad short prompts with '.' tokens so the model always gets a full context window
        padding = [word_to_index['.']] * (args.context_length - len(initial_tokens))
        initial_tokens = padding + initial_tokens

    # Inference only: switch to eval mode once
    model.eval()

    for _ in range(5):
        prompt_tokens = list(initial_tokens)

        # No gradients are needed while sampling
        with torch.no_grad():
            for _step in range(20):
                # Create a context window of the most recent tokens
                X = torch.tensor([prompt_tokens[-args.context_length:]]).to(device)

                # Find the next token probabilities and sample from them
                y_hat = model(X)
                next_word_index = np.random.choice(np.arange(len(unique_tokens)), p=y_hat[0].cpu().numpy())
                prompt_tokens.append(next_word_index)
        prompt_words = [unique_tokens[idx] for idx in prompt_tokens]
        # Clean up the tokens.
        for idx in range(len(prompt_words)):
            # Attach punctuation to the preceding word
            if idx+1 < len(prompt_words) and prompt_words[idx+1] in ['.', '!', '?', ';', ',']:
                punctuation = prompt_words[idx+1]
                prompt_words[idx] = prompt_words[idx] + punctuation
                prompt_words[idx+1] = ''
                # Make the word after a sentence ending upper case.
                if punctuation in ['.', '?', '!'] and idx+2 < len(prompt_words):
                    prompt_words[idx+2] = prompt_words[idx+2][:1].upper() + prompt_words[idx+2][1:]
            # Rejoin a word, an apostrophe, and a single trailing letter (e.g. juliet ’ s)
            if idx+2 < len(prompt_words) and prompt_words[idx+1] == "’" and len(prompt_words[idx+2]) == 1:
                prompt_words[idx] = prompt_words[idx] + prompt_words[idx+1] + prompt_words[idx+2]
                prompt_words[idx+1] = ''
                prompt_words[idx+2] = ''

        # Emptied slots leave runs of spaces behind; collapse them
        print(re.sub(r' +', ' ', ' '.join(prompt_words)))
```

---

## Note About Training

* The loss will look bad
  * In fact, the model will be mostly wrong
  * Of course! This is language, guessing what word comes next will often fail
* But, as long as the predictions are reasonable we are doing okay

---

## Sanity Test

* If you want to be sure that things are working, begin with a subset of the training corpus
* This model will learn a few hundred lines perfectly, regurgitating everything in the original
  * (assuming a large enough context window)
* With that, we can proceed with actual training

---

## Results

```
Initial prompt is 'do you know the cause'
do you know the cause? I will not thy lips. O, my romeo, friar lawrence. Romeo, alack the day
do you know the cause? I have you love, that i must to my thee. Scene ii. Friar lawrence’s
do you know the cause? I will not thy lips. Cheeks, and lie with her, but let me take it.
do you know the cause? I will you and to bed. Ah, juliet’s to; for i will not away
do you know the cause? I would you are? What, shall i groan! Romeo, and would i can not love
```

---

## More Results

```
Initial prompt is 'into this bed of death'
into this bed of death? They romeo in juliet? I will not thy lips. Put thee in, and, and bid
into this bed of death, they romeo in juliet? I will not away. What, the here? I will not well
into this bed of death, they romeo in juliet? I will not away. What, the pox of such a lisping,
into this bed of death? They romeo, friar lawrence. Romeo, go you to bed. Ah, the’s my
into this bed of death, they romeo in juliet? I will not away. What, the here i s. Go and
```

---

## Preposition Problems

* What is most likely after "of"?
  * Romeo, it turns out

```
Initial prompt is 'into this bed of'
. into this bed of romeo? No, not he that tybalt is’s to bed. Ah, juliet’s to
. into this bed of romeo? No, i he not his the that of must, and convoy in the fearful of do
. into this bed of enter? Where here comes romeo? Friar lawrence. I hear some noise. If i come, ,
. into this bed of romeo? No, i he not his the tybalt of the, and i will give me occasion.
. into this bed of awhile, for romeo and in a montague, the only lacks a cover : the fish of if thou
```

---

## Embedding Influence

* The word "descend" is associated with "death" here

```
Initial prompt is 'descend into this bed of'
descend into this bed of death, they romeo in juliet, i will; for it is wisely, and; i come,
descend into this bed of death? They romeo, friar lawrence. Go with me to the fresh i will give you, sir
descend into this bed of death? They romeo, friar lawrence. Ay, forsooth. Well, he may chance to do some
descend into this bed of death? They romeo, friar lawrence. Romeo, there? What is it in that? I will
descend into this bed of death, they romeo in juliet, he is there lies, tybalt. First and, and you will
```

---

## Embedding Influence

* The other words may not even matter

```
Initial prompt is 'descend into this rose of'
descend into this rose of death, and romeo in juliet, he is it is my man, but i ’ ll be hanged
descend into this rose of death, and romeo in juliet, he is not, o, she was well? She’s
descend into this rose of death, and romeo in juliet, he is not, spited, grief you shall be a with sweet
descend into this rose of death, and romeo in juliet, he is not away. What, the pox of such a lisping
descend into this rose of death, and romeo in juliet, he is it is my man, but i ’ ll be hanged
```

---

## Good Embeddings

* Suppose we had a good embedding
  * We should be able to do something cool, like math on concepts in embedded space
  * $count - man + woman \approx countess$
  * $Paris - France + Italy \approx Rome$
* And what does it take to get such good embeddings?
  * Just a dataset of hundreds of billions of words

---

## Comparison With Images

* The embedding here is similar to the initial large convolution in ResNext
  * But learning the embedding is more troublesome
* Obviously everything afterwards depends upon it

---

## Improving Embeddings

* Training just the embedding is difficult
* But we've had good methods for more than 10 years
  * For an example, see [word2vec](https://arxiv.org/abs/1301.3781)
* And now you can download an embedding
  * Here are a few: [https://fasttext.cc/docs/en/crawl-vectors.html](https://fasttext.cc/docs/en/crawl-vectors.html)

---

## Examples

* Here are some "nearest neighbors" from a decent embedding

```
Query word? countess
Countess 0.786198
baroness 0.749479
marchioness 0.743028
duchess 0.741611
viscountess 0.736536
countesses 0.7086
marquess 0.674638
lady-in-waiting 0.653861
noblewoman 0.652758
nobleman 0.646855
Query word? count
counts 0.812811
counted 0.7097
counting 0.685638
count. 0.628533
Count 0.603085
count.And 0.597393
count.The 0.585259
count.I 0.580847
count.If 0.578759
count.So 0.56037
Query word? Bob
Jim 0.785597
Dave 0.734101
Mike 0.727408
Doug 0.710619
Tom 0.703408
.Bob 0.683471
Steve 0.67823
Ted 0.677032
Gary 0.674185
Greg 0.668247
```

---

## Embedding Math

* We can try to play with the embeddings a bit

```python
import io

import torch

def load_vectors(fname):
    """
    Load word vectors from a space-separated text file.

    Each line holds a token followed by its vector components. Lines with no
    components (such as blank lines) are skipped.

    Arguments:
        fname (str): Path of the vector file to read.
    Returns:
        dict mapping each token to a 1-D float32 tensor of its components.
    """
    data = {}
    # Undecodable bytes are dropped instead of aborting the load
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header parsing is left disabled, as in the original listing:
        #n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            if len(tokens) < 2:
                # No vector components on this line
                continue
            data[tokens[0]] = torch.tensor([float(token) for token in tokens[1:]], dtype=torch.float32)
    return data

def _report(label, lhs, rhs):
    """Print the cosine similarity of two 1-D vectors under the given label."""
    print(f"{label} cosine similarity: {torch.cosine_similarity(lhs, rhs, dim=0)}")


data = load_vectors("16_king_queen_vectors.txt")
print(data)

# Gendered pairs
_report("king & queen", data['king'], data['queen'])
_report("duke & duchess", data['duke'], data['duchess'])
_report("man & woman", data['man'], data['woman'])
_report("king & man", data['king'], data['man'])
_report("queen & woman", data['queen'], data['woman'])

# Analogy arithmetic: swap "man" for "woman" and compare with the expected word
_report("(king-man+woman) & queen", data['king']-data['man']+data['woman'], data['queen'])
_report("(duke-man+woman) & duchess", data['duke']-data['man']+data['woman'], data['duchess'])

# Function words and unrelated nouns, for contrast
_report("the & duchess", data['the'], data['duchess'])
_report("the & of", data['the'], data['of'])
_report("cat & king", data['cat'], data['king'])
_report("cat & queen", data['cat'], data['queen'])
_report("cat & dog", data['cat'], data['dog'])
```

---

## Outputs

```
king & queen cosine similarity: 0.7068513631820679
duke & duchess cosine similarity: 0.7073330879211426
man & woman cosine similarity: 0.7658399939537048
king & man cosine similarity: 0.3418329954147339
queen & woman cosine similarity: 0.36072516441345215
(king-man+woman) & queen cosine similarity: 0.6542676091194153
(duke-man+woman) & duchess cosine similarity: 0.641383707523346
the & duchess cosine similarity: 0.09982849657535553
the & of cosine similarity: 0.5719975233078003
cat & king cosine similarity: 0.2159331738948822
cat & queen cosine similarity: 0.2383471429347992
cat & dog cosine similarity: 0.707862377166748
```

---

## Implications

* Notice that the embeddings must be capturing both parts of speech and meaning
  * 'the' and 'duchess' are not close, but 'the' and 'of' are
* But not all nouns are close
  * 'cat' and 'dog' are used interchangeably, but not 'cat' and 'queen'
* Math works better than noise
  * Really great embeddings should produce more consistency

---

## Improvements

* So why is our embedding mediocre?
* We should really be applying techniques to improve our training
* At least
  * Subsampling
  * Negative sampling
* And we need a dataset of a billion tokens

---

## Subsampling

* When training CBOW or Skip-grams, the ordering of words doesn't matter
* Since structure has been tossed out of the window, we can also toss the over-common words
  * So if "of" shows up too often, just drop it
  * The sentences will be mangled, but training the embedding will improve without so much redundant data
* We can also search for common phrases and combine them into single tokens

---

## Negative Sampling

* Notice that the model predicts a single word out of thousands, and learns wrong guesses quickly
  * So the majority of the math in the loss function is pushing the same weights to 0 repeatedly
* We can change the loss to only look at the target word and $K$ negative samples
  * Penalize the model for not predicting the target word with probability 1
  * Penalize the model for not predicting the negative samples with probability 0
    * All in log space, of course

---

## Using The Embedding

* Let's say we bothered to get everything required to train a great embedding
  * What next?
* The embedding itself only tells us what words are related to other words
* Our simple linear n-gram predictor needs more context to make good predictions
  * But longer context windows will take huge amounts of memory

---

## Naive Linear Network

* We might want to make our N-gram predictor into a deeper network and give it a larger context window
* The weights of the first layer grow linearly with context length, but adding a hidden layer will cause weights to grow with the square of the context length

```python
        self.predictor = torch.nn.Sequential(
                # Merge the per-token embeddings into one feature vector per sample
                torch.nn.Flatten(),
                # This layer holds context_length * embedding_size * lin_size weights,
                # so it grows with every extra token of context
                torch.nn.Linear(context_length * embedding_size, lin_size),
                torch.nn.LayerNorm(lin_size),
                torch.nn.GELU(),
                # One output score per token in the vocabulary
                torch.nn.Linear(lin_size, num_tokens))
```

---

## Recurrent Networks

* This problem motivates the development of more interesting models
* One powerful idea is that of a *recurrent* neural network
  * That means a network remembers its own hidden state

---

## RNN

* What if we use each new embedding to modify the current hidden state?
* The RNN is a network that outputs both a prediction and a new hidden state
* The hidden state is then fed back to the RNN at the next time step

---

## RNN

* If the hidden state has enough information, we can predict the next token from it directly
* This has the advantage that the hidden state theoretically contains all past history
  * But at each step, we only forward through a small network

---

## Problems

* But let's look at this diagram
* During training, we need to backpropagate through the same set of weights multiple times
  * The network may be smaller, but we will still run into gradient problems

---

## Solutions

* But we know solutions to train deep neural networks!
* ResNets added a residual to the current feature map
  * Learn $f(X) = X + H(X)$, where H is a simpler function
  * Does addition instead of multiplication, so gradients are more controlled

---

## Replacing Multiplication

* So we could change the update of the hidden state to use additions
  * You might think that ResNet work contributed something back to NLP
    * Nope!
  * This technique was introduced [in 1997](https://ieeexplore.ieee.org/abstract/document/6795963)!
* LSTM: Long Short-Term Memory

---

## LSTMs

* Replaces the hidden state multiplicative update with an addition
* Also makes a gating function, using Tanh
  * Either retain the current hidden state
  * Or update it with an addition of a function of the input token
  * $s = s + gy$
* $g$ selects whether or not, and how much, of the update $y$ to use

---

## LSTM Diagram

* As originally described by Hochreiter and Schmidhuber

---

## Context

* The linear neural network always had context because individual weights corresponded to a token at a particular position
* The recurrent, including LSTM, networks must learn to encode that into their hidden states
  * This makes long-term dependencies difficult to learn
  * Akin to remembering a character from chapter 1 who isn't mentioned again until chapter 10

---

## Attention

* Researchers tried to simplify the task, adding shortcut paths to ease training
* This was originally called [alignment or soft-search](https://arxiv.org/abs/1409.0473) but later was called attention
* The [Attention is All You Need](https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html) paper did away with the recurrent network, showing that attention with a *transformer architecture* could do the heavy lifting

---

## Long-Term Memory

* Hopefully your long-term memories are working
  * Because we'll have to pick this up next week
* Don't forget that we'll have a quiz next Thursday
* We've covered topics from residual networks, and we've introduced embeddings
  * Don't forget all of the ConvNeXt stuff because we switched topics