# CS 462 - Lecture 16 ## Embeddings and Language Models Bernhard Firner 2026-03-26 --- ## Review * Forgot to post the code for CIFAR10 training: ```python import argparse import os import numpy as np import random from PIL import Image import torch import torchvision import tarfile from torchvision.ops.stochastic_depth import StochasticDepth import pickle def unpickle(file_obj): with open(file, 'rb') as fo: dict = pickle.load(fo, encoding='bytes') return dict def read_cifar(file_path): """ The tar.gz file should contain the following files: readme.html batches.meta data_batch_1 data_batch_2 data_batch_3 data_batch_4 data_batch_5 test_batch This function returns a tuple of training data, training labels, testing data, testing labels, and label names. """ with tarfile.open(file_path, 'r:gz') as f: # If you have limited RAM, then extracting these one at a time would be better dict1 = pickle.load(f.extractfile('cifar-10-batches-py/data_batch_1'), encoding='bytes') print(f"keys are {list(dict1.keys())}") dict2 = pickle.load(f.extractfile('cifar-10-batches-py/data_batch_2'), encoding='bytes') dict3 = pickle.load(f.extractfile('cifar-10-batches-py/data_batch_3'), encoding='bytes') dict4 = pickle.load(f.extractfile('cifar-10-batches-py/data_batch_4'), encoding='bytes') dict5 = pickle.load(f.extractfile('cifar-10-batches-py/data_batch_5'), encoding='bytes') test_dict = pickle.load(f.extractfile('cifar-10-batches-py/test_batch'), encoding='bytes') metadata = pickle.load(f.extractfile('cifar-10-batches-py/batches.meta'), encoding='bytes') training_data = np.concatenate((dict1[b'data'], dict2[b'data'], dict3[b'data'], dict4[b'data'], dict5[b'data'])) training_labels = np.concatenate((dict1[b'labels'], dict2[b'labels'], dict3[b'labels'], dict4[b'labels'], dict5[b'labels'])) # This should already be a numpy array of type uint8 test_data = test_dict[b'data'] test_labels = np.array(test_dict[b'labels']) label_names = metadata[b'label_names'] return training_data, training_labels, test_data, test_labels, label_names class Linear(torch.nn.Module): """A linear neural network.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(Linear, self).__init__() self.net = torch.nn.Sequential( torch.nn.Flatten(), torch.nn.Linear(3*32*32, 512), nonlinearity(), torch.nn.Linear(512, 120), nonlinearity(), torch.nn.Linear(120, 84), torch.nn.Linear(84, classes) ) self.decision = torch.nn.Softmax(dim=1) for layer in [1, 3, 5]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class LeNet5(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(LeNet5, self).__init__() self.net = torch.nn.Sequential( # 5x5 convolution with 6 output feature maps torch.nn.Conv2d(3, 6, 5, padding=2), # 2x2 subsampling learned bias and weight, called S2 in the paper. # We'll use an average pool and then a 1x1 conv with 6 groups to emulate that. torch.nn.AvgPool2d(kernel_size=2, stride=2), torch.nn.Conv2d(6, 6, kernel_size=1, groups=6), nonlinearity(), # 5x5 convolution with 6 output feature maps of size 5x5 torch.nn.Conv2d(6, 16, kernel_size=5), # This again, emulating layer S4 from the paper. torch.nn.AvgPool2d(kernel_size=2, stride=2), torch.nn.Conv2d(16, 16, kernel_size=1, groups=16), nonlinearity(), # The final convolution reduces features to 1x1 if this is 28x28, 2x2 if 32x32 torch.nn.Conv2d(16, 120, kernel_size=5, stride=1), torch.nn.Flatten(), torch.nn.Linear(480, 84), # Training with cross entropy, not the exemplar method in the original LeNet torch.nn.Linear(84, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 4, 6, 8, 10, 11]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class ConvNet(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(ConvNet, self).__init__() self.net = torch.nn.Sequential( # Basic ConvNet as in the 20xx years torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Flatten(), torch.nn.Linear(240, 60), torch.nn.Linear(60, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 4, 7]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class WideConvNet(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(WideConvNet, self).__init__() self.net = torch.nn.Sequential( # Basic ConvNet as in the 20xx years torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=30, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=30, out_channels=60, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Flatten(), torch.nn.Linear(4*240, 60), torch.nn.Linear(60, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 4, 7]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class DeepConvNet(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(DeepConvNet, self).__init__() self.net = torch.nn.Sequential( # Basic ConvNet as in the 20xx years torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Flatten(), torch.nn.Linear(240, 60), torch.nn.Linear(60, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 4, 6, 8]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class DeeperConvNet(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(DeeperConvNet, self).__init__() self.net = torch.nn.Sequential( # Basic ConvNet as in the 20xx years torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), #torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Flatten(), torch.nn.Linear(60, 60), torch.nn.Linear(60, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 4, 6, 8, 10, 12]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class DeeperBNConvNet(torch.nn.Module): """A mostly faithful recreation of LeNet 5.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10): super(DeeperBNConvNet, self).__init__() self.net = torch.nn.Sequential( # Basic ConvNet as in the 20xx years torch.nn.Conv2d(in_channels=3, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=1), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), torch.nn.BatchNorm2d(15), nonlinearity(), torch.nn.Conv2d(in_channels=15, out_channels=15, kernel_size=(3,3), padding=1, stride=2), nonlinearity(), torch.nn.Flatten(), torch.nn.Linear(60, 60), torch.nn.Linear(60, classes), ) self.decision = torch.nn.Softmax(dim=1) for layer in [0, 2, 5, 7, 10, 12, 15]: torch.nn.init.kaiming_normal_(self.net[layer].weight.data, nonlinearity="relu") def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class VariableBNConvNet(torch.nn.Module): """ConvNet with batch normal and a variable width and depth.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10, repeats=1, width=30): super(VariableBNConvNet, self).__init__() layers = [] layers.append(torch.nn.Conv2d(in_channels=3, out_channels=width, kernel_size=(3,3), padding=1, stride=1)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(nonlinearity()) # The layer conv will have stride 2, and we've already done one with stride 1 for _ in range(repeats - 2): layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=1)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(torch.nn.BatchNorm2d(width)) layers.append(nonlinearity()) # Stride 2 convolution layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=2)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(torch.nn.BatchNorm2d(width)) layers.append(nonlinearity()) for _ in range(2): for _ in range(repeats - 1): layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=1)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(torch.nn.BatchNorm2d(width)) layers.append(nonlinearity()) # Stride 2 convolution layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=2)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(torch.nn.BatchNorm2d(width)) layers.append(nonlinearity()) # Final conv layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=2)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(nonlinearity()) layers.append(torch.nn.Flatten()) layers.append(torch.nn.Linear(4*width, 60)) layers.append(torch.nn.Linear(60, classes)) self.net = torch.nn.Sequential(*layers) self.decision = torch.nn.Softmax(dim=1) def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat class ResBlock(torch.nn.Module): """Simplifies using a residual block.""" def __init__(self, nonlinearity, in_channels, out_channels, kernel_size, padding, stride, depth=None): super(ResBlock, self).__init__() self.net = torch.nn.Sequential( torch.nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, padding=padding, stride=1), torch.nn.BatchNorm2d(in_channels), nonlinearity(), torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride), torch.nn.BatchNorm2d(out_channels), ) torch.nn.init.kaiming_normal_(self.net[3].weight.data, nonlinearity="relu") self.a = nonlinearity() # Either preserve the original input or use a 1x1 convolution to # increase channels or decrease dimensions. This is consistent with the original paper. if in_channels == out_channels and stride == 1: self.identity = lambda x: x else: self.identity = torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, padding=0, stride=stride) if depth is None: self.depth = None else: self.depth = StochasticDepth(depth, "row") def forward(self, x): y = self.net(x) if self.depth is not None: y = self.depth(y) x_prime = self.identity(x) return self.a(y + x_prime) class VariableResNet(torch.nn.Module): """Resnet type thing.""" def __init__(self, nonlinearity = torch.nn.ReLU, classes=10, blocks=30, width=30, depth_prob=None): super(VariableResNet, self).__init__() layers = [] layers.append(torch.nn.Conv2d(in_channels=3, out_channels=width, kernel_size=(3,3), padding=1, stride=1)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(nonlinearity()) # The layer conv will have stride 2, and we've already done one with stride 1 for _ in range(blocks - 2): layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1)) # Stride 2 convolution layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2)) # Need to downscale twice more, so divide the blocks evenly between the two feature map sizes blocks_1 = (blocks - 2)//2 blocks_2 = blocks - 2 - blocks_1 for block_idx in range(blocks_1): if depth_prob is not None: idx_prob = 0.4 * block_idx / (blocks_1 + blocks_2 - 1.0) layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1, idx_prob)) else: layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1)) # Stride 2 convolution layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2)) for block_idx in range(blocks_2): if depth_prob is not None: idx_prob = 0.4 * (blocks_1 + block_idx) / (blocks_1 + blocks_2 - 1.0) layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1, idx_prob)) else: layers.append(ResBlock(nonlinearity, width, width, 3, 1, 1)) # Stride 2 convolution layers.append(ResBlock(nonlinearity, width, width, 3, 1, 2)) # Final conv layers.append(torch.nn.Conv2d(in_channels=width, out_channels=width, kernel_size=(3,3), padding=1, stride=2)) torch.nn.init.kaiming_normal_(layers[-1].weight.data, nonlinearity="relu") layers.append(nonlinearity()) layers.append(torch.nn.Flatten()) layers.append(torch.nn.Linear(4*width, 60)) layers.append(torch.nn.Linear(60, classes)) self.net = torch.nn.Sequential(*layers) self.decision = torch.nn.Softmax(dim=1) def forward(self, x): """Forward through the network.""" y_hat = self.decision(self.net(x)) return y_hat def preprocess(images, order, device): # Normalize and then add a batch dimension # This will be required for convolutions, although it doesn't matter for the linear network. preprocessed = torch.tensor(images[order]).float() # Convert to 0 mean and unit variance var, mean = torch.var_mean(preprocessed) preprocessed = (preprocessed - mean) / var # Add a channel dimension return preprocessed.reshape((-1, 3, 32, 32)).to(device) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--cifar", required=True, help="gzipped tar archive, as from https://www.cs.toronto.edu/~kriz/cifar.html.") parser.add_argument( "--epochs", required=False, type=int, default=10, help="Number of epochs to train.") parser.add_argument( "--train_samples", required=False, type=int, default=50000, help="Number of samples to use for training.") parser.add_argument( "--save_mismatch", required=False, type=int, default=0, help="The number of mismatches to save.") parser.add_argument( "--batch_size", required=False, type=int, default=8, help="The batch size.") parser.add_argument( "--random_seed", required=False, type=int, default=112, help="The random seed.") parser.add_argument( "--device", required=False, type=str, default=None, help="Override the automatically determined device (cuda or cpu).") parser.add_argument( "--model", required=False, type=str, default=None, help="mlp (multi-layer perceptron or conv") parser.add_argument( "--error_rate", required=False, type=float, default=0.0, help="The training label error rate.") parser.add_argument( "--weight_decay", required=False, type=float, default=0.0, help="Weight decay.") parser.add_argument( "--stochastic_depth", required=False, type=float, default=None, help="Max stochastic depth.") parser.add_argument( "--flips", required=False, default=False, action="store_true", help="Enable flip preprocessing.") args = parser.parse_args() # Seed all of the random number generators for repeatability. # Keep in mind though, that some algorithms are nondeterministic, so this # doesn't guarantee fully repeatable results. np.random.default_rng(args.random_seed) torch.manual_seed(args.random_seed) random.seed(args.random_seed) if args.device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") else: device = torch.device(args.device) print(f"Using device: {device}") print("Loading data") X_train, Y_train, X_test, Y_test, label_names = read_cifar(args.cifar) unique_classes = np.unique(Y_test) total_classes = len(unique_classes) # Make sure the first class is 0 if min(unique_classes) != 0: Y_train = Y_train - min(unique_classes) X_train = X_train - min(unique_classes) ## If you want to save some examples #for i in range (100): # # PIL Images expect the channel dimension last # Image.fromarray(np.moveaxis(X_test[i].reshape((3, 32, 32)), 0, -1)).save(f"example_test_cifar_{i}.png") #print(f"Saved images have classes {[str(pos) + ':' + str(label_names[idx]) for pos, idx in enumerate(Y_test[:100])]}") # Create the model if args.model == "mlp": model = Linear(classes=total_classes) elif args.model == "conv": model = ConvNet(classes=total_classes) elif args.model == "wide": model = WideConvNet(classes=total_classes) elif args.model == "deep": model = DeepConvNet(classes=total_classes) elif args.model == "deeper": model = DeeperConvNet(classes=total_classes) elif args.model == "deeperbn": model = DeeperBNConvNet(classes=total_classes) elif args.model == "lenet": model = LeNet5(classes=total_classes) elif args.model == "res": model = VariableResNet(classes=total_classes, depth_prob=args.stochastic_depth) elif int(args.model) > 0: model = VariableBNConvNet(classes=total_classes, repeats=int(args.model)) else: print(f"Unknown model type requested: {args.model}") os.quit() # Don't shuffle the test data, but otherwise treat it the same as the training data. X_test = preprocess(X_test, np.arange(X_test.shape[0]), device) Y_test = torch.tensor(Y_test).long().to(device) test_batch_size = 1000 if args.error_rate > 0.0: # Insert errors into the training data at the given error rate total_errors = int(args.error_rate * len(Y_train)) to_change = random.choices(np.arange(len(Y_train)), k=total_errors) possible_labels = [] for original in np.arange(10): # The possible wrong labels are every value but the correct one possible_labels.append(list(np.arange(original)) + list(np.arange(original+1, 10))) # This is read only, so make a writeable copy Y_train = Y_train.copy() for idx in to_change: original = Y_train[idx] Y_train[idx] = random.choice(possible_labels[original]) # Shuffle and preprocess the training data order = np.arange(X_train.shape[0]) np.random.shuffle(order) X_train = preprocess(X_train, order, device) Y_train = torch.tensor(Y_train[order]).long().to(device) # Are we doing training, or just reloading? model.to(device) if args.model == "res": optimizer = torch.optim.AdamW(model.parameters(), lr=0.0002, weight_decay=args.weight_decay) criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1) else: optimizer = torch.optim.Adam(model.parameters(), lr=0.0002, weight_decay=args.weight_decay) criterion = torch.nn.CrossEntropyLoss() if args.flips: tx = torchvision.transforms.Compose([ torchvision.transforms.RandomHorizontalFlip(p=0.5) ]) else: tx = None # See how many batches we'll use per epoch batches = int(np.ceil(X_train.shape[0]/float(args.batch_size))) # We could just do this in one step, but let's assume that memory is finite test_batches = int(np.ceil(X_test.shape[0]/float(test_batch_size))) print(f"Training on {X_train.shape[0]} examples over {batches} batches") print(model) num_params = sum(param.numel() for param in model.parameters() if param.requires_grad) print(f"Model has {num_params} parameters.") for epoch in range(args.epochs): total_loss = 0.0 model.train() for batch in range(batches): begin = batch*args.batch_size end = (batch+1)*args.batch_size if tx is None: X_batch = X_train[begin:end] else: X_batch = tx(X_train[begin:end]) Y_batch = Y_train[begin:end] # Zero gradients before gradient calculation optimizer.zero_grad() y_hat = model(X_batch) loss = criterion(y_hat, Y_batch) total_loss += loss.item() * X_batch.size(0) # Gradient calculation loss.backward() # Update weights optimizer.step() epoch_loss = total_loss / X_train.size(0) print(f"Epoch {epoch} training loss {epoch_loss}") # Evaluation # Don't calculate gradients during these steps model.eval() with torch.no_grad(): total_loss = 0.0 for batch in range(test_batches): begin = batch*test_batch_size end = (batch+1)*test_batch_size X_batch = X_test[begin:end] Y_batch = Y_test[begin:end] y_hat = model(X_batch) loss = criterion(y_hat, Y_batch) total_loss += loss.item() * X_batch.size(0) epoch_loss = total_loss / X_test.size(0) print(f"Epoch {epoch} testing loss {epoch_loss}") ## Accuracy values # We can't just run over everything, that takes too much memory. Chop it up. matches = 0 mismatches = 0 for testbatch in range(int(np.ceil(X_train.shape[0]/float(test_batch_size)))): begin = testbatch*test_batch_size end = (testbatch+1)*test_batch_size y_hat = model(X_train[begin:end]) classes = torch.argmax(y_hat, dim=1) matches += torch.sum(classes == Y_train[begin:end]) mismatches += torch.sum(classes != Y_train[begin:end]) train_accuracy = matches/X_train.size(0) matches = 0 mismatches = 0 for testbatch in range(int(np.ceil(X_test.shape[0]/float(test_batch_size)))): begin = testbatch*test_batch_size end = (testbatch+1)*test_batch_size y_hat = model(X_test[begin:end]) classes = torch.argmax(y_hat, dim=1) matches += torch.sum(classes == Y_test[begin:end]) mismatches += torch.sum(classes != Y_test[begin:end]) test_accuracy = matches/X_test.size(0) print(f"Epoch {epoch} accuracies are {train_accuracy} {test_accuracy}") # Final evaluation model.eval() with torch.no_grad(): # DNN classification matches = 0 mismatches = 0 failed_indices = [] for testbatch in range(int(np.ceil(X_test.shape[0]/float(test_batch_size)))): begin = testbatch*test_batch_size end = (testbatch+1)*test_batch_size y_hat = model(X_test[begin:end]) classes = torch.argmax(y_hat, dim=1) matches += torch.sum(classes == Y_test[begin:end]) mismatches += torch.sum(classes != Y_test[begin:end]) failed_indices.append([begin + idx for idx in torch.where(classes != Y_test[begin:end])]) sum_matches = matches test_accuracy = sum_matches/X_test.size(0) print(f"Final accuracy {sum_matches}/{X_test.size(0)} ({test_accuracy})") print(f"Final failures at indices {failed_indices[0][0].tolist()}") ``` --- ## Review: Hidden States * Let's say we want to predict the next observation in a sequence * Obviously we need to understand something about what is generating the observation * In Markov Model terminology, we need to understand the "hidden state" of the system --- ## Learning Hidden States * We begin not knowing anything about the hidden states, and must derive them from observations * So let's think of that hidden state as a function of our past observations * $O_{0:t-1}$, in formally * Now, we likely cannot look all the way back to time 0, so we'll use a context window --- ## Here Comes ML * That already sounds like we can make this an ML problem * given $f(O_{t-c:t})$ predict $O_{t+1}$ * But wait! What are the observations? Can we shove them into a neural network? --- ## Word Problems * In natural language processing, our observations are words and punctuation * This is incredibly sparse, and has strange nonlinearities * For example, the words "Punchy" and "Judy" are related, but share only a couple of letters * "Sleepy" and "Creepy" share more letters, but their meanins are far apart --- ## Review: Embeddings * So we need to convert our words into something different * Let's choose a vector representation, called an embedding * Notice that we can make embedding to any "token" * Token here could be a word, phrase, a song in your playlist, etc --- ## Training Embeddings * Step one of accomplishing anything is training an embedding * Required before learning hidden states and making predictions * But how? * Two methods suggested in the [paper that lead to word2vec](https://arxiv.org/abs/1301.3781) are CBOW and Skip-grams --- ## CBOW Example * *Continuous Bag of Words* * Let's take this sentence from *Anna Karenina* and a context window of 5: > Happy families are all alike; every unhappy family is unhappy in its own way. * A bag of words might get words "happy families all alike" * It would predict "are" since that is the word most associated with the others * If given "unhappy in own way" we would predict "its" --- ## Skip-Gram Example > Happy families are all alike; every unhappy family is unhappy in its own way. * Now we take a single word and predict the others * Given "are" we would predict, with equal probabilities every other word * Obviously the exact prediction will depend upon the training corpus * And common words, such as "is", will be associated with many other words * And being equally associated with many things will end up being a weak association --- ## Training * This won't require a sophisticated model, just lots of data * Although these are just for illustration ```python class SkipEmbedder(torch.nn.Module): def __init__(self, num_tokens, embedding_size): """ Initialize a predictor. Arguments: num_tokens (int): The number of unique tokens in the vocabulary. embedding_size (int): The number of features to represent the hidden state. """ super(SkipEmbedder, self).__init__() self.embedding = torch.nn.Embedding(num_tokens, embedding_size) # Use the hidden state to predict the next token self.predictor = torch.nn.Sequential( torch.nn.Linear(embedding_size, num_tokens)) self.decision = torch.nn.Softmax(dim=1) self.embedding.weight.data.uniform_(-0.5/embedding_size, 0.5/embedding_size) def forward(self, observation): return self.decision(self.predictor(self.embedding(observation))) class CBOWEmbedder(torch.nn.Module): def __init__(self, num_tokens, embedding_size): """ Initialize a predictor. Arguments: num_tokens (int): The number of unique tokens in the vocabulary. embedding_size (int): The number of features to represent the hidden state. """ super(CBOWEmbedder, self).__init__() self.embedding = torch.nn.Embedding(num_tokens, embedding_size) # Use the hidden state to predict the next token self.predictor = torch.nn.Sequential( torch.nn.Linear(embedding_size, num_tokens)) self.decision = torch.nn.Softmax(dim=1) self.embedding.weight.data.uniform_(-0.5/embedding_size, 0.5/embedding_size) def forward(self, observation): embeds = self.embedding(observation) # By taking the mean we make sentences easier to learn from embeddings # since they no longer have any ordering. return self.decision(self.predictor(torch.mean(embeds, dim=1))) ``` --- ## Learning Examples * Let's learn from something familiar * Going to use [Romeo and Juliet, from project Gutenberg](https://www.gutenberg.org/ebooks/1513) * It turns out the Anna Karenina is long and uses difficult words * And this code is too basic for anything serious * Note that we'll get the copyright notice at the top, just like any good model --- ## Learning N-Grams * Let's make our target task word prediction * Given previous n tokens, predict the next one * We will learn the embedding first, and then the task * Remember, we are just doing something to learn and test out an embedding * Then we'll talk about bigger concepts --- ## N-Gram Model * Notice that increasing the context eats up memory * Linear layer grows by embedding_size * lin_size with each additional token ```python class NGramPredictor(torch.nn.Module): def __init__(self, num_tokens, embedding_size, context_length): """ Initialize a predictor. Arguments: num_tokens (int): The number of unique tokens in the vocabulary. embedding_size (int): The number of features to represent the hidden state. context_length (int): The number of tokens of context. """ super(NGramPredictor, self).__init__() self.embedding = torch.nn.Embedding(num_tokens, embedding_size) # Use the hidden state to predict the next token lin_size = 6*math.ceil(math.sqrt(num_tokens)) self.predictor = torch.nn.Sequential( torch.nn.Flatten(), torch.nn.Linear(context_length * embedding_size, lin_size), torch.nn.LayerNorm(lin_size), torch.nn.GELU(), torch.nn.Linear(lin_size, num_tokens)) self.decision = torch.nn.Softmax(dim=1) self.embedding.weight.data.uniform_(-0.1, 0.1) def forward(self, observation): embeds = self.embedding(observation) return self.decision(self.predictor(embeds)) ``` --- ## Preprocessing * Going to use the nltk (natural language toolkit) for preprocessing * Will also remove stage directions * It is also common to replace rare words with a standin, such as "UNK" * We'll keep things simple ```python with open(args.corpus, 'r') as file: text = file.read() # Remove stage directions from Shapespeare's corpus text = re.sub('^[A-Z]+:$', '', text, flags=re.MULTILINE) text = re.sub('^[A-Z]+.$', '', text, flags=re.MULTILINE) text = re.sub('^ Enter.*$', '', text, flags=re.MULTILINE) text = re.sub(r'\[.*?\]', '', text, flags=re.MULTILINE) text = re.sub(r'\*\*\*.*\*\*\*', '', text) words = nltk.word_tokenize(words.lower()) flat_words = np.array(words) # First, count the number of unique words unique_tokens = np.unique(flat_words) num_tokens = len(unique_tokens) print("Converting all words to numbers.") word_to_index = {} for idx, token in enumerate(unique_tokens): word_to_index[token] = idx # Convert the words to numbers. These will look up a vector embedding in our model's embedding. sentence_indices = [word_to_index[word] for word in flat_words] ``` --- ## Preprocessing * Preprocessing reduces Romeo and Juliet to 4119 unique tokens * The length of the training corpus is 35,098 * So words are used on average less than 9 times * We know that a few prepositions are going to grab most of that * This will make things challenging --- ## Pretraining ```python # Make a continuous bag of words model cbow_model = CBOWEmbedder(num_tokens, args.embedding_size, 2*args.context_length) print("Building model.") print(cbow_model) optimizer = torch.optim.AdamW(cbow_model.parameters(), lr=0.001, weight_decay=args.weight_decay) criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.001) # Pretrain the embedding with CBOW if args.load is None: cbow_model.to(device) cbow_model.train() rng = np.random.default_rng() # Report loss every 1000 steps running_loss = 0.0 alpha = 0.01 for step in range(args.steps): # Make a batch begins = rng.integers(low=0, high=len(sentence_indices) - 2*args.context_length - 1, size=args.batch_size) # For continuous bag of words CBOW # This is generally used for pretraining groups = [sentence_indices[begin:begin+2*args.context_length+1] for begin in begins] left = torch.tensor([gr[:args.context_length] for gr in groups], dtype=torch.long) labels = torch.tensor([gr[args.context_length] for gr in groups], dtype=torch.long).to(device) right = torch.tensor([gr[args.context_length+1:] for gr in groups], dtype=torch.long) batch = torch.concatenate((left, right), dim=1).to(device) # Get ready to learn cbow_model.zero_grad() # Create a context window of the token embeddings y_hat = cbow_model(batch) loss = criterion(y_hat, labels) running_loss += alpha*(loss.item() - running_loss) # Gradient calculation loss.backward() torch.nn.utils.clip_grad_norm_(cbow_model.parameters(), 1.0) # Update weights optimizer.step() ``` --- ## Embedding Training ```python # Now train the NGRAM predictor model = NGramEmbedder(num_tokens, args.embedding_size, args.context_length) # Use the pretrained result from the CBOW with torch.no_grad(): model.embedding.weight.copy_(cbow_model.embedding.weight) # If we trusted the embedding, we could turn off learning #model.embedding.weight.requires_grad = False print("Building model.") print(model) if args.load is not None: model.load_state_dict(torch.load(args.load, weights_only=True)) model.to(device) else: optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=args.weight_decay/10) criterion = torch.nn.CrossEntropyLoss() model.to(device) model.train() rng = np.random.default_rng() # Report loss every 1000 steps running_loss = 0.0 alpha = 0.01 for step in range(args.steps): # Make a batch begins = rng.integers(low=0, high=len(sentence_indices) - args.context_length - 1, size=args.batch_size) # For next word prediction batch = torch.tensor([sentence_indices[begin:begin+args.context_length] for begin in begins], dtype=torch.long).to(device) # The labels are the following token labels = torch.tensor([sentence_indices[begin+args.context_length] for begin in begins], dtype=torch.long).to(device) # Get ready to learn model.zero_grad() # Create a context window of the token embeddings y_hat = model(batch) loss = criterion(y_hat, labels) running_loss += alpha*(loss.item() - running_loss) # Gradient calculation loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update weights optimizer.step() ``` --- ## Copying the Embedding * Notice that we can simply copy over the pretrained embedding * Remember that question about what we can do with initialization? * This should start us off nearer to a good minima ```python # Use the pretrained result from the CBOW with torch.no_grad(): model.embedding.weight.copy_(cbow_model.embedding.weight) ``` --- ## Prediction ```python # Now try to predict something. # Make 5 predictions, just to see how much variety we can get. print(f"Initial prompt is '{args.prompt}'") for _ in range(5): prompt_words = np.array(nltk.word_tokenize(args.prompt.lower())) # Convert prompt words to token values prompt_tokens = [word_to_index[word] for word in prompt_words] model.eval() if len(prompt_tokens) < args.context_length: padding = [np.where(unique_tokens == '.')[0][0]] * (args.context_length - len(prompt_tokens)) prompt_tokens = padding + prompt_tokens for i in range(20): # Create a context window of the token embeddings X = torch.tensor([prompt_tokens[-args.context_length:]]).to(device) # Find the next token probabilities y_hat = model(X) next_word_index = np.random.choice(np.arange(len(unique_tokens)), p=y_hat[0].cpu().detach().numpy()) prompt_tokens.append(next_word_index) prompt_words = [unique_tokens[idx] for idx in prompt_tokens] # Clean up the tokens. for idx in range(len(prompt_words)): if idx+1 < len(prompt_words) and prompt_words[idx+1] in ['.', '!', '?', ';', ',']: punctuation = prompt_words[idx+1] prompt_words[idx] = prompt_words[idx] + punctuation prompt_words[idx+1] = '' # Make the word after a sentence ending upper case. if punctuation in ['.', '?', '!'] and idx+2 < len(prompt_words): prompt_words[idx+2] = prompt_words[idx+2][:1].upper() + prompt_words[idx+2][1:] if idx+2 < len(prompt_words) and prompt_words[idx+1] == "’" and len(prompt_words[idx+2]) == 1: prompt_words[idx] = prompt_words[idx] + prompt_words[idx+1] + prompt_words[idx+2] prompt_words[idx+1] = '' prompt_words[idx+2] = '' print(re.sub(r' +', ' ', ' '.join(prompt_words))) ``` --- ## Note About Training * The loss will look bad * In fact, the model will be mostly wrong * Of course! This is language, guessing what word comes next will often fail * But, as long as the predictions are reasonable we are doing okay --- ## Sanity Test * If you want to be sure that things are working, begin with a subset of the training corpus * This model will learn a few hundred lines perfectly, regurgitating everything in the original * (assuming a large enough context window) * With that, we can proceed with actual training --- ## Results ``` Initial prompt is 'do you know the cause' do you know the cause? I will not thy lips. O, my romeo, friar lawrence. Romeo, alack the day do you know the cause? I have you love, that i must to my thee. Scene ii. Friar lawrence’s do you know the cause? I will not thy lips. Cheeks, and lie with her, but let me take it. do you know the cause? I will you and to bed. Ah, juliet’s to; for i will not away do you know the cause? I would you are? What, shall i groan! Romeo, and would i can not love ``` --- ## More Results ``` Initial prompt is 'into this bed of death' into this bed of death? They romeo in juliet? I will not thy lips. Put thee in, and, and bid into this bed of death, they romeo in juliet? I will not away. What, the here? I will not well into this bed of death, they romeo in juliet? I will not away. What, the pox of such a lisping, into this bed of death? They romeo, friar lawrence. Romeo, go you to bed. Ah, the’s my into this bed of death, they romeo in juliet? I will not away. What, the here i s. Go and ``` --- ## Preposition Problems * What is most likely after "of"? * Romeo, it turns out ``` Initial prompt is 'into this bed of' . into this bed of romeo? No, not he that tybalt is’s to bed. Ah, juliet’s to . into this bed of romeo? No, i he not his the that of must, and convoy in the fearful of do . into this bed of enter? Where here comes romeo? Friar lawrence. I hear some noise. If i come, , . into this bed of romeo? No, i he not his the tybalt of the, and i will give me occasion. . into this bed of awhile, for romeo and in a montague, the only lacks a cover : the fish of if thou ``` --- ## Embedding Influence * The word "descend" is associated with "death" here ``` Initial prompt is 'descend into this bed of' descend into this bed of death, they romeo in juliet, i will; for it is wisely, and; i come, descend into this bed of death? They romeo, friar lawrence. Go with me to the fresh i will give you, sir descend into this bed of death? They romeo, friar lawrence. Ay, forsooth. Well, he may chance to do some descend into this bed of death? They romeo, friar lawrence. Romeo, there? What is it in that? I will descend into this bed of death, they romeo in juliet, he is there lies, tybalt. First and, and you will ``` --- ## Embedding Influence * The other words may not even matter ``` Initial prompt is 'descend into this rose of' descend into this rose of death, and romeo in juliet, he is it is my man, but i ’ ll be hanged descend into this rose of death, and romeo in juliet, he is not, o, she was well? She’s descend into this rose of death, and romeo in juliet, he is not, spited, grief you shall be a with sweet descend into this rose of death, and romeo in juliet, he is not away. What, the pox of such a lisping descend into this rose of death, and romeo in juliet, he is it is my man, but i ’ ll be hanged ``` --- ## Good Embeddings * Suppose we had a good embedding * We should be able to do something cool, like math on concepts in embedded space * $count - man + woman \approx countess$ * $Paris - France + Itely \approx Rome$ * And what does it take to get such good embeddings? * Just a dataset of hundreds of billions of words --- ## Comparison With Images * The embedding here is similar to the initial large convolution in ResNext * But learning the embedding is more troublesome * Obviously everything afterwards depends upon it --- ## Improving Embeddings * Training just the embedding is difficult * But we've had good methods for more than 10 years * For an example, see [word2vec](https://arxiv.org/abs/1301.3781) * And now you can download an embedding * Here are a few: [https://fasttext.cc/docs/en/crawl-vectors.html](https://fasttext.cc/docs/en/crawl-vectors.html) --- ## Examples * Here are some "nearest neighbors" from a decent embedding ``` Query word? countess Countess 0.786198 baroness 0.749479 marchioness 0.743028 duchess 0.741611 viscountess 0.736536 countesses 0.7086 marquess 0.674638 lady-in-waiting 0.653861 noblewoman 0.652758 nobleman 0.646855 Query word? count counts 0.812811 counted 0.7097 counting 0.685638 count. 0.628533 Count 0.603085 count.And 0.597393 count.The 0.585259 count.I 0.580847 count.If 0.578759 count.So 0.56037 Query word? Bob Jim 0.785597 Dave 0.734101 Mike 0.727408 Doug 0.710619 Tom 0.703408 .Bob 0.683471 Steve 0.67823 Ted 0.677032 Gary 0.674185 Greg 0.668247 ``` --- ## Embedding Math * We can try to play with the embeddings a bit ```python import io import torch def load_vectors(fname): fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') #n, d = map(int, fin.readline().split()) data = {} for line in fin: tokens = line.rstrip().split(' ') data[tokens[0]] = torch.tensor([float(token) for token in tokens[1:]]).float() return data data = load_vectors("16_king_queen_vectors.txt") print(data) print(f"king & queen cosine similarity: {torch.cosine_similarity(data['king'], data['queen'], dim=0)}") print(f"duke & duchess cosine similarity: {torch.cosine_similarity(data['duke'], data['duchess'], dim=0)}") print(f"man & woman cosine similarity: {torch.cosine_similarity(data['man'], data['woman'], dim=0)}") print(f"king & man cosine similarity: {torch.cosine_similarity(data['king'], data['man'], dim=0)}") print(f"queen & woman cosine similarity: {torch.cosine_similarity(data['queen'], data['woman'], dim=0)}") print(f"(king-man+woman) & queen cosine similarity: {torch.cosine_similarity(data['king']-data['man']+data['woman'], data['queen'], dim=0)}") print(f"(duke-man+woman) & duchess cosine similarity: {torch.cosine_similarity(data['duke']-data['man']+data['woman'], data['duchess'], dim=0)}") print(f"the & duchess cosine similarity: {torch.cosine_similarity(data['the'], data['duchess'], dim=0)}") print(f"the & of cosine similarity: {torch.cosine_similarity(data['the'], data['of'], dim=0)}") print(f"cat & king cosine similarity: {torch.cosine_similarity(data['cat'], data['king'], dim=0)}") print(f"cat & queen cosine similarity: {torch.cosine_similarity(data['cat'], data['queen'], dim=0)}") print(f"cat & dog cosine similarity: {torch.cosine_similarity(data['cat'], data['dog'], dim=0)}") ``` --- ## Outputs ``` king & queen cosine similarity: 0.7068513631820679 duke & duchess cosine similarity: 0.7073330879211426 man & woman cosine similarity: 0.7658399939537048 king & man cosine similarity: 0.3418329954147339 queen & woman cosine similarity: 0.36072516441345215 (king-man+woman) & queen cosine similarity: 0.6542676091194153 (duke-man+woman) & duchess cosine similarity: 0.641383707523346 the & duchess cosine similarity: 0.09982849657535553 the & of cosine similarity: 0.5719975233078003 cat & king cosine similarity: 0.2159331738948822 cat & queen cosine similarity: 0.2383471429347992 cat & dog cosine similarity: 0.707862377166748 ``` --- ## Implications * Notice that the embeddings must be capturing both parts of speech and meaning * 'the' and 'duchess' are not close, but 'the' and 'of' are * But not all nouns are close * 'cat' and 'dog' are used interchangeably, but not 'cat' and 'queen' * Math works better than noise * Reall great embeddings should produce more consistency --- ## Improvements * So why is our embedding mediocre? * We should really be applying techniques to improve up training * At least * Subsampling * Negative sampling * And we need a dataset of a billion tokens --- ## Subsampling * When training CBOW or Skip-grams, the ordering of words doesn't matter * Since structure has been tossed out of the window, we can also toss the over-common words * So if "of" shows up too often, just drop it * The sentences will be mangled, but training the embedding will improve without so much redundant data * We can also search for common phrases and combine them into single tokens --- ## Negative Sampling * Notice that the model predicts a single word out of thousands, and learns wrong guesses quickly * So the majority of the math in the loss function is pushing the same weights to 0 repeatedly * We can change the loss to only look at the target word and $K$ negative samples * Penalize the model for not prediction the target word with probability 1 * Penalize the model for not predicting the negative samples with probability 0 * All in log space, of course --- ## Using The Embedding * Let's say we bothered to get everything required to train a great embedding * What next? * The embedding itself only tells us what words are related to other words * Our simple linear n-gram predictor needs more context to make good predictions * But longer context windows will take huge amounts of memory --- ## Naive Linear Network * We might want to make our N-gram predictor into a deeper network and give it a larger context window * The weights of the first layer grow linearly with context length, but adding a hidden layer will cause weights to grow with the square of the context length ```python self.predictor = torch.nn.Sequential( torch.nn.Flatten(), torch.nn.Linear(context_length * embedding_size, lin_size), torch.nn.LayerNorm(lin_size), torch.nn.GELU(), torch.nn.Linear(lin_size, num_tokens)) ``` --- ## Recurrent Networks * This problem motivates the development of more interesting models * One powerful idea is that of a *recurrent* neural network * That means a network remembers its own hidden state --- ## RNN * What if we use each new embedding to modify the current hidden state? * The RNN is a network that outputs both a prediction and a new hidden state * The hidden state is then fed back to the RNN at the next time step
RNN from the UDL Book
--- ## RNN * If the hidden state has enough information, we can predict the next token from it directly * This has the advantage that the hidden state theoretically contains all past history * But at each step, we only forward through a small network
RNN from the UDL Book
--- ## Problems * But let's look at this diagram * During training, we need to backpropagate through the same set of weights multiple times * The network may be smaller, but we will still run into gradient problems
RNN from the UDL Book
--- ## Solutions * But we know solutions to train deep neural networks! * ResNets added a residual to the current feature map * Learn $f(X) = X + H(X)$, where H is a simpler function * Does addition instead of subtraction, so gradients are more controlled --- ## Replacing Multiplication * So we could change the update of the hidden state to use additions * You might think that ResNet work contribued something back to NLP * Nope! * This technique was introduce [in 1997](https://ieeexplore.ieee.org/abstract/document/6795963)! * LSTM: Long Short-Term Memory --- ## LSTMs * Replaces the hidden state multiplicative update with an addition * Also makes a gating function, using Tanh * Either retain the current hidden state * Or update it with an addition of a function of the input token * $s = s + gy$ * $g$ selects whether or not, and how much, of the update $y$ to use --- ## LSTM Diagram * As originally described by Horchreiter and Shcmidhuber
--- ## Context * The linear neural network always had context because individual weights corresponded to a token at a particular position * The recurrent, including LSTM, networks must learn to encode that into their hidden states * This makes long-term dependencies difficult to learn * Akin to remembering a character from chapter 1 who isn't mentioned again until chapter 10 --- ## Attention * Researchers tried to simplify the task, adding shortcut paths to ease training * This was originally called [alignment or soft-search](https://arxiv.org/abs/1409.0473) but later was called attention * The [Attention is All You Need](https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html) paper did away with the recurrent network, showing that attention with a *tranformer architecture* could do the heavy lifting --- ## Long-Term Memory * Hopefully your long-term memories are working * Because we'll have to pick this up next week * Don't forget that we'll have a quiz next Thursday * We've covered topics from residual networks, and we've introduced embeddings * Don't forget all of the ConvNeXt stuff because we switched topics