# This installs the ipynb package which enables importing functions defined in other notebooks.
# %pip install ipynb
Pretraining from unlabeled data
This notebook explores the pretraining process of LLMs, based on Chapter 5 of Sebastian Raschka’s book. In particular, it discusses the following:
- Computing the training and validation set losses to assess the quality of LLM-generated text during training
- Implementing a training function and pretraining the LLM
- Saving and loading model weights to continue training an LLM
- Loading pretrained weights from OpenAI
Acknowledgment
All concepts, architectures, and implementation approaches are credited to Sebastian Raschka’s work. This repository serves as my personal implementation and notes while working through the book’s content.
Resources
from typing import Optional
import numpy as np
import torch
import torch.nn as nn
import tiktoken
from tqdm.notebook import tqdm
# Import previous chapter dependencies.
# See https://stackoverflow.com/questions/44116194/import-a-function-from-another-ipynb-file
# NOTE: Importing these functions seems to run the entire cell the symbol is defined in, which would
# suggest that symbols should be defined in separate cells from the test code.
from ipynb.fs.full.chapter_04_gpt_from_scratch import (
GPTConfig,
GPTModel,
generate_text_simple,
)
from ipynb.fs.full.chapter_02_dataset_creation import create_dataloader_v1
# Instantiate the GPT-2 configuration with shortened context length.
GPT_CONFIG_124M = GPTConfig(
    vocab_size=50257,  # as used by the BPE tokenizer for GPT-2.
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    dropout_rate=0.1,
    qkv_bias=False,
)
# Create two training examples in a batch.
tokenizer = tiktoken.get_encoding("gpt2")

batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
# Test the GPT model.
torch.manual_seed(123)

# Run the model on the batch.
model = GPTModel(GPT_CONFIG_124M)
model.eval()
out = model(batch)
print(f"Input batch: {batch}")
print(f"Output shape: {out.shape}")
Text encoder and decoder utilities
Text to token conversion
def text_to_token_ids(
    text: str, tokenizer: Optional[tiktoken.Encoding] = None
) -> torch.Tensor:
    """Convert a text string to a tensor of token IDs.

    Args:
        text: The text to convert to token IDs.
        tokenizer: The tokenizer to use.

    Returns:
        torch.Tensor: A tensor of token IDs.
    """
    # Instantiate a default tokenizer (if none was provided).
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")

    # Tokenize the input text.
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    # Convert the tokenized text to a tensor.
    # NOTE: .unsqueeze(0) adds the batch dimension.
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor
Token to text conversion
def token_ids_to_text(
    token_ids: torch.Tensor, tokenizer: Optional[tiktoken.Encoding] = None
) -> str:
    """Convert a tensor of token IDs to a text string.

    Args:
        token_ids: The tensor of token IDs to convert to text.
        tokenizer: The tokenizer to use.

    Returns:
        str: The text string.
    """
    # Instantiate a default tokenizer (if none was provided).
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")

    # NOTE: .squeeze(0) removes the batch dimension.
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())
# Test the text to token conversion.
= "Every effort moves you"
start_context = tiktoken.get_encoding("gpt2")
tokenizer
= generate_text_simple(
token_ids =model,
model=text_to_token_ids(start_context, tokenizer),
idx=10,
max_new_tokens=GPT_CONFIG_124M.context_length,
context_size
)print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
Loss function
Computing the loss involves five steps (illustrated step by step in the book’s figure). The example below uses a seven-word vocabulary for illustration purposes.
For each of the three input tokens, shown on the left, we compute a vector containing probability scores corresponding to each token in the vocabulary. The index position of the highest probability score in each vector represents the most likely next token ID. These token IDs associated with the highest probability scores are selected and mapped back into a text that represents the text generated by the model.
Example - step by step
# Develop the loss function using a batch of two simple examples.
inputs = torch.tensor(
    [[16833, 3626, 6100], [40, 1107, 588]],  # ["every effort moves", "I really like"]
)

# Define the targets, which are the next tokens in the sequences.
targets = torch.tensor(
    [
        [3626, 6100, 345],
        [1107, 588, 11311],
    ]  # [" effort moves you", " really like chocolate"]
)
# Compute the logits for the inputs.
# NOTE: We disable gradient computation since gradients are only used for training.
with torch.no_grad():
    logits = model(inputs)
# Compute the probabilities of each token in the vocabulary.
# NOTE: The shape of probas is [B, T, V] where
#
# B is the batch size
# T is the sequence length
# V is the vocabulary size.
probas = torch.softmax(logits, dim=-1)
print(f"Probas shape: {probas.shape}")
# Step 3 and 4: Convert the probabilities to token IDs via a greedy decoding strategy.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)

# Print both batches of token IDs.
print("Token IDs:\n", token_ids)
# Step 5: Convert the token IDs back to text.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1:" f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")
# For each of the two input texts, we can print the initial softmax probability scores
# corresponding to the target tokens using the following code:
batch_idx = 0
# TODO: Why can't we just use probas[batch_idx, :, targets[batch_idx]] since T = 3?
target_probas_1 = probas[batch_idx, [0, 1, 2], targets[batch_idx]]
print(f"probas.shape: {probas.shape}")
print("Text 1:", target_probas_1)

batch_idx = 1
target_probas_2 = probas[batch_idx, [0, 1, 2], targets[batch_idx]]
print("Text 2:", target_probas_2)
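Regarding the TODO above, a quick check (not from the book; it reuses the probas and targets tensors from the cells above) shows why explicit position indices are needed: combining a slice with an index tensor selects every (position, target) combination instead of pairing each position with its own target. torch.gather is an equivalent alternative.
# Fancy indexing with a slice yields all 3 x 3 (position, target) combinations ...
print(probas[0, :, targets[0]].shape)  # torch.Size([3, 3])

# ... whereas torch.gather pairs position i with target i, matching target_probas_1 above.
print(torch.gather(probas[0], dim=-1, index=targets[0].unsqueeze(-1)).squeeze(-1))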
Computing the loss step by step
# Compute the log probabilities of the target tokens.
# NOTE: Working with logarithms of probability scores is more manageable in mathematical
# optimization than handling the scores directly.
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(f"log_probas: {log_probas}")

# Compute the average log probability of the target tokens.
avg_log_probas = torch.mean(log_probas)
print(f"avg_log_probas: {avg_log_probas}")
# The goal is to get the average log probability as close to 0 as possible by updating the model’s
# weights as part of the training process. However, in deep learning, the common practice isn’t to
# push the average log probability up to 0 but rather to bring the negative average log probability
# down to 0. The negative average log probability is simply the average log probability multiplied
# by –1.
neg_avg_log_probas = avg_log_probas * -1
print(f"neg_avg_log_probas: {neg_avg_log_probas}")
# As we can see, the logits tensor has three dimensions: batch size, number of tokens, and
# vocabulary size. The targets tensor has two dimensions: batch size and number of tokens.
# For the cross_entropy loss function in PyTorch, we want to flatten these tensors by combining
# them over the batch dimension:
print("Logits shape:", logits.shape)
print("Targets shape:", targets.shape)
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)
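As a sanity check (not part of the book’s listing; it reuses the loss and neg_avg_log_probas values from the cells above), PyTorch’s cross_entropy with its default mean reduction should reproduce the manually computed negative average log probability, and exponentiating it yields the perplexity discussed in the next section.
# cross_entropy (mean over the six flattened target tokens) should equal the manual
# result, up to floating-point error.
print(torch.allclose(loss, neg_avg_log_probas))

# Exponentiating the natural-log cross-entropy gives the perplexity.
print(f"Perplexity: {torch.exp(loss)}")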
The difference between cross-entropy, perplexity, and KL-divergence
Cross-entropy
Cross-entropy measures how well a predicted probability distribution \(q\) matches a true distribution \(p\). It’s defined as:
\[ H(p, q) = -\sum_{x} p(x) \log q(x) \]
where \(x\) runs over all possible events. Intuitively, it’s the average number of bits needed to encode samples from \(p\), if they’re encoded according to \(q\). The lower the cross-entropy, the closer \(q\) is to \(p\).
According to Wikipedia, in information theory, the cross-entropy between two probability distributions \({\displaystyle p}\) and \({\displaystyle q}\), over the same underlying set of events, measures the average number of bits needed to identify an event drawn from the set when the coding scheme used for the set is optimized for an estimated probability distribution \({\displaystyle q}\), rather than the true distribution \({\displaystyle p}\).
This statement reflects a fundamental idea from information theory: cross-entropy measures the cost of encoding data from one distribution \(p\) under the assumptions of another distribution \(q\). The unit “bits” arises because we’re working in the context of binary information encoding, i.e. base-2 logarithms (with the natural logarithm, the unit is nats). Intuitively, each bit represents a yes/no choice, and the cross-entropy tells us, on average, how many such choices we’d need to make to encode the true outcomes from \(p\), given that our model assigns probabilities according to \(q\).
- If \(q\) perfectly matches \(p\), the encoding is as efficient as possible—this is essentially the entropy \(H(p)\) of the true distribution.
- If \(q\) differs from \(p\), the encoder based on \(q\) will make less informed decisions, leading to longer or more error-prone codes on average.
- The “lower” cross-entropy means we’re closer to the ideal scenario where \(q \approx p\), which indicates our model (represented by \(q\)) is doing a better job of approximating the true distribution \(p\).
- Conversely, a higher cross-entropy indicates that \(q\) diverges significantly from \(p\), causing inefficiencies and increasing the average number of bits needed.
So, the cross-entropy not only quantifies the difference between two distributions, but also translates that difference into the practical costs of encoding data.
Example:
- True distribution: \(p = [0.7, 0.2, 0.1]\)
- Predicted distribution 1: \(q_1 = [0.6, 0.3, 0.1]\)
- Predicted distribution 2: \(q_2 = [0.9, 0.05, 0.05]\)
- \(H(p, q_1)\) will be lower than \(H(p, q_2)\), because \(q_1\) is closer to \(p\) than \(q_2\).
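This claim can be verified numerically. A minimal sketch (not from the book), using the natural logarithm so the values are in nats rather than bits:
import torch

p = torch.tensor([0.7, 0.2, 0.1])     # true distribution
q1 = torch.tensor([0.6, 0.3, 0.1])    # predicted distribution 1
q2 = torch.tensor([0.9, 0.05, 0.05])  # predicted distribution 2

# Cross-entropy H(p, q) = -sum_x p(x) * log q(x), here in nats.
H_pq1 = -(p * torch.log(q1)).sum()
H_pq2 = -(p * torch.log(q2)).sum()
print(H_pq1, H_pq2)  # ~0.83 vs. ~0.97, so q1 is indeed the closer approximation of p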
Perplexity
Perplexity is often used in language modeling and other probabilistic models to measure how well a model predicts a sample. It’s defined as the exponentiated average negative log-probability:
\[ \text{Perplexity}(p, q) = 2^{H(p, q)} \]
(with \(H(p, q)\) measured in bits; if the cross-entropy is computed with the natural logarithm, as PyTorch’s cross_entropy does, the base 2 is replaced by \(e\), i.e. perplexity = exp(loss)). This represents the effective number of choices the model assigns to each outcome. A lower perplexity means the model is more confident in its predictions. Perplexity is often viewed as a normalized measure of cross-entropy, expressed in terms of the equivalent branching factor. For instance, if a language model’s perplexity is 10, it implies the model is, on average, as uncertain as making a single choice out of 10 equally likely outcomes.
According to Wikipedia, in information theory, perplexity is a measure of uncertainty in the value of a sample from a discrete probability distribution. The larger the perplexity, the less likely it is that an observer can guess the value which will be drawn from the distribution.
From Sebastian Raschka’s book:
Perplexity is a measure often used alongside cross entropy loss to evaluate the performance of models in tasks like language modeling. It can provide a more interpretable way to understand the uncertainty of a model in predicting the next token in a sequence.
Perplexity measures how well the probability distribution predicted by the model matches the actual distribution of the words in the dataset. Similar to the loss, a lower perplexity indicates that the model predictions are closer to the actual distribution. Perplexity can be calculated as perplexity = torch.exp(loss), which returns tensor(48725.8203) when applied to the previously calculated loss.
Perplexity is often considered more interpretable than the raw loss value because it signifies the effective vocabulary size about which the model is uncertain at each step. In the given example, this would translate to the model being unsure about which among 48,725 tokens in the vocabulary to generate as the next token.
ChatGPT provides a similar intuitive explanation. If we consider a language model predicting the next word in a sentence, perplexity provides a numerical summary of how uncertain or “perplexed” the model is, on average, when choosing among possible outcomes. A perplexity value of 10, for example, indicates that the model’s uncertainty is equivalent to having 10 equally likely choices for each word it predicts. In other words, lower perplexity means the model is more confident in its predictions, as it can narrow down the possible outcomes to a smaller, more focused set. Higher perplexity indicates greater uncertainty or poorer model performance, since the model must spread its probability mass across more outcomes, essentially “considering” a larger range of possibilities before making a prediction.
This interpretation of perplexity as a kind of “average branching factor” makes it particularly useful in evaluating the quality of language models. Instead of dealing with abstract bits or logarithms (as in cross-entropy), perplexity translates the model’s predictive efficiency into a form that’s more intuitive.
Example:
- Suppose a language model predicts a sentence like “The cat sat on the ____” with probabilities for possible words:
  - \(p(\text{mat}) = 0.8\), \(p(\text{floor}) = 0.15\), \(p(\text{roof}) = 0.05\)
- If the true word is “mat” and the model’s probabilities closely match this, the perplexity will be low.
- If the model assigns much lower probability to “mat” and higher to other options, the perplexity will increase, indicating worse predictions.
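To make the “effective number of choices” interpretation concrete, here is a small sketch (not from the book; the 0.2 probability for the weaker model is a hypothetical value) that treats the blank as a one-token evaluation set:
import torch

# Probability each model assigns to the true word "mat".
p_true_good = torch.tensor(0.8)  # from the example above
p_true_weak = torch.tensor(0.2)  # hypothetical weaker model

# Per-token cross-entropy is -log p(true word); perplexity is its exponential.
print(torch.exp(-torch.log(p_true_good)))  # 1.25 effective choices
print(torch.exp(-torch.log(p_true_weak)))  # 5.00 effective choices (worse predictions)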
KL Divergence (Kullback-Leibler Divergence)
KL divergence measures how one probability distribution \(q\) diverges from a reference distribution \(p\). It’s given by:
\[ D_{KL}(p \parallel q) = \sum_{x} p(x) \log\frac{p(x)}{q(x)} \]
KL divergence is always non-negative and equals zero only when \(p = q\). Unlike cross-entropy, it explicitly quantifies the “distance” (in an information-theoretic sense) between the two distributions. While cross-entropy tells us how many bits are needed to encode \(p\) using \(q\), KL divergence tells us how many extra bits are needed compared to using the true distribution \(p\) itself.
Example:
- True distribution: \(p = [0.5, 0.5]\)
- Predicted distribution 1: \(q_1 = [0.6, 0.4]\)
- Predicted distribution 2: \(q_2 = [0.9, 0.1]\)
- \(D_{KL}(p \parallel q_1)\) is smaller than \(D_{KL}(p \parallel q_2)\), because \(q_1\) is closer to \(p\).
- If \(q_1\) becomes equal to \(p\), the KL divergence will be zero.
Comparing the Concepts
- Cross-Entropy vs. KL Divergence:
  - Cross-entropy combines the entropy of \(p\), which is fixed for a given \(p\), and the KL divergence from \(p\) to \(q\): \[ H(p, q) = H(p) + D_{KL}(p \parallel q) \]
  - While cross-entropy measures the total coding cost under \(q\), KL divergence isolates the inefficiency due to \(q\)’s divergence from \(p\).
- Perplexity and Cross-Entropy:
  - Perplexity is derived directly from cross-entropy, converting the measure into an interpretable “average number of choices.” It essentially provides a more human-readable version of the model’s performance.
  - Both low perplexity and low cross-entropy indicate a better model fit, but perplexity is the exponential form and gives a more intuitive sense of the model’s uncertainty.
- Perplexity and KL Divergence:
  - While perplexity is connected to cross-entropy, KL divergence is a more nuanced measure that focuses on how much \(q\) deviates from \(p\) rather than the raw efficiency of encoding.
  - Perplexity doesn’t directly measure divergence; instead, it measures how well the model predicts, which can be related to divergence indirectly through the cross-entropy.
In summary, cross-entropy and perplexity are practical metrics for evaluating how well a predictive model matches a true distribution, with perplexity offering a more intuitive interpretation. KL divergence, on the other hand, is a more fundamental information-theoretic measure that quantifies how much one distribution differs from another, forming a building block for understanding the inefficiencies captured by cross-entropy.
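The decomposition \(H(p, q) = H(p) + D_{KL}(p \parallel q)\) can be checked numerically on the small KL example above; a minimal sketch (not from the book), again using natural logarithms:
import torch

p = torch.tensor([0.5, 0.5])
q1 = torch.tensor([0.6, 0.4])

H_p = -(p * torch.log(p)).sum()       # entropy H(p)
H_pq1 = -(p * torch.log(q1)).sum()    # cross-entropy H(p, q1)
D_kl = (p * torch.log(p / q1)).sum()  # KL divergence D_KL(p || q1)

print(torch.isclose(H_pq1, H_p + D_kl))  # True: cross-entropy = entropy + KL divergence
print((p * torch.log(p / p)).sum())      # tensor(0.): the divergence vanishes when q == p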
Training and validation set losses
When preparing the data loaders, we split the input text into training and validation set portions. Then we tokenize the text (only shown for the training set portion for simplicity) and divide the tokenized text into chunks of a user-specified length (6 in the book’s illustration; 256 in the code below). Finally, we shuffle the rows and organize the chunked text into batches (here, batch size 2), which we can use for model training.
# Load example dataset.
= "data/the_verdict.txt"
file_path with open(file_path, "r", encoding="utf-8") as file:
= file.read()
text_data
# Pritn statistics.
= len(text_data)
total_characters = len(tokenizer.encode(text_data))
total_tokens print("Characters:", total_characters)
print("Tokens:", total_tokens)
# Divide the dataset into training and validation sets.
# NOTE: This is a simple and naive approach to splitting the dataset and should be replaced with
# tooling from pytorch (e.g. https://pytorch.org/docs/stable/data.html#torch.utils.data.random_split)
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]
print("Train data (chars):", len(train_data))
print("Validation data (chars):", len(val_data))
# Create the dataloaders.
torch.manual_seed(123)
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M.context_length,
    stride=GPT_CONFIG_124M.context_length,
    drop_last=True,
    shuffle=True,
    num_workers=0,
)
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M.context_length,
    stride=GPT_CONFIG_124M.context_length,
    drop_last=False,
    shuffle=False,
    num_workers=0,
)
# Based on the preceding code output, we have nine training set batches with two samples and 256
# tokens each. Since we allocated only 10% of the data for validation, there is only one validation
# batch consisting of two input examples.
#
# Each sample is of shape B x T where B is the batch size and T is the sequence length (i.e. 256).
print("Train loader:")
for x, y in train_loader:
print("\t", x.shape, y.shape)
print("\nValidation loader:")
for x, y in val_loader:
print("\t", x.shape, y.shape)
Utility function to compute the cross-entropy loss for a given batch.
def calc_loss_batch(
    input_batch: torch.Tensor,
    target_batch: torch.Tensor,
    model: nn.Module,
    device: torch.device,
) -> torch.Tensor:
    """Compute the cross-entropy loss for a given batch.

    Args:
        input_batch: The input batch.
        target_batch: The target batch.
        model: The model.
        device: The device to compute the loss on.

    Returns:
        The cross-entropy loss for the input batch.
    """
    # Transfer the input and target batches to the device.
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    # Compute the logits for the input batch.
    logits = model(input_batch)

    # Compute the cross-entropy loss for the input batch.
    # NOTE: We flatten the logits and targets over the batch dimension, where B is the batch size:
    #
    #   logits:  [B, T, V] -> [B * T, V]
    #   targets: [B, T]    -> [B * T]
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )
    return loss
Utility function to compute the loss for a data loader
def calc_loss_loader(
    data_loader: torch.utils.data.DataLoader,
    model: nn.Module,
    device: torch.device,
    num_batches: Optional[int] = None,
) -> float:
    """Compute the cross-entropy loss for a given data loader.

    Args:
        data_loader: The data loader.
        model: The model.
        device: The device to compute the loss on.
        num_batches: The number of batches to compute the loss on.

    Returns:
        The cross-entropy loss for the entire data loader.
    """
    total_loss = 0.0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        # Iterates over all batches if no fixed num_batches is specified.
        num_batches = len(data_loader)
    else:
        # Reduces the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader.
        num_batches = min(num_batches, len(data_loader))

    # Iterate over all batches in the data loader (or a subset thereof).
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            # Compute the loss for the input batch.
            loss = calc_loss_batch(input_batch, target_batch, model, device)

            # Sum the loss for each batch.
            total_loss += loss.item()
        else:
            break

    # Return the average loss over the number of batches.
    return total_loss / num_batches
# Test the loss computation.
# If you have a machine with a CUDA-supported GPU, the LLM will train on the GPU without making any
# changes to the code.
= torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
model.to(device)print(f"Using device: {device}")
# Disables gradient tracking for efficiency because we are not training yet.
with torch.no_grad():
# Via the “device” setting, we ensure the data is loaded onto the same device as the LLM model.
= calc_loss_loader(train_loader, model, device)
train_loss = calc_loss_loader(val_loader, model, device)
val_loss
print(f"Training loss : {train_loss}")
print(f"Validation loss: {val_loss}")
Training an LLM
A typical training loop for training deep neural networks in PyTorch consists of numerous steps, iterating over the batches in the training set for several epochs. In each loop, we calculate the loss for each training set batch to determine loss gradients, which we use to update the model weights so that the training set loss is minimized.
Evaluation utilities
def evaluate_model(
    model: nn.Module,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    device: torch.device,
    eval_iter: int,
):
    """Evaluate the model on the training and validation sets."""
    # Set the model to evaluation mode.
    # NOTE: In evaluation mode, certain layers like dropout are disabled to ensure stable,
    #       reproducible results.
    model.eval()

    # Disables gradient tracking, which is not required during evaluation (to reduce the
    # computational overhead).
    with torch.no_grad():
        train_loss = calc_loss_loader(
            train_loader, model, device, num_batches=eval_iter
        )
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)

    # Sets the model back to training mode.
    model.train()
    return train_loss, val_loss
def generate_and_print_sample(
    model: nn.Module,
    tokenizer: tiktoken.core.Encoding,
    device: torch.device,
    start_context: str,
) -> None:
    """Generate and print a sample from the model."""
    # Set the model to evaluation mode.
    model.eval()

    # Get the context size from the model's positional embedding weight.
    context_size = model.pos_emb.weight.shape[0]

    # Encode the start context and move to the device.
    encoded = text_to_token_ids(start_context, tokenizer).to(device)

    # Generate the text.
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded, max_new_tokens=50, context_size=context_size
        )

    # Decode the generated text.
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))

    # Set the model back to training mode.
    model.train()
Pretraining function
def train_model_simple(
    model: nn.Module,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    start_context: str,
    tokenizer: tiktoken.core.Encoding,
    num_epochs: int = 10,
    eval_freq: int = 5,
    eval_iter: int = 5,
):
    # Initializes lists to track losses and tokens seen.
    # TODO: Tracking of training statistics can be done more efficiently and elegantly.
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        # Start the main training loop.
        # Use tqdm to show progress with epoch and local step information.
        for local_step, (input_batch, target_batch) in enumerate(
            tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
        ):
            # Resets loss gradients from the previous batch iteration.
            optimizer.zero_grad()

            # Compute the loss for the input batch.
            loss = calc_loss_batch(input_batch, target_batch, model, device)

            # Calculates loss gradients.
            loss.backward()

            # Updates model weights using loss gradients.
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step.
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(
                    f"Ep {epoch+1} (Step {global_step:06d}, Batch {local_step+1}): "
                    f"Train loss {train_loss:.3f}, "
                    f"Val loss {val_loss:.3f}"
                )

        # Prints a sample text after each epoch.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen
# Test the training loop.
torch.manual_seed(123)

# Instantiate the model and move it to the device.
model = GPTModel(GPT_CONFIG_124M)
model.to(device)

# Instantiate the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# Train the model.
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)
Plot losses
from typing import Tuple
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_losses(
    epochs_seen: torch.Tensor,
    tokens_seen: torch.Tensor,
    train_losses: torch.Tensor,
    val_losses: torch.Tensor,
    figsize: Tuple[int, int] = (8, 6),
):
    """Plot the training and validation losses."""
    fig, ax1 = plt.subplots(figsize=figsize)
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel("Tokens seen")
    fig.tight_layout()
    plt.show()


epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)
Decoding strategies
# Set the model to evaluation mode for inference.
model.eval()
model.to(device)

# Set random seeds for reproducibility.
torch.manual_seed(123)

# Generate some text.
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M.context_length,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
Temperature sampling
Temperature scaling is a technique that adds a probabilistic selection process to the next-token generation task. Instead of always sampling the token with the highest probability as the next token using torch.argmax, also known as greedy decoding, we can replace argmax with a function that samples from a probability distribution (to generate text with more variety).
# Use a small sample vocabulary to illustrate temperature sampling.
vocab = {
    "closer": 0,
    "every": 1,
    "effort": 2,
    "forward": 3,
    "inches": 4,
    "moves": 5,
    "pizza": 6,
    "toward": 7,
    "you": 8,
}
inverse_vocab = {v: k for k, v in vocab.items()}

# Assume the LLM generated the following logits for the next token, i.e. "every effort moves you".
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
# We convert the logits into probabilities via the softmax function and obtain the token ID
# corresponding to the generated token via the argmax function, which we can then map back
# into text via the inverse vocabulary:
probas = torch.softmax(next_token_logits, dim=0)
next_token_id = torch.argmax(probas).item()
print(f"Greedy decoding: {inverse_vocab[next_token_id]}")
# Instead of greedy decoding via argmax, we can sample from the probability distribution
# to generate text with more variety. This is done by replacing the argmax with a draw
# from a multinomial distribution.
torch.manual_seed(123)
next_token_id = torch.multinomial(probas, num_samples=1).item()
print(f"Temperature sampling: {inverse_vocab[next_token_id]}")
def print_sampled_tokens(probas: torch.Tensor, num_samples: int = 1_000):
"""Print the sampled tokens from the probability distribution."""
    torch.manual_seed(123)
    sample = [
        torch.multinomial(probas, num_samples=1).item() for i in range(num_samples)
    ]
    sampled_ids = torch.bincount(torch.tensor(sample))
    for i, freq in enumerate(sampled_ids):
        print(f"{freq} x {inverse_vocab[i]}")


print_sampled_tokens(probas)
# We can further control the distribution and selection process via a concept called temperature
# scaling. Temperature scaling is just a fancy description for dividing the logits by a number
# greater than 0.
def softmax_with_temperature(logits: torch.Tensor, temperature: float) -> torch.Tensor:
"""Apply softmax with temperature scaling."""
    scaled_logits = logits / temperature
    return torch.softmax(scaled_logits, dim=0)
# Sample with original, lower, and higher confidence.
# NOTE: In the plot below, temperature scaling manifests itself with sharper (lower temperatures)
# or more diffuse (higher temperatures) probability distributions.
# NOTE: A temperature of 1 corresponds to no temperature scaling.
# NOTE: Decreasing the temperature to 0.1 sharpens the distribution, so the most likely token
# (here, “forward”) will have an even higher probability score. Likewise, increasing the
# temperature to 5 makes the distribution more uniform.
# NOTE: A temperature of 5 results in a more uniform distribution where other tokens are selected
# more often. This can add more variety to the generated texts but also more often results
# in nonsensical text.
temperatures = [1, 0.1, 5]

# Temperature scaling the logits.
scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]

# Plot the results.
x = torch.arange(len(vocab))
bar_width = 0.15
fig, ax = plt.subplots(figsize=(8, 6))

for i, T in enumerate(temperatures):
    rects = ax.bar(
        x + i * bar_width, scaled_probas[i], bar_width, label=f"Temperature = {T}"
    )

ax.set_ylabel("Probability")
ax.set_xticks(x)
ax.set_xticklabels(vocab.keys(), rotation=90)
ax.legend()
plt.tight_layout()
plt.show()
Top-k sampling
Naive temperature sampling with higher temperatures leads to potentially more interesting and creative outputs. However, one downside of this approach is that it sometimes leads to grammatically incorrect or completely nonsensical outputs.
Top-k sampling, when combined with probabilistic sampling and temperature scaling, can improve the text generation results. In top-k sampling, we can restrict the sampled tokens to the top-k most likely tokens and exclude all other tokens from the selection process by masking their probability scores.
The top-k approach replaces all nonselected logits with negative infinity value (-inf), such that when computing the softmax values, the probability scores of the non-top-k tokens are 0, and the remaining probabilities sum up to 1 (this is a similar masking trick as in the causal attention module).
# Assume the LLM generated the following logits for the next token, i.e. "every effort moves you".
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)

# Define the top-k value.
top_k = 3

# Get the top-k logits and their positions.
top_logits, top_pos = torch.topk(next_token_logits, top_k)
print(f"Top logits: {top_logits}")
print(f"Top positions: {top_pos}")
# Subsequently, we apply PyTorch’s where function to set the logit values of tokens that are below
# the lowest logit value within our top-three selection to negative infinity (-inf):
new_logits = torch.where(
    # Identifies logits less than the minimum in the top 3.
    condition=next_token_logits < top_logits[-1],
    # Assigns -inf to these lower logits.
    input=torch.tensor(float("-inf")),
    # Keeps the original logits for the top-k tokens.
    other=next_token_logits,
)
print(f"New logits (top-k sampling): {new_logits}")
# Apply the softmax function to turn these into next-token probabilities.
# NOTE: We can now apply the temperature scaling and multinomial function for probabilistic
# sampling to select the next token among these three non-zero probability scores to
# generate the next token.
topk_probas = torch.softmax(new_logits, dim=0)
print(f"Top-k probabilities: {topk_probas}")
An updated text generation function
def generate(
    model: nn.Module,
    idx: torch.Tensor,
    max_new_tokens: int,
    context_size: int,
    temperature: float = 0.0,
    top_k: Optional[int] = None,
    eos_id: Optional[int] = None,
):
    """Generate text with the model.

    Args:
        model: The model to use for generation.
        idx: The input tokens.
        max_new_tokens: The maximum number of tokens to generate.
        context_size: The size of the context window.
        temperature: The temperature to use for sampling.
        top_k: The top-k value to use for sampling.
        eos_id: The end-of-sequence token.

    Returns:
        The generated tokens.
    """
    # The for loop is the same as before: gets logits and only focuses on the last time step.
    # NOTE: Generate at most max_new_tokens tokens.
    for _ in range(max_new_tokens):
        # Only consider the last context_size tokens (this is typically informed by the model's
        # supported context length or length of the positional embedding weight).
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        # Only consider the last time step (i.e. the next token).
        logits = logits[:, -1, :]

        # Optionally filter logits with top_k sampling.
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(
                logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits
            )

        # Optionally apply temperature scaling.
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # If temperature is 0, use greedy decoding.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Stop generating if we encounter the EOS (end of sequence) token.
        if idx_next == eos_id:
            break

        # Concatenate the next token to the running sequence.
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
# Test the generation function.
torch.manual_seed(123)
token_ids = generate(
    model=model.to(device),  # Ensure model is on the correct device.
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M.context_length,
    top_k=15,
    temperature=1.4,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
Loading and saving model weights
Without the optimizer state
# Saving a PyTorch model is relatively straightforward. The recommended way is to save a model’s
# state_dict, a dictionary mapping each layer to its parameters, using the torch.save function:
# NOTE: For the GPT2-124M model, this results in a file of roughly 623M.
"model.pth") torch.save(model.state_dict(),
# Loading the model is equally straightforward. Note, however, that one needs to reinitialize the
# model architecture first and then load the state_dict into an existing model instance:
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(torch.load("model.pth", map_location=device))

# NOTE: Set the model to evaluation mode since a model is most likely loaded for inference tasks
#       (since the optimizer state is not saved/loaded to/from disk).
model.eval()
With the optimizer state
# If we plan to continue pre-training a model later—for example, using the train_model_simple
# function we defined earlier in this chapter—saving the optimizer state is also recommended.
# Adaptive optimizers such as AdamW store additional parameters for each model weight. AdamW uses
# historical data to adjust learning rates for each model parameter dynamically. Without it, the
# optimizer resets, and the model may learn suboptimally or even fail to converge properly, which
# means it will lose the ability to generate coherent text.
# NOTE: For the GPT2-124M model, this results in a file of roughly 1.9G (or almost 3x the size of
# the model weights alone).
torch.save(
    {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    "model_and_optimizer.pth",
)
# Define the checkpoint to load the model and optimizer states from disk.
= torch.load("model_and_optimizer.pth", map_location=device)
checkpoint
# Load the model and optimizer states from the checkpoint.
= GPTModel(GPT_CONFIG_124M)
model "model_state_dict"])
model.load_state_dict(checkpoint[= torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer "optimizer_state_dict"])
optimizer.load_state_dict(checkpoint[
# NOTE: Set the model to training mode since a model is most likely loaded for further training
# (since the optimizer state is saved/loaded to/from disk).
model.train()
Loading pretrained weights from OpenAI
Fortunately, OpenAI openly shared the weights of their GPT-2 models, thus eliminating the need to invest tens to hundreds of thousands of dollars in retraining the model on a large corpus ourselves. So, let’s load these weights into our GPTModel class and use the model for text generation. Here, weights refer to the weight parameters stored in the .weight attributes of PyTorch’s Linear and Embedding layers, for example.
Note that OpenAI originally saved the GPT-2 weights via TensorFlow, which we have to install to load the weights in Python. The following code will use a progress bar tool called tqdm to track the download process, which we also have to install.
The overall architecture of these differently sized GPT models is the same, as shown below, except that different architectural elements are repeated different numbers of times and the embedding size differs.
%pip install tensorflow>=2.15.0 tqdm>=4.66
# The download code is relatively long, mostly boilerplate, and not very interesting. Hence, instead
# of devoting precious space to discussing Python code for fetching files from the internet, we
# download the gpt_download.py Python module directly from this chapter’s online repository:
import urllib.request
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split("/")[-1]
print(f"Downloading GPT download script to {filename}...")
urllib.request.urlretrieve(url, filename)
from gpt_download import download_and_load_gpt2
# Download the GPT-2 weights.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
# Both settings and params are Python dictionaries. The settings dictionary stores the
# LLM architecture settings similarly to our manually defined GPT_CONFIG_124M settings.
# The params dictionary contains the actual weight tensors.
print(f"Settings: {settings}")
print(f"Parameter dictionary keys: {params.keys()}")
# Show example shape of a weight tensor.
print(f"Token embedding weight tensor dimensions: {params['wte'].shape}")
Creating the right config
from dataclasses import asdict
asdict(GPT_CONFIG_124M)
# Update the model configuration to conform to the model size.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
# Instantiate a base config.
tmp_config = asdict(GPT_CONFIG_124M)

# Load the overlay parameters.
model_name = "gpt2-small (124M)"
tmp_config.update(model_configs[model_name])

# Update the context length to match OpenAI's GPT-2 models.
tmp_config.update({"context_length": 1024})
# OpenAI used bias vectors in the multi-head attention module’s linear layers to implement the
# query, key, and value matrix computations. Bias vectors are not commonly used in LLMs anymore as
# they don’t improve the modeling performance and are thus unnecessary. However, since we are
# working with pretrained weights, we need to match the settings for consistency and enable these
# bias vectors.
"qkv_bias": True})
tmp_config.update({
# Instantiate the new configuration.
= GPTConfig(**tmp_config)
NEW_CONFIG
# Initialize the model with the new configuration.
= GPTModel(NEW_CONFIG)
gpt eval() gpt.
Loading weights into the model
def assign(left, right):
"""Safely assign the right weight tensor to the left layer.
Checks whether two tensors or arrays (left and right) have the same dimensions or shape and
returns the right tensor as trainable PyTorch parameters.
"""
if left.shape != right.shape:
raise ValueError(f"Shape mismatch. Left: {left.shape}, " "Right: {right.shape}")
return torch.nn.Parameter(torch.tensor(right))
def load_weights_into_gpt(gpt: GPTModel, params: dict):
    # Sets the model’s positional and token embedding weights to those specified in params.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    # Iterates over each transformer block in the model.
    for b in range(len(params["blocks"])):
        # The np.split function is used to divide the attention and bias weights into three equal
        # parts for the query, key, and value components.
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1
        )
        gpt.trf_blocks[b].mha.W_q.weight = assign(
            gpt.trf_blocks[b].mha.W_q.weight, q_w.T
        )
        gpt.trf_blocks[b].mha.W_k.weight = assign(
            gpt.trf_blocks[b].mha.W_k.weight, k_w.T
        )
        gpt.trf_blocks[b].mha.W_v.weight = assign(
            gpt.trf_blocks[b].mha.W_v.weight, v_w.T
        )

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1
        )
        gpt.trf_blocks[b].mha.W_q.bias = assign(gpt.trf_blocks[b].mha.W_q.bias, q_b)
        gpt.trf_blocks[b].mha.W_k.bias = assign(gpt.trf_blocks[b].mha.W_k.bias, k_b)
        gpt.trf_blocks[b].mha.W_v.bias = assign(gpt.trf_blocks[b].mha.W_v.bias, v_b)

        gpt.trf_blocks[b].mha.out_proj.weight = assign(
            gpt.trf_blocks[b].mha.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T,
        )
        gpt.trf_blocks[b].mha.out_proj.bias = assign(
            gpt.trf_blocks[b].mha.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"],
        )

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T,
        )
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"]
        )
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T,
        )
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"],
        )

        gpt.trf_blocks[b].pre_attention_norm.scale = assign(
            gpt.trf_blocks[b].pre_attention_norm.scale, params["blocks"][b]["ln_1"]["g"]
        )
        gpt.trf_blocks[b].pre_attention_norm.shift = assign(
            gpt.trf_blocks[b].pre_attention_norm.shift, params["blocks"][b]["ln_1"]["b"]
        )
        gpt.trf_blocks[b].pre_ff_norm.scale = assign(
            gpt.trf_blocks[b].pre_ff_norm.scale, params["blocks"][b]["ln_2"]["g"]
        )
        gpt.trf_blocks[b].pre_ff_norm.shift = assign(
            gpt.trf_blocks[b].pre_ff_norm.shift, params["blocks"][b]["ln_2"]["b"]
        )

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])

    # The original GPT-2 model by OpenAI reused the token embedding weights in the output layer
    # to reduce the total number of parameters, which is a concept known as weight tying.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
# Load the weights into the model.
load_weights_into_gpt(gpt, params)
gpt.to(device)
# Test the model to verify that it can generate coherent text.
torch.manual_seed(123)
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=NEW_CONFIG.context_length,
    top_k=50,
    temperature=1.5,
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))