import dataclasses
import json
import functools
import os
import pathlib
import psutil
from pprint import pprint
import time
from typing import Any, Dict, List, Optional, Tuple
import urllib.request
import urllib
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tiktoken
from tqdm.notebook import tqdm
import pandas as pd
# Import previous chapter dependencies.
# See https://stackoverflow.com/questions/44116194/import-a-function-from-another-ipynb-file
# NOTE: Importing these functions seems to run the entire cell the symbol is defined in, which would
# suggest that symbols should be defined in separate cells from the test code.
# NOTE: Importing another ipynb file basically runs the entire imported notebook.
import import_ipynb
from gpt_download import download_and_load_gpt2
# Chapter 4 dependencies.
from chapter_04_gpt_from_scratch import (
    GPTConfig,
    GPTModel,
)

# Chapter 5 dependencies.
from chapter_05_pretraining_on_unlabeled_data import (
    generate,
    token_ids_to_text,
    text_to_token_ids,
    load_weights_into_gpt,
    calc_loss_loader,
    train_model_simple,
    plot_losses,
)
Fine-tuning to follow instructions
This notebook explores the fine-tuning process of LLMs with the goal of creating an instruction fine-tuned model, based on Chapter 7 of Sebastian Raschka’s book. In particular, it discusses the following:
- The instruction fine-tuning process of LLMs
- Preparing a dataset for supervised instruction fine-tuning
- Organizing instruction data in training batches
- Loading a pretrained LLM and fine-tuning it to follow human instructions
- Extracting LLM-generated instruction responses for evaluation
- Evaluating an instruction-fine-tuned LLM
Acknowledgment
All concepts, architectures, and implementation approaches are credited to Sebastian Raschka’s work. This repository serves as my personal implementation and notes while working through the book’s content.
Resources
# Define the base config.
GPT_CONFIG_124M = GPTConfig(
    vocab_size=50257,  # as used by the BPE tokenizer for GPT-2.
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    dropout_rate=0.0,  # disable dropout for inference
    qkv_bias=False,
)
# Determine the device to run the model on.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
Stage 1 - Preparing the dataset
We now know that pretraining an LLM involves a training procedure where it learns to generate one word at a time. The resulting pretrained LLM is capable of text completion, meaning it can finish sentences or write text paragraphs given a fragment as input.
Here, we focus on improving the LLM’s ability to follow instructions and generate a desired response. Preparing the dataset is a key aspect of instruction fine-tuning.
Download and load the dataset
The dataset consists of 1,100 instruction–response pairs that were created specifically for this book. The following code implements and executes a function to download this dataset, which is a relatively small file (only 204 KB) in JSON format.
As we can see, the example entries are Python dictionary objects containing an instruction, input, and output. The input field may occasionally be empty.
def download_and_load_file(file_path: pathlib.Path, url: str) -> List[Dict[str, Any]]:
    """Download and load a JSON file from a URL.

    Args:
        file_path: The path to the file to download.
        url: The URL to download the file from.

    Returns:
        The loaded data (a list of instruction entries).
    """
    # Skips download if file was already downloaded.
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    # Load and decode the data from the file.
    with open(file_path, "r") as file:
        data = json.load(file)

    return data
file_path = pathlib.Path("data/instruction-data.json")
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))
pprint(data[50])
Prompt formatting
Instruction fine-tuning involves training a model on a dataset where the input-output pairs, like those we extracted from the JSON file, are explicitly provided. There are various methods to format these entries for LLMs, often referred to as prompt styles. The most commonly used ones are the following:
- Alpaca prompt style
- Phi-3 prompt style
Alpaca was one of the early LLMs to publicly detail its instruction fine-tuning process. Phi-3, developed by Microsoft, is included to demonstrate the diversity in prompt styles. The rest of this notebook uses the Alpaca prompt style since it is one of the most popular ones, largely because it helped define the original approach to fine-tuning.
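For comparison only, the sketch below shows roughly what a Phi-3-style formatter could look like: the instruction and response are wrapped in chat-turn tokens rather than the "### Instruction:" headers used by Alpaca. The special token names here (<|user|>, <|assistant|>, <|end|>) are assumptions for illustration and should be checked against the Phi-3 documentation; this function is not used in the rest of the notebook.

def format_input_phi3(entry: Dict[str, Any], add_output: bool = False) -> str:
    """Sketch of a Phi-3-style chat prompt (illustration only; not used below)."""
    # The instruction and optional input form the user turn.
    user_text = entry["instruction"]
    if entry["input"]:
        user_text += f"\n{entry['input']}"

    prompt = f"<|user|>\n{user_text}<|end|>\n<|assistant|>\n"
    if add_output:
        # Append the desired response as the assistant turn.
        prompt += f"{entry['output']}<|end|>"
    return prompt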
def format_input(entry: Dict[str, Any], add_output: bool = False) -> str:
    """Format an entry for the Alpaca prompt style.

    Args:
        entry: A dictionary containing an `instruction` and `input` key.
        add_output: Whether to add the `output` key to the formatted string.

    Returns:
        The formatted string.
    """
    # Add the 'system' prompt and the entry's instruction.
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # Add the entry's input if it exists.
    # NOTE: The 'input' section is skipped if the field is empty.
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    # Optionally add the desired response.
    desired_response = f"\n\n### Response:\n{entry['output']}" if add_output else ""

    return instruction_text + input_text + desired_response


# Format the example entry.
model_input = format_input(data[50], add_output=True)
print(model_input)
Splitting the dataset
# TODO: This section should reuse functions from chapter 6.
# Use 85% of the data for training, 10% for testing, and 5% for validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion : train_portion + test_portion]
val_data = data[train_portion + test_portion :]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))
Organizing data into training batches
In this section, we learn how to efficiently pad the data samples to equal lengths so we can assemble multiple instruction examples in a batch.
In the previous chapter, the training batches were created automatically by the PyTorch DataLoader class, which employs a default collate function to combine lists of samples into batches. A collate function is responsible for taking a list of individual data samples and merging them into a single batch that can be processed efficiently by the model during training. Here, we create our own custom collate function to handle specific requirements and formatting (pre-tokenization and formatting of inputs) of our instruction dataset.
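To see why a custom collate function is needed here, the following minimal sketch (not part of the book's code; it assumes a recent PyTorch version where default_collate is importable from torch.utils.data) shows that the default collate function fails as soon as the tokenized samples in a batch have different lengths, because it tries to stack them into a single tensor.

from torch.utils.data import default_collate

try:
    # Two "samples" with different numbers of token IDs cannot be stacked directly.
    default_collate([torch.tensor([0, 1, 2, 3, 4]), torch.tensor([5, 6])])
except RuntimeError as err:
    print("Default collate fails on unequal lengths:", err)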
Create a dataset class
Similar to the approach used for classification fine-tuning, we want to accelerate training by collecting multiple training examples in a batch, which necessitates padding all inputs to a similar length. As with classification fine-tuning, we use the <|endoftext|> token as a padding token.
class InstructionDataset(Dataset):
    """Dataset class for instruction fine-tuning."""

    def __init__(self, data: List[Dict[str, Any]], tokenizer: tiktoken.Encoding):
        # Cache the raw and encoded texts.
        self.data = data
        self.encoded_texts = []

        # Pretokenizes the texts.
        for entry in data:
            full_text = format_input(entry=entry, add_output=True)
            self.encoded_texts.append(tokenizer.encode(full_text))

    def __getitem__(self, index: int) -> List[int]:
        return self.encoded_texts[index]

    def __len__(self) -> int:
        return len(self.data)


tokenizer = tiktoken.get_encoding("gpt2")
dataset = InstructionDataset(train_data, tokenizer)
print(f"Length of dataset: {len(dataset)}")

# Instead of appending the <|endoftext|> tokens to the text inputs, we can append the token ID
# corresponding to <|endoftext|> to the pretokenized inputs directly. We can use the tokenizer's
# .encode method on an <|endoftext|> token to remind us which token ID we should use:
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))
Custom collate function
This custom collate function pads the training examples in each batch to the same length while allowing different batches to have different lengths, as demonstrated in the figure below. This approach minimizes unnecessary padding by only extending sequences to match the longest one in each batch, not the whole dataset.
def custom_collate_draft_1(
    batch: List[List[int]], pad_token_id: int = 50256, device: str = "cpu"
) -> torch.Tensor:
    """Custom collate function for instruction fine-tuning.

    Args:
        batch: A list of lists of integers representing the training examples.
        pad_token_id: The token ID to use for padding.
        device: The device to move the resulting tensor to.

    Returns:
        A tensor of the padded inputs.
    """
    # Find the longest sequence in the batch.
    batch_max_length = max(len(item) + 1 for item in batch)

    # Pad and prepare the inputs.
    inputs_lst = []
    for item in batch:
        # Copy the item and append a single padding token.
        new_item = item.copy()
        new_item += [pad_token_id]

        # Pad the sequence to the longest sequence in the batch.
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))

        # Convert the padded sequence to a tensor, remove the extra padded token added earlier, and
        # add it to the list of inputs.
        # NOTE: The purpose of this will become clear later in the second draft of this collate
        #       function.
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)

    # Stack the inputs into a single tensor and move it to the specified device.
    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor


# Test the custom collate function.
# NOTE: This output shows all inputs have been padded to the length of the longest input list,
#       inputs_1, containing five token IDs.
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1,
    inputs_2,
    inputs_3,
)
print(custom_collate_draft_1(batch))
Adding target tokens to the collate function
We also need to create batches with the target token IDs corresponding to the batch of input IDs. These target IDs, as shown in the figure below, are crucial because they represent what we want the model to generate and what we need during training to calculate the loss for the weight updates. That is, we modify our custom collate function to return the target token IDs in addition to the input token IDs.
Similar to the process we used to pretrain an LLM, the target token IDs match the input token IDs but are shifted one position to the right.
def custom_collate_draft_2(
    batch: List[List[int]], pad_token_id: int = 50256, device: str = "cpu"
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Custom collate function for instruction fine-tuning.

    Args:
        batch: A list of lists of integers representing the training examples.
        pad_token_id: The token ID to use for padding.
        device: The device to move the resulting tensor to.

    Returns:
        A tuple of tensors of the padded inputs and targets.
    """
    # Find the longest sequence in the batch.
    batch_max_length = max(len(item) + 1 for item in batch)

    # Initialize the outputs.
    inputs_lst, targets_lst = [], []

    # Pad and prepare the inputs.
    for item in batch:
        # Copy the item and append a single padding token.
        new_item = item.copy()
        new_item += [pad_token_id]

        # Pad the sequence to the longest sequence in the batch.
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))

        # Truncates the last token for inputs.
        inputs = torch.tensor(padded[:-1])
        # Shifts +1 to the right for targets.
        targets = torch.tensor(padded[1:])

        # Append the inputs and targets to the lists.
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack the inputs and targets into tensors and move them to the specified device.
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor


# Test the custom collate function.
inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)
Replace padding tokens with -100
We assign a -100 placeholder value to all padding tokens. This special value allows us to exclude these padding tokens from contributing to the training loss calculation, ensuring that only meaningful data influences model learning.
The default setting of the cross entropy function in PyTorch is cross_entropy(..., ignore_index=-100). This means that it ignores targets labeled with -100.
However, note that we retain one end-of-text token, ID 50256, in the target list. Retaining it allows the LLM to learn when to generate an end-of-text token in response to instructions, which we use as an indicator that the generated response is complete.
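As a quick sanity check of the ignore_index behavior (a minimal sketch, not from the book's code): adding a position whose target is -100 leaves the cross entropy loss unchanged, because PyTorch excludes that position from the calculation.

# Two toy logit rows with targets [0, 1].
logits_1 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5]])
targets_1 = torch.tensor([0, 1])
loss_1 = nn.functional.cross_entropy(logits_1, targets_1)

# Same logits plus one extra row whose target is -100 (the default ignore_index).
logits_2 = torch.tensor([[-1.0, 1.0], [-0.5, 1.5], [-0.5, 1.5]])
targets_2 = torch.tensor([0, 1, -100])
loss_2 = nn.functional.cross_entropy(logits_2, targets_2)

# Both losses are identical because the -100 position is ignored.
print(loss_1, loss_2)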
In addition to masking out padding tokens, it is also common to mask out the target token IDs that correspond to the instruction, as illustrated in figure 7.13. By masking out the LLM’s target token IDs corresponding to the instruction, the cross entropy loss is only computed for the generated response target IDs. Thus, the model is trained to focus on generating accurate responses rather than memorizing instructions, which can help reduce overfitting.
As of this writing, researchers are divided on whether masking the instructions is universally beneficial during instruction fine-tuning. For instance, the 2024 paper by Shi et al., Instruction Tuning With Loss Over Instructions, demonstrated that not masking the instructions benefits the LLM performance (see appendix B for more details). Here, we will not apply instruction masking.
def custom_collate_fn(
    batch: List[List[int]],
    pad_token_id: int = 50256,
    ignore_index=-100,
    allowed_max_length=None,
    device: str = "cpu",
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Custom collate function for instruction fine-tuning.

    Args:
        batch: A list of lists of integers representing the training examples.
        pad_token_id: The token ID to use for padding.
        ignore_index: The value to use for padding tokens.
        allowed_max_length: The maximum length of the input sequences.
        device: The device to move the resulting tensor to.

    Returns:
        A tuple of tensors of the padded inputs and targets.
    """
    # Find the longest sequence in the batch.
    batch_max_length = max(len(item) + 1 for item in batch)

    # Initialize the outputs.
    inputs_lst, targets_lst = [], []

    # Pad and prepare the inputs.
    for item in batch:
        # Copy the item and append a single padding token.
        new_item = item.copy()
        new_item += [pad_token_id]

        # Pad the sequence to the longest sequence in the batch.
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))

        # Truncates the last token for inputs.
        inputs = torch.tensor(padded[:-1])
        # Shifts +1 to the right for targets.
        targets = torch.tensor(padded[1:])

        # Replaces all but the first padding token in targets by ignore_index.
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Optionally truncates to the maximum sequence length.
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        # Append the inputs and targets to the lists.
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack the inputs and targets into tensors and move them to the specified device.
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor


# Test the custom collate function.
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)
Creating the data loader
customized_collate_fn = functools.partial(
    custom_collate_fn, device=device, allowed_max_length=1024
)

# You can try to increase this number if parallel Python processes are supported by your operating
# system.
num_workers = 0
batch_size = 8

# Set the seed for reproducibility.
torch.manual_seed(123)

# Create the datasets and data loaders.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

# Print the shape of all batches in the train loader.
# NOTE: Each batch contains 8 examples but the length of the sequences can vary from batch to batch.
print(f"Train loader (length: {len(train_loader)}):")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)
Stage 2 - Fine-tuning the LLM
Before beginning instruction fine-tuning, we must first load a pretrained GPT model that we want to fine-tune, a process we have undertaken previously. However, instead of using the smallest 124-million-parameter model as before, we load the medium-sized model with 355 million parameters. The reason for this choice is that the 124-million-parameter model is too limited in capacity to achieve satisfactory results via instruction fine-tuning. Specifically, smaller models lack the necessary capacity to learn and retain the intricate patterns and nuanced behaviors required for high-quality instruction-following tasks.
The following supplementary section in this book’s code repository lists several options for using cloud GPUs: https://mng.bz/EOEq.
Loading a pretrained LLM
# Load the base config.
GPT_CONFIG = GPTConfig(
    vocab_size=50257,  # as used by the BPE tokenizer for GPT-2.
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    dropout_rate=0.0,  # disable dropout for inference
    qkv_bias=False,
)

# Update the model configuration to conform to the model size.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Instantiate a base config.
tmp_config = dataclasses.asdict(GPT_CONFIG)

# Load the overlay parameters.
model_name = "gpt2-medium (355M)"
tmp_config.update(model_configs[model_name])

# Update the context length to match OpenAI's GPT-2 models.
tmp_config.update({"context_length": 1024})

# OpenAI used bias vectors in the multi-head attention module's linear layers to implement the
# query, key, and value matrix computations. Bias vectors are not commonly used in LLMs anymore as
# they don't improve the modeling performance and are thus unnecessary. However, since we are
# working with pretrained weights, we need to match the settings for consistency and enable these
# bias vectors.
tmp_config.update({"qkv_bias": True})

# Instantiate the new configuration.
NEW_CONFIG = GPTConfig(**tmp_config)

# Download the pretrained weights.
model_size = model_name.split(" ")[-1].lstrip("(").rstrip(")")
print(f"Downloading pretrained weights for {model_size} model...")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

# Initialize the model with the new configuration.
model = GPTModel(NEW_CONFIG)
load_weights_into_gpt(model, params)
model.eval()
# Sanity check the model outputs on a random example.
torch.manual_seed(123)

# Print an example from the validation set.
input_text = format_input(val_data[0])
print(input_text)

# Generate a response from the model.
# NOTE: The generate function returns the combined input and output text. This behavior was
#       previously convenient since pretrained LLMs are primarily designed as text-completion
#       models, where the input and output are concatenated to create coherent and legible
#       text. However, when evaluating the model's performance on a specific task, we often
#       want to focus solely on the model's generated response.
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=NEW_CONFIG.context_length,
    eos_id=50256,
)

# Print the model's response without the input text (i.e. the instruction).
# NOTE: To isolate the model's response text, we need to subtract the length of the input
#       instruction from the start of the generated_text.
generated_text = token_ids_to_text(token_ids, tokenizer)
response_text = generated_text[len(input_text) :].strip()
print(response_text)
Instruction fine-tuning the LLM
- Exercise 7.3: Fine-tuning on the original Alpaca dataset
The Alpaca dataset, created by researchers at Stanford, is one of the earliest and most popular openly shared instruction datasets, consisting of 52,002 entries. As an alternative to the instruction-data.json file we use here, consider fine-tuning an LLM on this dataset, which is available at https://mng.bz/NBnE. With roughly 50 times more entries than our dataset, most of them longer, I highly recommend using a GPU to conduct the training, which will accelerate the fine-tuning process. If you encounter out-of-memory errors, consider reducing the batch_size from 8 to 4, 2, or even 1. Lowering the allowed_max_length from 1,024 to 512 or 256 can also help manage memory problems.
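As a starting point for this exercise, the sketch below points the existing download helper at the Alpaca data. The raw GitHub URL is an assumption (the Stanford Alpaca repository hosts alpaca_data.json); verify it, or use the link given in the exercise, before running.

# Hypothetical sketch for Exercise 7.3; the URL below is an assumption and should be verified.
alpaca_url = (
    "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
)
alpaca_path = pathlib.Path("data/alpaca-data.json")
alpaca_data = download_and_load_file(alpaca_path, alpaca_url)
print("Number of Alpaca entries:", len(alpaca_data))

# The Alpaca entries use the same instruction/input/output keys, so format_input,
# InstructionDataset, and custom_collate_fn can be reused unchanged (consider a smaller
# batch_size and allowed_max_length if memory is tight).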
# Calculate baseline train and validation loss (before any fine-tuning).
model.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(
        data_loader=train_loader, model=model, device=device, num_batches=5
    )
    val_loss = calc_loss_loader(
        data_loader=val_loader, model=model, device=device, num_batches=5
    )

print(f"Training loss: {train_loss}")
print(f"Validation loss: {val_loss}")
A note on weight decay
AdamW implements weight decay by subtracting a scaled version of the weights from the parameter update, rather than modifying the loss function as L2 regularization does. This decoupling of weight decay from the gradient calculation ensures that the momentum and adaptive learning rates in Adam are not affected by weight decay, leading to more consistent and effective regularization.
- Weight Decay vs. L2 Regularization:
- Weight Decay: Modifies the parameter update step to penalize large weights. It directly subtracts a portion of the weights from the update, effectively shrinking them towards zero.
- L2 Regularization: Modifies the loss function by adding a penalty term proportional to the squared magnitude of the weights. This penalty term increases the loss for large weights, making it more difficult for the model to learn large values.
- AdamW’s Approach:
- AdamW implements weight decay by directly subtracting a scaled version of the weights from the parameter update, without changing the loss function.
- This ensures that the momentum and adaptive learning rates in Adam, which are crucial for efficient training, are not affected by the weight decay process.
- Mathematical Representation: Let’s consider the following:
- \(\theta_t\): The weights at iteration \(t\).
- \(\nabla L(\theta_t)\): The gradient of the loss function with respect to the weights at iteration \(t\).
- \(\eta_t\): The learning rate at iteration \(t\).
- \(\lambda\): The weight decay hyperparameter.
AdamW Update Rule: \[\theta_{t+1} = \theta_t - \eta_t \cdot \nabla L(\theta_t) - \eta_t \cdot \lambda \cdot \theta_t\]
Explanation:
- The first term (\(- \eta_t \cdot \nabla L(\theta_t)\)) is the standard gradient descent update.
- The second term (\(- \eta_t \cdot \lambda \cdot \theta_t\)) is the weight decay term. It subtracts a portion of the weights (\(\lambda \cdot \theta_t\)) from the update, scaled by the learning rate (\(\eta_t\)).
- This direct subtraction ensures that the weights are gradually shrunk towards zero during training (a short numeric sketch of this update follows the list below).
- Advantages of AdamW:
- Consistent Regularization: AdamW applies weight decay directly to the parameters, ensuring consistent regularization regardless of the magnitude of the gradients.
- Improved Generalization: By effectively shrinking weights towards zero, AdamW can help prevent overfitting and improve the model’s ability to generalize to unseen data.
- Better Convergence: AdamW can lead to faster and more stable convergence during training, especially when dealing with large models and datasets.
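The following minimal numeric sketch (an illustration using plain SGD instead of Adam for clarity; not the book's code) contrasts the decoupled weight-decay update above with folding an L2 penalty into the gradient.

# Single scalar parameter, its data-loss gradient, learning rate, and decay strength.
theta = torch.tensor([2.0])
grad = torch.tensor([0.5])
lr, wd = 0.1, 0.01

# Decoupled weight decay (AdamW-style): the decay term is applied directly to the parameter.
theta_decoupled = theta - lr * grad - lr * wd * theta

# L2 regularization: the penalty's gradient (wd * theta) is folded into the loss gradient.
theta_l2 = theta - lr * (grad + wd * theta)

# With plain SGD the two updates coincide; with Adam's adaptive scaling they no longer do,
# which is exactly why AdamW decouples the decay from the gradient-based update.
print(theta_decoupled, theta_l2)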
# Set the seed for reproducibility.
torch.manual_seed(123)

# Initialize the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

# Set the number of epochs.
num_epochs = 2

# Start the timer.
start_time = time.time()

train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context=format_input(val_data[0]),
    tokenizer=tokenizer,
)

# Calculate the execution time.
# NOTE: With an NVIDIA RTX 5000 GPU (32GB VRAM), this should take about 0.70 minutes.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")
Training/validation loss curves
From the loss plot shown in figure 7.17, we can see that the model’s performance on both the training and validation sets improves substantially over the course of training. The rapid decrease in losses during the initial phase indicates that the model quickly learns meaningful patterns and representations from the data. Then, as training progresses to the second epoch, the losses continue to decrease but at a slower rate, suggesting that the model is fine-tuning its learned representations and converging to a stable solution.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)
Stage 3 - Evaluating the LLM
Extracting and saving responses
To evaluate the model’s performance on the held-out test set, we first generate a response for each input in the test dataset and save the responses for later analysis.
Spot-checking examples
# Spot check a few examples.
torch.manual_seed(123)

# Iterates over the first three test set samples.
for entry in test_data[:3]:
    # Format the input instructions.
    input_text = format_input(entry)

    # Generate a response from the model.
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=NEW_CONFIG.context_length,
        eos_id=50256,
    )

    # Decode the generated token IDs into text.
    generated_text = token_ids_to_text(token_ids, tokenizer)

    # Remove the "### Response:" prefix and strip any leading or trailing whitespace.
    response_text = (
        generated_text[len(input_text) :].replace("### Response:", "").strip()
    )

    print("---------- INPUT ----------------------")
    print(input_text)
    print("---------- CORRECT RESPONSE ------------")
    print(f"\nCorrect response:\n>> {entry['output']}")
    print("---------- MODEL RESPONSE --------------")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")
Generate responses for the entire test set
# Generate responses for the entire test set.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
    # Generate the response.
    input_text = format_input(entry)
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=NEW_CONFIG.context_length,
        eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text) :].replace("### Response:", "").strip()
    )

    # Update the test data with the model's response.
    test_data[i]["model_response"] = response_text

# Save the test data with the model's responses for later use.
with open("instruction-data-with-response.json", "w") as file:
    json.dump(test_data, file, indent=4)
Save the fine-tuned model
import re

# Removes white spaces and parentheses from the file name.
file_name = f"{re.sub(r'[ ()]', '', model_name)}-sft.pth"

torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")
Load the fine-tuned model
"gpt2-medium355M-sft.pth"))
model.load_state_dict(torch.load(eval() model.
Evaluating the fine-tuned model
This section details the implementation of a method to automate the response evaluation of the fine-tuned LLM using another, larger LLM. To evaluate test set responses in an automated fashion, we utilize an existing instruction-fine-tuned 8-billion-parameter Llama 3 model developed by Meta AI. This model can be run locally using the open source Ollama application (https://ollama.com).
NOTE: Ollama is an efficient application for running LLMs on a laptop. It serves as a wrapper around the open source llama.cpp library (https://github.com/ggerganov/llama.cpp), which implements LLMs in pure C/C++ to maximize efficiency. However, Ollama is only a tool for generating text using LLMs (inference) and does not support training or fine-tuning LLMs.
The 8-billion-parameter Llama 3 model is a very capable LLM that runs locally. However, it’s not as capable as large proprietary LLMs such as GPT-4 offered by OpenAI. For readers interested in exploring how to utilize GPT-4 through the OpenAI API to assess generated model responses, an optional code notebook is available within the supplementary materials accompanying this book at https://mng.bz/BgEv.
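Before running the cells below, the Ollama server must be running and the Llama 3 model must be available locally. With the Ollama CLI, this typically means starting the Ollama application (or running ollama serve) and pulling the model once via ollama pull llama3 (or ollama run llama3); the exact commands may vary between Ollama versions, so consult the Ollama documentation if these do not work on your setup.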
Utility functions for Ollama
# Utility function to verify Ollama is running.
def check_if_running(process_name: str) -> bool:
    """Check if the specified process is running."""
    running = False
    for proc in psutil.process_iter(["name"]):
        if process_name in proc.info["name"]:
            running = True
            break
    return running


ollama_running = check_if_running("ollama")
if not ollama_running:
    raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
print("Ollama running:", check_if_running("ollama"))
# REST API-based query function for Ollama.
LOCAL_HOST_OLLAMA_URL = "http://localhost:11434/api/chat"


def query_model(prompt: str, model: str = "llama3", url: str = LOCAL_HOST_OLLAMA_URL):
    """Query the Ollama model using the REST API.

    Args:
        prompt: The prompt to query the model with.
        model: The model to query.
        url: The URL of the Ollama server.

    Returns:
        The response from the model.
    """
    # Creates the data payload as a dictionary.
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Settings for deterministic responses.
        "options": {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048,
        },
    }

    # Converts the dictionary to a JSON-formatted string and encodes it to bytes.
    payload = json.dumps(data).encode("utf-8")

    # Creates a request object, setting the method to POST and adding necessary headers.
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    # Sends the request and captures the response.
    response_data = ""
    with urllib.request.urlopen(request) as response:
        while True:
            # Reads the response line by line.
            line = response.readline().decode("utf-8")
            if not line:
                break

            # Parses the JSON-formatted line into a dictionary.
            response_json = json.loads(line)

            # Appends the response content to the response data.
            response_data += response_json["message"]["content"]

    return response_data
Test the REST API call to Ollama
= "llama3"
model = query_model("What do Llamas eat?", model)
result print(result)
Score the instruction fine-tuned responses via Ollama
# Evaluate the fine-tuned model and score some examples.
for entry in test_data[:3]:
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{entry['model_response']}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
    )
    print("\nDataset response:")
    print(">>", entry["output"])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n-------------------------")
Numeric model scoring
It’s worth noting that Ollama is not entirely deterministic across operating systems at the time of this writing, which means that the scores you obtain might vary slightly from the previous scores. To obtain more robust results, you can repeat the evaluation multiple times and average the resulting scores.
To further improve our model’s performance, we can explore various strategies, such as:
- Adjusting the hyperparameters during fine-tuning, such as the learning rate, batch size, or number of epochs
- Increasing the size of the training dataset or diversifying the examples to cover a broader range of topics and styles
- Experimenting with different prompts or instruction formats to guide the model’s responses more effectively
- Using a larger pretrained model, which may have greater capacity to capture complex patterns and generate more accurate responses
def generate_model_scores(
    json_data: List[Dict[str, Any]], json_key: str, model: str = "llama3"
) -> List[int]:
    """Generate scores for a model based on a JSON dataset.

    Args:
        json_data: The JSON dataset to score.
        json_key: The key in the JSON dataset to score.
        model: The model to score with.

    Returns:
        A list of scores.
    """
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        # Modified instruction line to only return the numeric score (without any explanation).
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{entry[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )

        # Query the model and get the score.
        score = query_model(prompt, model)

        # Try to convert the score to an integer.
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Could not convert score: {score}")
            continue

    return scores


scores = generate_model_scores(test_data, "model_response")
print(f"Number of scores: {len(scores)} of {len(test_data)}")
print(f"Average score: {sum(scores)/len(scores):.2f}\n")
Stage 4 - Preference fine-tuning with DPO
While we covered the most essential steps, there is an optional step that can be performed after instruction fine-tuning: preference fine-tuning. Preference fine-tuning is particularly useful for customizing a model to better align with specific user preferences. If you are interested in exploring this further, see the 04_preference-tuning-with-dpo folder in this book’s supplementary GitHub repository at https://mng.bz/dZwD.
In addition to the main content covered in this book, the GitHub repository also contains a large selection of bonus material that you may find valuable. To learn more about these additional resources, visit the Bonus Material section on the repository’s README page: https://mng.bz/r12g.
from datasets import load_dataset

ds = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.en",
    cache_dir="~/data/datasets/wikipedia_en",
)
print(ds["train"][0]["text"])