Commit cb893907 authored by Vlad-Andrei BĂDOIU (78692)

Merge branch 'vladb/add_inference' into 'main'

Add inference code

See merge request !10
parents 83f7b518 22338e60
import sys
import time
import math
import argparse
import fire
import torch
from torch import nn
from torch.utils.data import Dataset
from datasets.wikitext103 import WikiText103Dataset
from tokenizer import Tokenizer
from dataloader import OptimusDataLoader
from model import Transformer
from trainer import Trainer
def main(batch_size: int = 8,
         grad_acc_steps: int = 1,
         seq_len: int = 512,
         lr_max: float = 1e-4,
         grad_clip_norm: float = 1.0,
         epochs: int = 1,
         tokenizer_path: str = 'optimus.model',
         checkpoints_path: str = 'best_model.pth',
         n_layers: int = 6,
         dim: int = 512,
         n_heads: int = 8,
         dropout: float = 0.0,
         device: str = 'cuda',
         prompt: str = "Once upon a time"
         ):
"""
Run the main training loop for the model.
Args:
batch_size (int): Batch size for training.
grad_acc_steps (int): Number of batches to accumulate gradients for
before running backpropagation to update weights.
seq_len (int): Context length for training.
lr_max (float): Maximum learning rate, used for one-cycle scheduling.
grad_clip_norm (float): Gradient clipping value for gradient's norm.
epochs (int): Number of epochs to train for.
tokenizer_path (str): Path to the tokenizer model.
checkpoints_path (str): Where to save the trained model. Should be a .pt
or .pth file.
n_layers (int): Number of layers for the model.
dim (int): Dimension of the model.
n_heads (int): Number of heads inside an attention layer for the model.
dropout (float): Dropout to use for the model.
device (str): Device where to train the model. Viable options are 'cpu',
'cuda', 'cuda:2' etc.
"""
print(f"Running with:\n"
f"\t- batch size: {batch_size}\n"
f"\t- gradient accumulation steps: {grad_acc_steps}\n"
f"\t- context length: {seq_len}\n"
f"\t- max learning rate: {lr_max}\n"
f"\t- gradient clipping norm: {grad_clip_norm}\n"
f"\t- epochs: {epochs}\n"
f"\t- tokenizer: {tokenizer_path}\n"
f"\t- checkpoints path: {checkpoints_path}\n"
f"\t- model layers: {n_layers}\n"
f"\t- model dimension: {dim}\n"
f"\t- model attention heads: {n_heads}\n"
f"\t- model dropout: {dropout}\n"
f"\t- training on device: {device}\n"
f"Please see '--help' if you want to change these settings")
    # load tokenizer
    tok = Tokenizer(model_path=tokenizer_path)

    # create model
    model = Transformer(len(tok),
                        n_layers=n_layers,
                        dim=dim,
                        n_heads=n_heads,
                        p_drop=dropout,
                        weight_tying=False)

    # load checkpoint
    checkpoint = torch.load(checkpoints_path, map_location=device)
    state_dict = checkpoint
    model.load_state_dict(state_dict)
    model.eval()
    model = model.to(device)

    _total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of model parameters: {_total_params}")
    # create a trainer instance; only its generate() method is used here
    trainer = Trainer(dl=None,
                      model=model,
                      criterion=None,
                      optimizer=None,
                      lr=lr_max,
                      grad_acc_steps=grad_acc_steps,
                      grad_clip_norm=grad_clip_norm,
                      model_save_path=checkpoints_path,
                      progress_bar=True)
    # run the generation for 128 tokens
    # pick the autocast dtype: 'float32', 'bfloat16' or 'float16'
    dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = torch.amp.autocast(device_type=device, dtype=ptdtype)

    with torch.no_grad():
        with ctx:
            # encode the prompt and add a batch dimension
            start_ids = tok.encode(prompt, bos=False, eos=False)
            x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]

            # autoregressively generate 128 new tokens and print the decoded text
            # (assumes the project's Tokenizer exposes a decode() method for token ids)
            y = trainer.generate(x, max_new_tokens=128)
            print(tok.decode(y[0].tolist()))
if __name__ == "__main__":
    fire.Fire(main)
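For quick reference, a usage sketch: assuming the script above is saved as inference.py (the filename is not shown in this diff), Fire exposes the keyword arguments of main as command-line flags, so generation can be launched roughly like:

python inference.py --checkpoints_path=best_model.pth --tokenizer_path=optimus.model --prompt="Once upon a time" --device=cuda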
@@ -196,3 +196,28 @@ class Trainer():
                  f"\tTotal valid batches: {len(self.dl.test):10d} | "
                  f"Valid loss: {self.val_loss: 7.2f} | "
                  f"Valid perplexity: {self.val_ppl: 8.2f}")
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t))
        and complete the sequence max_new_tokens times, feeding the predictions
        back into the model each time. The model should be in eval() mode when
        calling this.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx
            # TODO: once we have access to context_size, change the line above to
            # idx_cond = idx if idx.size(1) <= self.context_size else idx[:, -self.context_size:]
            # forward the model and keep only the logits for the last position
            logits = self.model(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                # keep the top_k most likely tokens and mask out the rest
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # turn logits into probabilities and sample the next token
            # (F is torch.nn.functional, which must be imported in this module)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            # append the sampled token and feed the extended sequence back in
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
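To make the sampling step concrete, below is a minimal, self-contained sketch of the temperature scaling plus top-k filtering used in generate(), run on a dummy logits tensor. The vocabulary size, values and hyperparameters are illustrative only, and the snippet does not depend on the Trainer or Transformer classes.

import torch
from torch.nn import functional as F

# dummy logits over a 10-token vocabulary for a batch of one sequence
logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0, -2.0, 0.0, 0.3, 1.5, -0.5]])

temperature = 0.8  # < 1 sharpens the distribution, > 1 flattens it
top_k = 3          # keep only the 3 most likely tokens

logits = logits / temperature

# mask out everything below the k-th largest logit
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')

# normalise and sample one token id
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
print(probs)
print(next_token)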