Commit cb893907 authored by Vlad-Andrei BĂDOIU (78692)

Merge branch 'vladb/add_inference' into 'main'

Add inference code

See merge request !10
parents 83f7b518 22338e60
import sys
import time
import math
import argparse
import fire
import torch
from torch import nn
from torch.utils.data import Dataset
from datasets.wikitext103 import WikiText103Dataset
from tokenizer import Tokenizer
from dataloader import OptimusDataLoader
from model import Transformer
from trainer import Trainer
def main(batch_size: int = 8,
         grad_acc_steps: int = 1,
         seq_len: int = 512,
         lr_max: float = 1e-4,
         grad_clip_norm: float = 1.0,
         epochs: int = 1,
         tokenizer_path: str = 'optimus.model',
         checkpoints_path: str = 'best_model.pth',
         n_layers: int = 6,
         dim: int = 512,
         n_heads: int = 8,
         dropout: float = 0.0,
         device: str = 'cuda',
         prompt: str = "Once upon a time"
         ):
"""
Run the main training loop for the model.
Args:
batch_size (int): Batch size for training.
grad_acc_steps (int): Number of batches to accumulate gradients for
before running backpropagation to update weights.
seq_len (int): Context length for training.
lr_max (float): Maximum learning rate, used for one-cycle scheduling.
grad_clip_norm (float): Gradient clipping value for gradient's norm.
epochs (int): Number of epochs to train for.
tokenizer_path (str): Path to the tokenizer model.
checkpoints_path (str): Where to save the trained model. Should be a .pt
or .pth file.
n_layers (int): Number of layers for the model.
dim (int): Dimension of the model.
n_heads (int): Number of heads inside an attention layer for the model.
dropout (float): Dropout to use for the model.
device (str): Device where to train the model. Viable options are 'cpu',
'cuda', 'cuda:2' etc.
"""
print(f"Running with:\n"
f"\t- batch size: {batch_size}\n"
f"\t- gradient accumulation steps: {grad_acc_steps}\n"
f"\t- context length: {seq_len}\n"
f"\t- max learning rate: {lr_max}\n"
f"\t- gradient clipping norm: {grad_clip_norm}\n"
f"\t- epochs: {epochs}\n"
f"\t- tokenizer: {tokenizer_path}\n"
f"\t- checkpoints path: {checkpoints_path}\n"
f"\t- model layers: {n_layers}\n"
f"\t- model dimension: {dim}\n"
f"\t- model attention heads: {n_heads}\n"
f"\t- model dropout: {dropout}\n"
f"\t- training on device: {device}\n"
f"Please see '--help' if you want to change these settings")
    # load tokenizer
    tok = Tokenizer(model_path=tokenizer_path)

    # create model
    model = Transformer(len(tok),
                        n_layers=n_layers,
                        dim=dim,
                        n_heads=n_heads,
                        p_drop=dropout,
                        weight_tying=False)

    # load checkpoint
    checkpoint = torch.load(checkpoints_path, map_location=device)
    state_dict = checkpoint
    model.load_state_dict(state_dict)
    model.eval()
    model = model.to(device)

    _total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of model parameters: {_total_params}")
    # create a trainer instance; only its generate() method is used here
    trainer = Trainer(dl=None,
                      model=model,
                      criterion=None,
                      optimizer=None,
                      lr=lr_max,
                      grad_acc_steps=grad_acc_steps,
                      grad_clip_norm=grad_clip_norm,
                      model_save_path=checkpoints_path,
                      progress_bar=True)
    # run the generation for 128 tokens
    # pick the autocast dtype: 'float32', 'bfloat16' or 'float16'
    dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = torch.amp.autocast(device_type=device, dtype=ptdtype)

    with torch.no_grad():
        with ctx:
            # encode the prompt and add a batch dimension
            start_ids = tok.encode(prompt, bos=False, eos=False)
            x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]

            # autoregressively generate 128 new tokens and print the decoded text
            # (assumes the project's Tokenizer exposes a decode() method for token ids)
            y = trainer.generate(x, max_new_tokens=128)
            print(tok.decode(y[0].tolist()))
if __name__ == "__main__":
    fire.Fire(main)
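For quick reference, a usage sketch: assuming the script above is saved as inference.py (the filename is not shown in this diff), Fire exposes the keyword arguments of main as command-line flags, so generation can be launched roughly like:

python inference.py --checkpoints_path=best_model.pth --tokenizer_path=optimus.model --prompt="Once upon a time" --device=cuda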
@@ -196,3 +196,28 @@ class Trainer():
                  f"\tTotal valid batches: {len(self.dl.test):10d} | "
                  f"Valid loss: {self.val_loss: 7.2f} | "
                  f"Valid perplexity: {self.val_ppl: 8.2f}")
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t))
        and complete the sequence max_new_tokens times, feeding the predictions
        back into the model each time. The model should be in eval() mode when
        calling this.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx
            # TODO: once we have access to context_size, change the line above to
            # idx_cond = idx if idx.size(1) <= self.context_size else idx[:, -self.context_size:]
            # forward the model and keep only the logits for the last position
            logits = self.model(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                # keep the top_k most likely tokens and mask out the rest
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # turn logits into probabilities and sample the next token
            # (F is torch.nn.functional, which must be imported in this module)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            # append the sampled token and feed the extended sequence back in
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
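To make the sampling step concrete, below is a minimal, self-contained sketch of the temperature scaling plus top-k filtering used in generate(), run on a dummy logits tensor. The vocabulary size, values and hyperparameters are illustrative only, and the snippet does not depend on the Trainer or Transformer classes.

import torch
from torch.nn import functional as F

# dummy logits over a 10-token vocabulary for a batch of one sequence
logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0, -2.0, 0.0, 0.3, 1.5, -0.5]])

temperature = 0.8  # < 1 sharpens the distribution, > 1 flattens it
top_k = 3          # keep only the 3 most likely tokens

logits = logits / temperature

# mask out everything below the k-th largest logit
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')

# normalise and sample one token id
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
print(probs)
print(next_token)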