From fa6aaa1b14ebda7bc8732886fa8e8585cda4dc81 Mon Sep 17 00:00:00 2001 From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro> Date: Tue, 4 Jun 2024 00:19:46 +0300 Subject: [PATCH] Add TrainerArguments and adjust training loop Add Trainer configuration as a separate class. This holds all the training options as a separate dataclass; this can also be easily passed in as a json file. More code organization in the main training loop. --- optimus/trainer.py | 351 +++++++++++++++++++---------------- optimus/utils/setup_utils.py | 61 ++++++ training.py | 242 ++++++++++++++++-------- 3 files changed, 414 insertions(+), 240 deletions(-) diff --git a/optimus/trainer.py b/optimus/trainer.py index ef011a2..39db79a 100644 --- a/optimus/trainer.py +++ b/optimus/trainer.py @@ -1,221 +1,250 @@ +import json from pathlib import Path -from typing import Optional, Callable +from typing import Optional +from tqdm import tqdm import torch import torch.nn as nn -from torch import optim from torch.utils.data import DataLoader from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from fastprogress.fastprogress import master_bar, progress_bar, format_time -class Trainer(): +class TrainingArguments(): """ - Trainer implementation for Optimus models. + Training arguments class to hold important switches and knobs related to + training. Args: - dl (OptimusDataLoader): Dataloader used to train the model. - model (nn.Module): Model to train. - criterion (callable): A suitable loss function. The trainer assumes - nn.CrossEntropyLoss, though other loss funcs (like - label-smoothed cross entropy loss) can be used. - optimizer (torch.optim.Optimizer): Optimizer to use for training. - lr (float): Max learning rate value to use for one-cycle scheduling. - grad_acc_steps (int): Number of gradient accumulation steps before - running backpropagation. Setting to 1 effectively disables - gradient accumulation. - grad_clip_norm (float): Gradient clipping norm value. - model_save_path (str): The best model (based on validation loss) is - saved to the specified path. - use_fp16 (bool): Whether to train the model in 16-bit floating point - precision. If such hardware is not supported, a warning is - issued and normal 32-bit precision is used instead. - progress_bar (bool): Whether to show a progress bar in console while - training. This is automatically disabled if output is a file, - however some stats are printed after finishing epochs. If False, - no stats are printed at all during training, whether the output - is a console or a file. + device (torch.device): GPU device on which to train. + log_steps (int): Log training progress each number of steps. This + considers number of updates, so gradient_accumulation_steps + influences this. + show_progress (bool, defaults to True): Whether to show progress during + training. + seed (int): Seed used for reproducibility purposes. + optimizer (torch.optim.Optimizer): Optimizer used for training. + lr_scheduler (torch.optim.lr_scheduler.LRScheduler): Scheduler used for + learning rate adjustment during training. + num_train_epochs (int): Number of training epochs. + per_device_batch_size (int): Batch size per device (GPU). + gradient_accumulation_steps (int): Steps to accumulate the gradient for. + max_grad_norm (float): Max gradient clipping norm. + use_fp16 (bool): Whether to train in floating point 16 accuracy. Uses + bfloat16 if available, otherwise regular fp16. + checkpoints_dir (Path): Path to where training checkpoints should be + saved. 
+ save_steps (int): Save checkpoints every number of steps. If 0, doesn't + save checkpoints. + save_limit (int): Limit number of checkpoints to `save_limit`. Starts + deleting from the oldest when this number is reached. """ - def __init__(self, - model: nn.Module, - train_dataloader: DataLoader, - eval_dataloader: DataLoader, - tokenizer: PreTrainedTokenizerBase, - criterion: Callable, - optimizer: optim.Optimizer, - lr: float, - grad_acc_steps: int, - grad_clip_norm: float, - model_save_path: str, - use_fp16: bool, - progress_bar: bool = True): - self.train_dataloader = train_dataloader - self.eval_dataloader = eval_dataloader - self.model = model - self.criterion = criterion + def __init__( + self, + device: torch.device, + log_steps: int, + seed: int, + optimizer: torch.optim.Optimizer, + lr_scheduler: torch.optim.lr_scheduler.LRScheduler, + num_train_epochs: int, + per_device_batch_size: int, + gradient_accumulation_steps: int, + max_grad_norm: float, + use_fp16: bool, + checkpoints_dir: Path, + save_steps: int, + save_limit: int, + show_progress: bool = True, + ): + self.device = device + self.log_steps = log_steps + self.show_progress = show_progress + self.seed = seed self.optimizer = optimizer - self.lr = lr + self.lr_scheduler = lr_scheduler + self.num_train_epochs = num_train_epochs + self.per_device_batch_size = per_device_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.max_grad_norm = max_grad_norm + self.use_fp16 = use_fp16 + self.checkpoints_dir = checkpoints_dir + self.save_steps = save_steps + self.save_limit = save_limit - assert type(grad_acc_steps) is int and grad_acc_steps > 0 - self.grad_acc_steps = grad_acc_steps + @classmethod + def from_json_file(cls, file_path): + with open(file_path, 'r') as file: + return cls(**json.load(file)) - self.grad_clip_norm = grad_clip_norm - self.model_save_path = model_save_path - self.use_fp16 = use_fp16 - self.fp16_dtype = torch.float16 +class Trainer(): + """ + Generic PyTorch trainer implementation. - self.progress_bar = progress_bar + Args: + train_args (TrainingArguments): Training arguments. + model (nn.Module): Model to train. + train_dataloader (torch.utils.data.DataLoader): Dataloader used for model + training. + eval_dataloader (torch.utils.data.DataLoader): Dataloader used for model + evaluation. + tokenizer (PreTrainedTokenizerBase, optional): HuggingFace tokenizer + used for data collation purposes. Gets saved along with the model. + If not passed, a default collator is used. - def fit(self, n_epochs: int) -> None: - """ - Fit the model, using the trainer, on the data inside the dataloader - object. + """ - Args: - n_epochs (int): Number of epochs to train for. + def __init__( + self, + train_args: TrainingArguments, + model: nn.Module, + train_dataloader: DataLoader, + eval_dataloader: DataLoader, + tokenizer: Optional[PreTrainedTokenizerBase], + ): + self.args = train_args + self.model = model + self.train_dataloader = train_dataloader + self.eval_dataloader = eval_dataloader + self.tokenizer = tokenizer + def train(self) -> None: """ - # this is fastai's implementation of one cycle - self.scheduler = torch.optim.lr_scheduler.OneCycleLR( - optimizer=self.optimizer, - max_lr=self.lr, - epochs=n_epochs, - steps_per_epoch=len(self.dl.train) // self.grad_acc_steps) + Training loop of the trainer. 
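        Iterates `num_train_epochs` times over `train_dataloader`. Each batch
        is forwarded under `torch.cuda.amp.autocast` (bfloat16 when supported,
        float16 otherwise) if `use_fp16` is set; gradients are accumulated for
        `gradient_accumulation_steps` micro-batches, clipped to `max_grad_norm`,
        and then applied with the configured optimizer and learning rate
        scheduler. The running loss is logged every `log_steps` optimizer
        updates and a progress bar tracks the total number of updates.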
- # scaler used for mixed precision fp16 training on GPU - self.scaler = torch.cuda.amp.GradScaler(enabled=self.use_fp16) + """ + num_examples = len(self.train_dataloader) * self.args.per_device_batch_size + num_update_steps_per_epoch = len(self.train_dataloader) // self.args.gradient_accumulation_steps + max_steps = self.args.num_train_epochs * num_update_steps_per_epoch + global_batch_size = self.args.per_device_batch_size * self.args.gradient_accumulation_steps * 1 + + fp16_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 loss_fn = torch.nn.CrossEntropyLoss() - # progress bar for epochs - self.mb = master_bar(list(range(n_epochs))) + self.progress = tqdm(range(max_steps), disable=(not self.args.show_progress)) + + # scaler used for mixed precision fp16 training on GPU + self.scaler = torch.cuda.amp.GradScaler(enabled=self.args.use_fp16) - # start training for n_epochs - for self.epoch in range(n_epochs): + print("***** Running training *****") + print(f" Num examples = {num_examples:,}") + print(f" Num epochs = {self.args.num_train_epochs:,}") + print(f" Instantaneous batch size per device = {self.args.per_device_batch_size:,}") + print(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}") + print(f" Global batch size (w. distributed & accumulation) = {global_batch_size:,}") + print(f" Total optimization steps = {max_steps:,}") - if self.progress_bar is True: - self.mb.update(self.epoch) + self.model.train() - start_time = time.time() - self._do_epoch_train() - self._do_epoch_validate() - self.epoch_time = time.time() - start_time + # start training for num_train_epochs + for epoch in range(self.args.num_train_epochs): - # write end of epoch stats - self._write_epoch_stats() + # needed for distributed sampler RNG state + if hasattr(self.train_dataloader, "set_epoch"): + self.train_dataloader.set_epoch(epoch) - # if better model on validation loss, save it - if self.val_loss < best_val_loss: - best_val_loss = self.val_loss - torch.save(self.model.state_dict(), self.model_save_path) + for step, inputs in enumerate(self.train_dataloader): - def _do_epoch_train(self): - self.model.train() # put model in training mode + inputs = inputs['input_ids'] + inputs = inputs.to(self.args.device) - # compute average train loss, train perplexity and ms/batch every ~200 - # batches, or every 10% of training dataset (whichever is smaller), - # rounded to gradient accumulation steps - est_interval = int(max(min(200, 0.1 * len(self.dl.train)) // self.grad_acc_steps, 1) * self.grad_acc_steps) + with torch.cuda.amp.autocast(dtype=fp16_dtype, + enabled=self.args.use_fp16): + logits = self.model(inputs) - # progress bar for batches - pb = progress_bar(range(len(self.dl.train)), parent=self.mb) + labels = inputs[..., 1:].contiguous().view(-1) + logits = logits[..., :-1, :].contiguous().view(-1, self.model.module.vocab_size) - self.ms_per_batch = 0. - total_loss = 0. 
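# --- Editor's illustrative sketch (not part of the patch) ---
# How the causal-LM shift above pairs each position's logits with the *next*
# token as its target; the tensor sizes below are made-up placeholders.
import torch
import torch.nn as nn

batch, seq_len, vocab_size = 2, 5, 10
example_inputs = torch.randint(0, vocab_size, (batch, seq_len))  # token ids
example_logits = torch.randn(batch, seq_len, vocab_size)         # model output

example_labels = example_inputs[..., 1:].contiguous().view(-1)                 # targets 1..T-1
example_preds = example_logits[..., :-1, :].contiguous().view(-1, vocab_size)  # predictions for 1..T-1
example_loss = nn.CrossEntropyLoss()(example_preds, example_labels)
# Note: `self.model.module.vocab_size` in the loop above implies the model is
# wrapped (e.g. in DistributedDataParallel); an unwrapped model would expose
# `vocab_size` directly.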
- start_time = time.time() + loss = loss_fn(logits, labels) - for i, (x, y) in enumerate(self.dl.train): + tr_loss = loss.item() + loss = loss / self.args.gradient_accumulation_steps # normalize to account for gradient accumulation - if self.progress_bar is True: - pb.update(i) + self.scaler.scale(loss).backward() - # automatic mixed precision training - with torch.cuda.amp.autocast(dtype=self.fp16_dtype, - enabled=self.use_fp16): - output = self.model(x) - loss = self.criterion(output.view(-1, len(self.dl.train.tok)), - y.reshape(-1)) + # update only after gradient_accumulation_steps + if (step + 1) % self.args.gradient_accumulation_steps == 0: + # Note: This will ignore the last few batches of the dataset, + # when the gradient accumulation steps are more than 1, and the + # number of batches doesn't cleanly divide by grad_acc_steps - loss = loss / self.grad_acc_steps # normalize to account for gradient accumulation + # gradient clipping + self.scaler.unscale_(self.args.optimizer) + nn.utils.clip_grad_norm_(self.model.parameters(), + max_norm=self.args.max_grad_norm) - self.scaler.scale(loss).backward() + self.scaler.step(self.args.optimizer) + self.scaler.update() + self.args.optimizer.zero_grad() - total_loss += loss.item() + self.args.lr_scheduler.step() + # lr = self.args.lr_scheduler.get_last_lr()[0] - # update only after grad_acc_steps - if (i + 1) % self.grad_acc_steps == 0: - # Note: This will ignore the last few batches of the dataset, - # when the gradient accumulation steps are more than 1, and the - # number of batches doesn't cleanly divide by grad_acc_steps + if (step + 1) % self.args.log_steps * self.args.gradient_accumulation_steps == 0: + print(f"Loss is {tr_loss:,}") - # gradient clipping - self.scaler.unscale_(self.optimizer) - nn.utils.clip_grad_norm_(self.model.parameters(), - max_norm=self.grad_clip_norm) + self.progress.update(1) - self.scaler.step(self.optimizer) - self.scaler.update() - self.optimizer.zero_grad() + # def _do_epoch_validate(self): + # self.model.eval() # put model in eval mode - self.scheduler.step() - lr = self.scheduler.get_last_lr()[0] + # total_loss = 0. - # update train loss, train ppl and estimated ms/batch - if (i + 1) % est_interval == 0: - self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval - self.train_loss = (total_loss * self.grad_acc_steps) / est_interval - self.train_ppl = math.exp(self.train_loss) + # # progress bar for batches + # pb = progress_bar(range(len(self.dl.test)), parent=self.mb) - total_loss = 0. - start_time = time.time() + # with torch.no_grad(): + # for i, (x, y) in enumerate(self.dl.test): - self.mb.child.comment = f" | train loss: {loss.item() * self.grad_acc_steps:.4f} | " \ - f"~{self.ms_per_batch:.2f} ms/batch | " \ - f" lr: {lr:.7f}" + # if self.progress_bar is True: + # pb.update(i) - pb.on_iter_end() + # with torch.cuda.amp.autocast(dtype=self.fp16_dtype, + # enabled=self.use_fp16): + # output = self.model(x) + # loss = self.criterion(output.view(-1, len(self.dl.test.tok)), + # y.reshape(-1)) - def _do_epoch_validate(self): - self.model.eval() # put model in eval mode + # total_loss += loss.item() - total_loss = 0. 
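# --- Editor's illustrative sketch (not part of the patch) ---
# The gradient-accumulation + AMP pattern used above, in isolation. The tiny
# model, data and hyperparameters here are hypothetical placeholders.
import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
use_fp16 = (device == 'cuda')
model = nn.Linear(16, 4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=use_fp16)
grad_acc_steps, max_grad_norm, log_steps = 4, 1.0, 10

for step in range(80):
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 4, (8,), device=device)

    with torch.cuda.amp.autocast(dtype=torch.float16, enabled=use_fp16):
        loss = nn.CrossEntropyLoss()(model(x), y)

    # normalize so the accumulated gradient matches one large-batch gradient
    scaler.scale(loss / grad_acc_steps).backward()

    if (step + 1) % grad_acc_steps == 0:
        scaler.unscale_(optimizer)              # grads back to fp32 scale
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        scaler.step(optimizer)                  # skipped if grads overflowed
        scaler.update()
        optimizer.zero_grad()

    # log every `log_steps` optimizer updates; note the parentheses -- the
    # modulus is taken against the full product of the two step counts
    if (step + 1) % (log_steps * grad_acc_steps) == 0:
        print(f"step {step + 1}: loss {loss.item():.4f}")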
+ # self.mb.child.comment = f" | valid loss: {loss.item():.4f}" - # progress bar for batches - pb = progress_bar(range(len(self.dl.test)), parent=self.mb) + # self.val_loss = total_loss / (len(self.dl.test) - 1) + # self.val_ppl = math.exp(self.val_loss) - with torch.no_grad(): - for i, (x, y) in enumerate(self.dl.test): + # pb.on_iter_end() - if self.progress_bar is True: - pb.update(i) + # def _write_epoch_stats(self): + # if self.progress_bar is True: + # epoch_time = format_time(self.epoch_time) + # self.mb.write( + # f"* End of epoch {self.epoch:3d}:\n" + # f"\tTotal time: {epoch_time:9s} | " + # f"Est. ms/batch: {self.ms_per_batch:.2f}\n" + # f"\tTotal train batches: {len(self.dl.train):10d} | " + # f"Train loss: {self.train_loss: 7.2f} | " + # f"Train perplexity: {self.train_ppl: 8.2f}\n" + # f"\tTotal valid batches: {len(self.dl.test):10d} | " + # f"Valid loss: {self.val_loss: 7.2f} | " + # f"Valid perplexity: {self.val_ppl: 8.2f}") - with torch.cuda.amp.autocast(dtype=self.fp16_dtype, - enabled=self.use_fp16): - output = self.model(x) - loss = self.criterion(output.view(-1, len(self.dl.test.tok)), - y.reshape(-1)) + def save_model(self, save_dir: Path) -> None: + """ + Save model and tokenizer to a directory. - total_loss += loss.item() + Args: + save_dir (Path): Path to save directory. - self.mb.child.comment = f" | valid loss: {loss.item():.4f}" + """ + pass - self.val_loss = total_loss / (len(self.dl.test) - 1) - self.val_ppl = math.exp(self.val_loss) + def save_logs(self, log_dir: Path) -> None: + """ + Save training logs to a directory. - pb.on_iter_end() + Args: + log_dir (Path): Path to log directory. - def _write_epoch_stats(self): - if self.progress_bar is True: - epoch_time = format_time(self.epoch_time) - self.mb.write( - f"* End of epoch {self.epoch:3d}:\n" - f"\tTotal time: {epoch_time:9s} | " - f"Est. 
ms/batch: {self.ms_per_batch:.2f}\n" - f"\tTotal train batches: {len(self.dl.train):10d} | " - f"Train loss: {self.train_loss: 7.2f} | " - f"Train perplexity: {self.train_ppl: 8.2f}\n" - f"\tTotal valid batches: {len(self.dl.test):10d} | " - f"Valid loss: {self.val_loss: 7.2f} | " - f"Valid perplexity: {self.val_ppl: 8.2f}") + """ + pass diff --git a/optimus/utils/setup_utils.py b/optimus/utils/setup_utils.py index 39f8d58..7802ea4 100644 --- a/optimus/utils/setup_utils.py +++ b/optimus/utils/setup_utils.py @@ -1,4 +1,52 @@ from transformers import AutoTokenizer +from datasets import load_from_disk + +from optimus.models import OptimusTransformer +from optimus.models.optimus import OptimusConfig + + +def load_and_chunk_dataset(data_dir, seq_len): + tokenized_datasets = load_from_disk(data_dir) + + # split dataset into chunks of seq_len + def split_text_fn(examples): + # concatenate texts into batch + concatenated_examples = { + input_ids: [token for sublist in examples[input_ids] for token in sublist] + for input_ids in examples.keys() + } + total_length = len(concatenated_examples[next(iter(examples.keys()))]) + + # drop last chunk + total_length = (total_length // seq_len) * seq_len + + # split in chunks of size seq_len + result = { + input_ids: [text[i:i + seq_len] for i in range(0, total_length, seq_len)] + for input_ids, text in concatenated_examples.items() + } + return result + + # apply text splitting into batches + tokenized_datasets = tokenized_datasets.map( + split_text_fn, + batched=True, + num_proc=6, + remove_columns=tokenized_datasets['train'].column_names + ) + tokenized_datasets = tokenized_datasets.select_columns(['input_ids']) + + print("Result: ") + print(tokenized_datasets) + + ctx_len = len(tokenized_datasets['train'][0]['input_ids']) + n_batches = len(tokenized_datasets['train']) + print(f"Dataset info:") + print(f" - context length: {ctx_len}") + print(f" - number of batches (train set): {n_batches}") + print(f" - total number of tokens: {ctx_len * n_batches}") + + return tokenized_datasets def create_tokenizer(tokenizer_name): @@ -12,3 +60,16 @@ def create_tokenizer(tokenizer_name): tokenizer.bos_token = tokenizer.eos_token return tokenizer + + +def create_model(config_file, device): + config = OptimusConfig.from_json_file(config_file) + model = OptimusTransformer(config) + model.to(device) + + print(model) + + _total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Number of trainable model parameters: {_total_params}") + + return model diff --git a/training.py b/training.py index 193de68..50db310 100644 --- a/training.py +++ b/training.py @@ -2,41 +2,64 @@ from pathlib import Path import fire import torch -from torch import nn from torch.utils.data import DataLoader -from datasets import load_from_disk -from optimus.models.optimus import OptimusTransformer, OptimusConfig -from optimus.trainer import Trainer +from optimus.trainer import Trainer, TrainingArguments from optimus.utils import setup_utils -def main(batch_size: int = 8, - grad_acc_steps: int = 1, - seq_len: int = 512, - lr_max: float = 1e-4, - grad_clip_norm: float = 1.0, - epochs: int = 1, - hf_tokenizer_name: str = 'gpt2', - data_dir: Path = Path('/workspace'), - dataset_dir: Path = Path('wikitext103_tokenized_dataset'), - checkpoints_path: str = 'best_model.pth', - dim: int = 512, - n_layers: int = 6, - n_heads: int = 8, - dropout: float = 0.0, - use_fp16: bool = True): +def main( + # training args + batch_size: int = 2, # per GPU + grad_acc_steps: int = 4, # per GPU + 
seq_len: int = 4096, + lr_max: float = 1e-4, + weight_decay: float = 0.001, + warmup_steps: int = 1000, + epochs: int = 1, + grad_clip_norm: float = 1.0, + use_fp16: bool = True, + seed: int = 42, + + # model + tokenizer + model_config_path: Path = Path('config.json'), + hf_tokenizer_name: str = 'gpt2', + + # data directory (as this is usually big, it should reside in a different + # directory than the source code) + data_dir: Path = Path('.'), + + # dataset dir (appended to data_dir) + dataset_dir: Path = Path('gpt2_tokenized_wikitext103_no_seq_len'), + + # training-related dirs (appended to data_dir) + checkpoints_dir: Path = Path('training_checkpoints'), + log_dir: Path = Path('training_logs'), + save_dir: Path = Path('trained_model'), +): """ - Run the main training loop for the model. + Prepare the training arguments, then run the training loop. Args: - batch_size (int): Batch size for training. + batch_size (int): Batch size for training. In distributed setup, this is + per GPU. grad_acc_steps (int): Number of batches to accumulate gradients for - before running backpropagation to update weights. - seq_len (int): Context length for training. + before running backpropagation to update weights. In distributed + setup, this is per GPU. Global batch size is `batch_size * + grad_acc_steps * number of GPU's`. Adjust learning rate as needed! + seq_len (int): Context length for training. This is used to split the + dataset into batches. lr_max (float): Maximum learning rate, used for one-cycle scheduling. - grad_clip_norm (float): Gradient clipping value for gradient's norm. + weight_decay (float): Weight decay used for the optimizer. + warmup_steps (int): Warmup steps used for the optimizer. epochs (int): Number of epochs to train for. + grad_clip_norm (float): Gradient clipping value for gradient's norm. + use_fp16 (bool): Whether to train using floating-point 16-bits + precision. Bfloat16 is used if available, otherwise fp16. + seed (int): Seed used for reproducibility purposes. Each process will + have its initial PyTorch seed set to `seed + process_rank`. + model_config_path (Path): Path to a config file which describes the + model to be trained. hf_tokenizer_name (str): HuggingFace tokenizer name. data_dir (Path): Directory located in a large/fast storage, which holds data to be used by the model. Should also be capable to accomodate @@ -47,16 +70,29 @@ def main(batch_size: int = 8, Should be tokenized _with the same tokenizer_ as the one used for training (`hf_tokenizer_name` above). The dataset should not be already split into batches. Given path is appended to `data_dir`. - checkpoints_path (str): Where to save the trained model. Should be a .pt - or .pth file. - dim (int): Dimension of the model. - n_layers (int): Number of layers for the model. - n_heads (int): Number of heads inside an attention layer for the model. - dropout (float): Dropout to use for the model. - use_fp16 (bool): Whether to train using floating-point 16-bits - precision. + checkpoints_dir (Path): Path where training checkpoints should be saved. + Will be created if it doesn't exist. Given path is appended to + `data_dir`. + log_dir (Path): Path where training logs should be saved. Will be + created if it doesn't exist. Given path is appended to `data_dir`. + save_dir (Path): Path where to save the model upon training completion. + Will be created if it doesn't exist. Given path is appended to + `data_dir`. 
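    Example (editor's illustration; the data path below is a placeholder):

        main(
            batch_size=2,
            grad_acc_steps=4,
            seq_len=4096,
            hf_tokenizer_name='gpt2',
            data_dir=Path('/path/to/fast/storage'),
        )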
""" + # create paths + if isinstance(data_dir, str): + data_dir = Path(data_dir) + + if isinstance(dataset_dir, str): + dataset_dir = data_dir / Path(dataset_dir) + else: + dataset_dir = data_dir / dataset_dir + + # create paths + if isinstance(model_config_path, str): + model_config_path = Path(model_config_path) + if isinstance(data_dir, str): data_dir = Path(data_dir) @@ -65,34 +101,56 @@ def main(batch_size: int = 8, else: dataset_dir = data_dir / dataset_dir + if isinstance(checkpoints_dir, str): + checkpoints_dir = data_dir / Path(checkpoints_dir) + else: + checkpoints_dir = data_dir / checkpoints_dir + + if isinstance(log_dir, str): + log_dir = data_dir / Path(log_dir) + else: + log_dir = data_dir / log_dir + + if isinstance(save_dir, str): + save_dir = data_dir / Path(save_dir) + else: + save_dir = data_dir / save_dir + print(f"Running with:\n" f"\t- batch size: {batch_size}\n" f"\t- gradient accumulation steps: {grad_acc_steps}\n" f"\t- context length: {seq_len}\n" f"\t- max learning rate: {lr_max}\n" - f"\t- gradient clipping norm: {grad_clip_norm}\n" + f"\t- weight decay: {weight_decay}\n" + f"\t- warmup steps: {warmup_steps}\n" f"\t- epochs: {epochs}\n" + f"\t- gradient clipping norm: {grad_clip_norm}\n" + f"\t- 16-bit floating-point training (fp16): {use_fp16}\n" + f"\t- seed: {seed}\n" + f"\t- only main rank logs: {log_on_main_rank_only}\n" + f"\t- model config file: {model_config_path}\n" f"\t- huggingface tokenizer: {hf_tokenizer_name}\n" f"\t- training data directory: {str(data_dir)}\n" f"\t- dataset directory: {str(dataset_dir)}\n" - f"\t- checkpoints path: {checkpoints_path}\n" - f"\t- model dimension: {dim}\n" - f"\t- model layers: {n_layers}\n" - f"\t- model attention heads: {n_heads}\n" - f"\t- model dropout: {dropout}\n" - f"\t- 16-bit floating-point training (fp16): {use_fp16}\n" - f"Please see '--help' if you want to change these settings") + f"\t- checkpoints directory: {str(checkpoints_dir)}\n" + f"\t- logging directory: {str(log_dir)}\n" + f"\t- saved model directory: {str(save_dir)}\n" + f"Please seek '--help' if you want to change any of these settings") + + # set device + device = f'cuda' + + # load dataset and split into batches + dataset = setup_utils.load_and_chunk_dataset(dataset_dir, seq_len) + dataset.set_format('torch') # load tokenizer tokenizer = setup_utils.create_tokenizer(hf_tokenizer_name) - # load dataset - dataset = load_from_disk(str(dataset_dir)) - - print(f'Number of examples in training set: {len(dataset['train'])}') - print(f'Number of examples in testing set: {len(dataset['test'])}') + # create model and move to device + model = setup_utils.create_model(model_config_path, device) - # create dataloader objects and move to device + # create samplers + dataloader train_dataloader = DataLoader( train_dataloader = DataLoader( dataset['train'], batch_size=batch_size, # per GPU @@ -109,48 +167,74 @@ def main(batch_size: int = 8, pin_memory=True, # fast CPU-GPU transfer ) - # create model and move to device - config = OptimusConfig(vocab_size=len(tokenizer), - num_hidden_layers=n_layers, - num_attention_heads=n_heads, - hidden_size=dim, - attention_dropout=dropout, - tie_word_embeddings=False) - model = OptimusTransformer(config) - model = model.to('cuda') - - _total_params = sum(p.numel() for p in model.parameters()) - print(f'Number of model parameters: {_total_params}') - - # define loss metric - criterion = nn.CrossEntropyLoss() - - # define optimizer - # see [1] for a discussion on what the epsilon value should be for amp; 1e-7 - # is a good 
default for both amp and normal training - # [1]: https://github.com/pytorch/pytorch/issues/26218 - optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), eps=1e-7) + # create optimizer + optimizer = torch.optim.AdamW( + model.parameters(), + betas=(0.9, 0.999), + eps=1e-7, + weight_decay=weight_decay, + ) - print('Starting training...') + # create learning rate scheduler (fastai's implementation) + lr_scheduler = torch.optim.lr_scheduler.OneCycleLR( + optimizer=optimizer, + max_lr=lr_max, + epochs=epochs, + steps_per_epoch=len(train_dataloader) // grad_acc_steps, + ) + + # create training arguments + train_args = TrainingArguments( + device=torch.device(device), + + # logging + log_steps=100, - # create trainer and start fitting + # core training + seed=seed, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + num_train_epochs=epochs, + per_device_batch_size=batch_size, # per GPU + gradient_accumulation_steps=grad_acc_steps, # per GPU + max_grad_norm=grad_clip_norm, + use_fp16=use_fp16, + + # training checkpointing + checkpoints_dir=checkpoints_dir, + save_steps=1000, + save_limit=3, + ) + + # create trainer trainer = Trainer( + train_args=train_args, model=model, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, tokenizer=tokenizer, - criterion=criterion, - optimizer=optimizer, - lr=lr_max, - grad_acc_steps=grad_acc_steps, - grad_clip_norm=grad_clip_norm, - model_save_path=checkpoints_path, - use_fp16=use_fp16, - progress_bar=True ) - trainer.fit(epochs) - print(f"Finished training! Best model weights saved at '{checkpoints_path}'") + # create trainer + trainer = Trainer( + train_args=train_args, + model=model, + train_dataloader=train_dataloader, + eval_dataloader=eval_dataloader, + tokenizer=tokenizer, + ) + + print('Starting training...') + + trainer.train() + + print(f"Finished training! Saving model weights to '{str(save_dir)}'") + + # save model + tokenizer + trainer.save_model(save_dir) + + # save log data + trainer.save_logs(log_dir) if __name__=='__main__': -- GitLab
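Note on the new `Trainer.save_model` / `Trainer.save_logs` methods: the patch
leaves their bodies as `pass`. Below is a minimal sketch of one possible shape
for `save_model`, assuming a plain (or DDP-wrapped) `nn.Module` and an optional
HuggingFace tokenizer; this is an editor's illustration, not the author's
implementation.

from pathlib import Path
from typing import Optional

import torch
import torch.nn as nn
from transformers.tokenization_utils_base import PreTrainedTokenizerBase


def save_model_sketch(model: nn.Module,
                      tokenizer: Optional[PreTrainedTokenizerBase],
                      save_dir: Path) -> None:
    """Hypothetical helper mirroring what Trainer.save_model could do."""
    save_dir.mkdir(parents=True, exist_ok=True)
    # unwrap DistributedDataParallel-style wrappers, if any
    to_save = model.module if hasattr(model, 'module') else model
    torch.save(to_save.state_dict(), save_dir / 'model.pth')
    if tokenizer is not None:
        # HuggingFace tokenizers can serialize themselves next to the weights
        tokenizer.save_pretrained(str(save_dir))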